{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999581764951903, "eval_steps": 500, "global_step": 1195, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000836470096194061, "grad_norm": 109.33856201171875, "learning_rate": 2.4999999999999998e-06, "loss": 24.0328, "step": 1 }, { "epoch": 0.004182350480970306, "grad_norm": 97.91401672363281, "learning_rate": 1.2499999999999999e-05, "loss": 22.4374, "step": 5 }, { "epoch": 0.008364700961940611, "grad_norm": 36.984405517578125, "learning_rate": 2.4999999999999998e-05, "loss": 20.9539, "step": 10 }, { "epoch": 0.012547051442910916, "grad_norm": 20.562854766845703, "learning_rate": 3.75e-05, "loss": 18.0157, "step": 15 }, { "epoch": 0.016729401923881223, "grad_norm": 7.916600227355957, "learning_rate": 4.9999999999999996e-05, "loss": 15.4241, "step": 20 }, { "epoch": 0.020911752404851526, "grad_norm": 6.871030807495117, "learning_rate": 6.25e-05, "loss": 14.6159, "step": 25 }, { "epoch": 0.025094102885821833, "grad_norm": 6.986784934997559, "learning_rate": 7.5e-05, "loss": 13.6058, "step": 30 }, { "epoch": 0.029276453366792136, "grad_norm": 3.074172019958496, "learning_rate": 8.75e-05, "loss": 12.7264, "step": 35 }, { "epoch": 0.033458803847762446, "grad_norm": 2.548049211502075, "learning_rate": 9.999999999999999e-05, "loss": 12.7985, "step": 40 }, { "epoch": 0.037641154328732745, "grad_norm": 3.184255361557007, "learning_rate": 0.0001125, "loss": 12.2921, "step": 45 }, { "epoch": 0.04182350480970305, "grad_norm": 4.834357738494873, "learning_rate": 0.000125, "loss": 11.7192, "step": 50 }, { "epoch": 0.04600585529067336, "grad_norm": 8.06936264038086, "learning_rate": 0.00013749999999999998, "loss": 11.0815, "step": 55 }, { "epoch": 0.050188205771643665, "grad_norm": 12.874434471130371, "learning_rate": 0.00015, "loss": 9.8728, "step": 60 }, { "epoch": 0.05437055625261397, "grad_norm": 18.794525146484375, "learning_rate": 0.00016249999999999997, "loss": 7.7488, "step": 65 }, { "epoch": 0.05855290673358427, "grad_norm": 21.42744255065918, "learning_rate": 0.000175, "loss": 4.8002, "step": 70 }, { "epoch": 0.06273525721455459, "grad_norm": 7.021483898162842, "learning_rate": 0.00018749999999999998, "loss": 2.2667, "step": 75 }, { "epoch": 0.06691760769552489, "grad_norm": 4.35729455947876, "learning_rate": 0.00019999999999999998, "loss": 1.7236, "step": 80 }, { "epoch": 0.07109995817649518, "grad_norm": 2.531404972076416, "learning_rate": 0.0002125, "loss": 1.4388, "step": 85 }, { "epoch": 0.07528230865746549, "grad_norm": 1.7126120328903198, "learning_rate": 0.000225, "loss": 1.2603, "step": 90 }, { "epoch": 0.0794646591384358, "grad_norm": 1.713827133178711, "learning_rate": 0.00023749999999999997, "loss": 1.1149, "step": 95 }, { "epoch": 0.0836470096194061, "grad_norm": 0.6418541073799133, "learning_rate": 0.00025, "loss": 1.0645, "step": 100 }, { "epoch": 0.08782936010037641, "grad_norm": 0.9687772989273071, "learning_rate": 0.0002625, "loss": 1.0047, "step": 105 }, { "epoch": 0.09201171058134672, "grad_norm": 0.8204954266548157, "learning_rate": 0.00027499999999999996, "loss": 0.986, "step": 110 }, { "epoch": 0.09619406106231702, "grad_norm": 0.5046463012695312, "learning_rate": 0.0002875, "loss": 0.9229, "step": 115 }, { "epoch": 0.10037641154328733, "grad_norm": 1.1709442138671875, "learning_rate": 0.0003, "loss": 0.9344, "step": 120 }, { "epoch": 0.10455876202425764, "grad_norm": 0.9831328392028809, "learning_rate": 0.0002999839868651235, "loss": 0.8812, "step": 125 }, { "epoch": 0.10874111250522794, "grad_norm": 1.4019944667816162, "learning_rate": 0.0002999359508794339, "loss": 0.8814, "step": 130 }, { "epoch": 0.11292346298619825, "grad_norm": 0.78233802318573, "learning_rate": 0.00029985590229902073, "loss": 0.8701, "step": 135 }, { "epoch": 0.11710581346716854, "grad_norm": 1.4517076015472412, "learning_rate": 0.0002997438582149335, "loss": 0.8753, "step": 140 }, { "epoch": 0.12128816394813885, "grad_norm": 2.320331335067749, "learning_rate": 0.0002995998425495327, "loss": 0.8464, "step": 145 }, { "epoch": 0.12547051442910917, "grad_norm": 1.0486135482788086, "learning_rate": 0.000299423886051382, "loss": 0.8498, "step": 150 }, { "epoch": 0.12965286491007946, "grad_norm": 0.9131810069084167, "learning_rate": 0.0002992160262886831, "loss": 0.8468, "step": 155 }, { "epoch": 0.13383521539104978, "grad_norm": 1.4284504652023315, "learning_rate": 0.0002989763076412549, "loss": 0.8088, "step": 160 }, { "epoch": 0.13801756587202008, "grad_norm": 0.6106438040733337, "learning_rate": 0.000298704781291058, "loss": 0.8215, "step": 165 }, { "epoch": 0.14219991635299037, "grad_norm": 0.5358127951622009, "learning_rate": 0.0002984015052112665, "loss": 0.8201, "step": 170 }, { "epoch": 0.1463822668339607, "grad_norm": 1.5017443895339966, "learning_rate": 0.0002980665441538907, "loss": 0.7957, "step": 175 }, { "epoch": 0.15056461731493098, "grad_norm": 1.1214942932128906, "learning_rate": 0.00029769996963595184, "loss": 0.8083, "step": 180 }, { "epoch": 0.1547469677959013, "grad_norm": 2.1036202907562256, "learning_rate": 0.0002973018599242125, "loss": 0.7929, "step": 185 }, { "epoch": 0.1589293182768716, "grad_norm": 1.0557819604873657, "learning_rate": 0.0002968723000184662, "loss": 0.7868, "step": 190 }, { "epoch": 0.16311166875784192, "grad_norm": 0.9558168649673462, "learning_rate": 0.00029641138163338907, "loss": 0.7812, "step": 195 }, { "epoch": 0.1672940192388122, "grad_norm": 0.771851122379303, "learning_rate": 0.0002959192031789579, "loss": 0.7846, "step": 200 }, { "epoch": 0.17147636971978253, "grad_norm": 1.288865089416504, "learning_rate": 0.0002953958697394391, "loss": 0.777, "step": 205 }, { "epoch": 0.17565872020075282, "grad_norm": 2.001302480697632, "learning_rate": 0.000294841493050952, "loss": 0.7797, "step": 210 }, { "epoch": 0.17984107068172314, "grad_norm": 0.7828574776649475, "learning_rate": 0.0002942561914776124, "loss": 0.7815, "step": 215 }, { "epoch": 0.18402342116269343, "grad_norm": 1.4854490756988525, "learning_rate": 0.00029364008998626086, "loss": 0.7608, "step": 220 }, { "epoch": 0.18820577164366373, "grad_norm": 1.1406800746917725, "learning_rate": 0.00029299332011978107, "loss": 0.747, "step": 225 }, { "epoch": 0.19238812212463405, "grad_norm": 1.7346460819244385, "learning_rate": 0.00029231601996901433, "loss": 0.7555, "step": 230 }, { "epoch": 0.19657047260560434, "grad_norm": 1.7754813432693481, "learning_rate": 0.0002916083341432763, "loss": 0.7626, "step": 235 }, { "epoch": 0.20075282308657466, "grad_norm": 1.2126661539077759, "learning_rate": 0.00029087041373948135, "loss": 0.7237, "step": 240 }, { "epoch": 0.20493517356754495, "grad_norm": 1.930538535118103, "learning_rate": 0.00029010241630988217, "loss": 0.7672, "step": 245 }, { "epoch": 0.20911752404851527, "grad_norm": 2.1792216300964355, "learning_rate": 0.0002893045058284311, "loss": 0.7416, "step": 250 }, { "epoch": 0.21329987452948557, "grad_norm": 1.416754961013794, "learning_rate": 0.0002884768526557703, "loss": 0.7196, "step": 255 }, { "epoch": 0.2174822250104559, "grad_norm": 1.6103583574295044, "learning_rate": 0.0002876196335028581, "loss": 0.7397, "step": 260 }, { "epoch": 0.22166457549142618, "grad_norm": 1.0755459070205688, "learning_rate": 0.0002867330313932402, "loss": 0.7644, "step": 265 }, { "epoch": 0.2258469259723965, "grad_norm": 0.8303298354148865, "learning_rate": 0.000285817235623972, "loss": 0.7393, "step": 270 }, { "epoch": 0.2300292764533668, "grad_norm": 1.4747998714447021, "learning_rate": 0.00028487244172520246, "loss": 0.7121, "step": 275 }, { "epoch": 0.23421162693433709, "grad_norm": 2.582953929901123, "learning_rate": 0.0002838988514184267, "loss": 0.7361, "step": 280 }, { "epoch": 0.2383939774153074, "grad_norm": 2.413325309753418, "learning_rate": 0.0002828966725734167, "loss": 0.74, "step": 285 }, { "epoch": 0.2425763278962777, "grad_norm": 0.7637856006622314, "learning_rate": 0.0002818661191638393, "loss": 0.7096, "step": 290 }, { "epoch": 0.24675867837724802, "grad_norm": 1.757056713104248, "learning_rate": 0.0002808074112215711, "loss": 0.7205, "step": 295 }, { "epoch": 0.25094102885821834, "grad_norm": 0.8766753077507019, "learning_rate": 0.0002797207747897198, "loss": 0.7098, "step": 300 }, { "epoch": 0.2551233793391886, "grad_norm": 1.449209213256836, "learning_rate": 0.00027860644187436195, "loss": 0.725, "step": 305 }, { "epoch": 0.2593057298201589, "grad_norm": 0.6825206875801086, "learning_rate": 0.0002774646503950078, "loss": 0.6938, "step": 310 }, { "epoch": 0.26348808030112925, "grad_norm": 1.119585394859314, "learning_rate": 0.0002762956441338036, "loss": 0.698, "step": 315 }, { "epoch": 0.26767043078209957, "grad_norm": 0.9425824880599976, "learning_rate": 0.0002750996726834817, "loss": 0.7189, "step": 320 }, { "epoch": 0.27185278126306983, "grad_norm": 0.5979897975921631, "learning_rate": 0.0002738769913940706, "loss": 0.7039, "step": 325 }, { "epoch": 0.27603513174404015, "grad_norm": 1.8769757747650146, "learning_rate": 0.00027262786131837573, "loss": 0.7035, "step": 330 }, { "epoch": 0.2802174822250105, "grad_norm": 1.1395800113677979, "learning_rate": 0.0002713525491562421, "loss": 0.6898, "step": 335 }, { "epoch": 0.28439983270598074, "grad_norm": 1.0573526620864868, "learning_rate": 0.0002700513271976119, "loss": 0.7042, "step": 340 }, { "epoch": 0.28858218318695106, "grad_norm": 0.5185459852218628, "learning_rate": 0.0002687244732643881, "loss": 0.6914, "step": 345 }, { "epoch": 0.2927645336679214, "grad_norm": 2.7914602756500244, "learning_rate": 0.0002673722706511174, "loss": 0.7049, "step": 350 }, { "epoch": 0.2969468841488917, "grad_norm": 3.0459792613983154, "learning_rate": 0.000265995008064504, "loss": 0.7148, "step": 355 }, { "epoch": 0.30112923462986196, "grad_norm": 2.1906723976135254, "learning_rate": 0.00026459297956176885, "loss": 0.7074, "step": 360 }, { "epoch": 0.3053115851108323, "grad_norm": 1.6257227659225464, "learning_rate": 0.00026316648448786536, "loss": 0.6985, "step": 365 }, { "epoch": 0.3094939355918026, "grad_norm": 0.7152910828590393, "learning_rate": 0.00026171582741156725, "loss": 0.6875, "step": 370 }, { "epoch": 0.3136762860727729, "grad_norm": 2.4449851512908936, "learning_rate": 0.0002602413180604401, "loss": 0.6787, "step": 375 }, { "epoch": 0.3178586365537432, "grad_norm": 0.5180588960647583, "learning_rate": 0.000258743271254712, "loss": 0.6724, "step": 380 }, { "epoch": 0.3220409870347135, "grad_norm": 1.5739381313323975, "learning_rate": 0.00025722200684005715, "loss": 0.7076, "step": 385 }, { "epoch": 0.32622333751568383, "grad_norm": 0.8701817989349365, "learning_rate": 0.00025567784961930546, "loss": 0.6841, "step": 390 }, { "epoch": 0.3304056879966541, "grad_norm": 1.474747896194458, "learning_rate": 0.0002541111292830951, "loss": 0.713, "step": 395 }, { "epoch": 0.3345880384776244, "grad_norm": 1.8884798288345337, "learning_rate": 0.00025252218033947993, "loss": 0.6893, "step": 400 }, { "epoch": 0.33877038895859474, "grad_norm": 0.8834472894668579, "learning_rate": 0.00025091134204250997, "loss": 0.6966, "step": 405 }, { "epoch": 0.34295273943956506, "grad_norm": 0.6324520707130432, "learning_rate": 0.00024927895831979745, "loss": 0.6882, "step": 410 }, { "epoch": 0.3471350899205353, "grad_norm": 2.353163480758667, "learning_rate": 0.00024762537769908535, "loss": 0.6829, "step": 415 }, { "epoch": 0.35131744040150564, "grad_norm": 1.3682096004486084, "learning_rate": 0.00024595095323383365, "loss": 0.6912, "step": 420 }, { "epoch": 0.35549979088247596, "grad_norm": 0.9962055087089539, "learning_rate": 0.0002442560424278399, "loss": 0.6857, "step": 425 }, { "epoch": 0.3596821413634463, "grad_norm": 1.1282930374145508, "learning_rate": 0.00024254100715890846, "loss": 0.6696, "step": 430 }, { "epoch": 0.36386449184441655, "grad_norm": 0.934388279914856, "learning_rate": 0.00024080621360158717, "loss": 0.6841, "step": 435 }, { "epoch": 0.36804684232538687, "grad_norm": 1.4339077472686768, "learning_rate": 0.00023905203214898558, "loss": 0.6705, "step": 440 }, { "epoch": 0.3722291928063572, "grad_norm": 1.0309265851974487, "learning_rate": 0.00023727883733369292, "loss": 0.6706, "step": 445 }, { "epoch": 0.37641154328732745, "grad_norm": 1.9208811521530151, "learning_rate": 0.00023548700774781242, "loss": 0.6637, "step": 450 }, { "epoch": 0.3805938937682978, "grad_norm": 1.0379974842071533, "learning_rate": 0.00023367692596212858, "loss": 0.68, "step": 455 }, { "epoch": 0.3847762442492681, "grad_norm": 1.852662444114685, "learning_rate": 0.00023184897844442495, "loss": 0.6589, "step": 460 }, { "epoch": 0.3889585947302384, "grad_norm": 1.1750479936599731, "learning_rate": 0.00023000355547697027, "loss": 0.6675, "step": 465 }, { "epoch": 0.3931409452112087, "grad_norm": 1.6473002433776855, "learning_rate": 0.00022814105107318952, "loss": 0.6709, "step": 470 }, { "epoch": 0.397323295692179, "grad_norm": 1.2356650829315186, "learning_rate": 0.00022626186289353913, "loss": 0.6652, "step": 475 }, { "epoch": 0.4015056461731493, "grad_norm": 1.1605840921401978, "learning_rate": 0.00022436639216060275, "loss": 0.6698, "step": 480 }, { "epoch": 0.40568799665411964, "grad_norm": 1.5935866832733154, "learning_rate": 0.00022245504357342716, "loss": 0.6688, "step": 485 }, { "epoch": 0.4098703471350899, "grad_norm": 0.810558557510376, "learning_rate": 0.00022052822522111522, "loss": 0.6524, "step": 490 }, { "epoch": 0.41405269761606023, "grad_norm": 0.7008018493652344, "learning_rate": 0.00021858634849569576, "loss": 0.6924, "step": 495 }, { "epoch": 0.41823504809703055, "grad_norm": 1.7558863162994385, "learning_rate": 0.0002166298280042877, "loss": 0.6711, "step": 500 }, { "epoch": 0.4224173985780008, "grad_norm": 1.573688268661499, "learning_rate": 0.00021465908148057787, "loss": 0.6674, "step": 505 }, { "epoch": 0.42659974905897113, "grad_norm": 1.4761265516281128, "learning_rate": 0.00021267452969563153, "loss": 0.6706, "step": 510 }, { "epoch": 0.43078209953994145, "grad_norm": 1.7749208211898804, "learning_rate": 0.00021067659636805403, "loss": 0.6469, "step": 515 }, { "epoch": 0.4349644500209118, "grad_norm": 1.0164939165115356, "learning_rate": 0.00020866570807352337, "loss": 0.6764, "step": 520 }, { "epoch": 0.43914680050188204, "grad_norm": 1.6237319707870483, "learning_rate": 0.00020664229415371266, "loss": 0.6694, "step": 525 }, { "epoch": 0.44332915098285236, "grad_norm": 1.5586035251617432, "learning_rate": 0.00020460678662462194, "loss": 0.6562, "step": 530 }, { "epoch": 0.4475115014638227, "grad_norm": 1.771645188331604, "learning_rate": 0.0002025596200843394, "loss": 0.6622, "step": 535 }, { "epoch": 0.451693851944793, "grad_norm": 0.5951160788536072, "learning_rate": 0.0002005012316202506, "loss": 0.651, "step": 540 }, { "epoch": 0.45587620242576327, "grad_norm": 0.793093740940094, "learning_rate": 0.00019843206071571692, "loss": 0.6634, "step": 545 }, { "epoch": 0.4600585529067336, "grad_norm": 1.0352814197540283, "learning_rate": 0.0001963525491562421, "loss": 0.6636, "step": 550 }, { "epoch": 0.4642409033877039, "grad_norm": 0.843008816242218, "learning_rate": 0.00019426314093514717, "loss": 0.6407, "step": 555 }, { "epoch": 0.46842325386867417, "grad_norm": 1.7540709972381592, "learning_rate": 0.00019216428215877425, "loss": 0.638, "step": 560 }, { "epoch": 0.4726056043496445, "grad_norm": 0.5828922390937805, "learning_rate": 0.00019005642095123895, "loss": 0.6625, "step": 565 }, { "epoch": 0.4767879548306148, "grad_norm": 0.7700462937355042, "learning_rate": 0.00018794000735875208, "loss": 0.6428, "step": 570 }, { "epoch": 0.48097030531158513, "grad_norm": 0.8344655632972717, "learning_rate": 0.00018581549325353126, "loss": 0.6553, "step": 575 }, { "epoch": 0.4851526557925554, "grad_norm": 1.2676873207092285, "learning_rate": 0.000183683332237322, "loss": 0.6645, "step": 580 }, { "epoch": 0.4893350062735257, "grad_norm": 1.4888837337493896, "learning_rate": 0.00018154397954454993, "loss": 0.6859, "step": 585 }, { "epoch": 0.49351735675449604, "grad_norm": 0.7020601034164429, "learning_rate": 0.00017939789194512472, "loss": 0.6456, "step": 590 }, { "epoch": 0.49769970723546636, "grad_norm": 1.1964813470840454, "learning_rate": 0.00017724552764691545, "loss": 0.6594, "step": 595 }, { "epoch": 0.5018820577164367, "grad_norm": 1.1332772970199585, "learning_rate": 0.00017508734619791966, "loss": 0.6606, "step": 600 }, { "epoch": 0.506064408197407, "grad_norm": 1.6122368574142456, "learning_rate": 0.00017292380838814577, "loss": 0.6468, "step": 605 }, { "epoch": 0.5102467586783772, "grad_norm": 0.8950415849685669, "learning_rate": 0.00017075537615123042, "loss": 0.6615, "step": 610 }, { "epoch": 0.5144291091593476, "grad_norm": 1.9138753414154053, "learning_rate": 0.00016858251246581216, "loss": 0.6683, "step": 615 }, { "epoch": 0.5186114596403179, "grad_norm": 0.9320158362388611, "learning_rate": 0.00016640568125668117, "loss": 0.6734, "step": 620 }, { "epoch": 0.5227938101212881, "grad_norm": 1.2331713438034058, "learning_rate": 0.00016422534729572738, "loss": 0.6582, "step": 625 }, { "epoch": 0.5269761606022585, "grad_norm": 1.1182340383529663, "learning_rate": 0.00016204197610270816, "loss": 0.6533, "step": 630 }, { "epoch": 0.5311585110832288, "grad_norm": 0.6500148773193359, "learning_rate": 0.00015985603384585542, "loss": 0.6396, "step": 635 }, { "epoch": 0.5353408615641991, "grad_norm": 0.9531376361846924, "learning_rate": 0.00015766798724234506, "loss": 0.6337, "step": 640 }, { "epoch": 0.5395232120451694, "grad_norm": 0.7729771733283997, "learning_rate": 0.00015547830345864885, "loss": 0.6498, "step": 645 }, { "epoch": 0.5437055625261397, "grad_norm": 0.6831007599830627, "learning_rate": 0.0001532874500107902, "loss": 0.6404, "step": 650 }, { "epoch": 0.54788791300711, "grad_norm": 1.2160038948059082, "learning_rate": 0.00015109589466452594, "loss": 0.658, "step": 655 }, { "epoch": 0.5520702634880803, "grad_norm": 1.015773057937622, "learning_rate": 0.00014890410533547404, "loss": 0.6507, "step": 660 }, { "epoch": 0.5562526139690506, "grad_norm": 1.208256721496582, "learning_rate": 0.00014671254998920976, "loss": 0.6399, "step": 665 }, { "epoch": 0.560434964450021, "grad_norm": 0.7431871294975281, "learning_rate": 0.00014452169654135115, "loss": 0.6534, "step": 670 }, { "epoch": 0.5646173149309912, "grad_norm": 0.6661595702171326, "learning_rate": 0.00014233201275765494, "loss": 0.6343, "step": 675 }, { "epoch": 0.5687996654119615, "grad_norm": 1.2753201723098755, "learning_rate": 0.00014014396615414458, "loss": 0.6296, "step": 680 }, { "epoch": 0.5729820158929319, "grad_norm": 1.4110949039459229, "learning_rate": 0.00013795802389729184, "loss": 0.6452, "step": 685 }, { "epoch": 0.5771643663739021, "grad_norm": 1.4824358224868774, "learning_rate": 0.00013577465270427262, "loss": 0.6348, "step": 690 }, { "epoch": 0.5813467168548725, "grad_norm": 1.8900264501571655, "learning_rate": 0.00013359431874331886, "loss": 0.6509, "step": 695 }, { "epoch": 0.5855290673358428, "grad_norm": 1.652632236480713, "learning_rate": 0.0001314174875341878, "loss": 0.6206, "step": 700 }, { "epoch": 0.589711417816813, "grad_norm": 1.1248772144317627, "learning_rate": 0.00012924462384876953, "loss": 0.6299, "step": 705 }, { "epoch": 0.5938937682977834, "grad_norm": 0.7448098659515381, "learning_rate": 0.00012707619161185423, "loss": 0.6483, "step": 710 }, { "epoch": 0.5980761187787537, "grad_norm": 0.6708864569664001, "learning_rate": 0.00012491265380208032, "loss": 0.6473, "step": 715 }, { "epoch": 0.6022584692597239, "grad_norm": 0.8381022810935974, "learning_rate": 0.00012275447235308453, "loss": 0.6356, "step": 720 }, { "epoch": 0.6064408197406943, "grad_norm": 1.3462333679199219, "learning_rate": 0.00012060210805487529, "loss": 0.6388, "step": 725 }, { "epoch": 0.6106231702216646, "grad_norm": 1.2015129327774048, "learning_rate": 0.00011845602045545008, "loss": 0.6258, "step": 730 }, { "epoch": 0.6148055207026348, "grad_norm": 0.7825962901115417, "learning_rate": 0.00011631666776267803, "loss": 0.6401, "step": 735 }, { "epoch": 0.6189878711836052, "grad_norm": 0.9470372200012207, "learning_rate": 0.00011418450674646868, "loss": 0.6501, "step": 740 }, { "epoch": 0.6231702216645755, "grad_norm": 0.9243600368499756, "learning_rate": 0.00011205999264124786, "loss": 0.6195, "step": 745 }, { "epoch": 0.6273525721455459, "grad_norm": 1.931402325630188, "learning_rate": 0.00010994357904876106, "loss": 0.6264, "step": 750 }, { "epoch": 0.6315349226265161, "grad_norm": 1.7754958868026733, "learning_rate": 0.00010783571784122577, "loss": 0.6351, "step": 755 }, { "epoch": 0.6357172731074864, "grad_norm": 0.627479076385498, "learning_rate": 0.00010573685906485282, "loss": 0.6395, "step": 760 }, { "epoch": 0.6398996235884568, "grad_norm": 2.4568421840667725, "learning_rate": 0.0001036474508437579, "loss": 0.6257, "step": 765 }, { "epoch": 0.644081974069427, "grad_norm": 1.9016671180725098, "learning_rate": 0.0001015679392842831, "loss": 0.6446, "step": 770 }, { "epoch": 0.6482643245503973, "grad_norm": 0.7173120975494385, "learning_rate": 9.949876837974944e-05, "loss": 0.6312, "step": 775 }, { "epoch": 0.6524466750313677, "grad_norm": 1.9048292636871338, "learning_rate": 9.744037991566058e-05, "loss": 0.622, "step": 780 }, { "epoch": 0.6566290255123379, "grad_norm": 1.2454332113265991, "learning_rate": 9.5393213375378e-05, "loss": 0.6219, "step": 785 }, { "epoch": 0.6608113759933082, "grad_norm": 1.0283215045928955, "learning_rate": 9.33577058462873e-05, "loss": 0.6236, "step": 790 }, { "epoch": 0.6649937264742786, "grad_norm": 1.8116226196289062, "learning_rate": 9.133429192647661e-05, "loss": 0.6244, "step": 795 }, { "epoch": 0.6691760769552488, "grad_norm": 0.8519335389137268, "learning_rate": 8.932340363194595e-05, "loss": 0.6253, "step": 800 }, { "epoch": 0.6733584274362192, "grad_norm": 1.004647135734558, "learning_rate": 8.73254703043685e-05, "loss": 0.6278, "step": 805 }, { "epoch": 0.6775407779171895, "grad_norm": 0.945331871509552, "learning_rate": 8.534091851942214e-05, "loss": 0.6251, "step": 810 }, { "epoch": 0.6817231283981597, "grad_norm": 0.4643709659576416, "learning_rate": 8.337017199571235e-05, "loss": 0.6298, "step": 815 }, { "epoch": 0.6859054788791301, "grad_norm": 0.8933060169219971, "learning_rate": 8.141365150430421e-05, "loss": 0.6419, "step": 820 }, { "epoch": 0.6900878293601004, "grad_norm": 2.5518875122070312, "learning_rate": 7.947177477888472e-05, "loss": 0.6424, "step": 825 }, { "epoch": 0.6942701798410706, "grad_norm": 0.7830976247787476, "learning_rate": 7.754495642657282e-05, "loss": 0.6292, "step": 830 }, { "epoch": 0.698452530322041, "grad_norm": 1.2565546035766602, "learning_rate": 7.563360783939722e-05, "loss": 0.6308, "step": 835 }, { "epoch": 0.7026348808030113, "grad_norm": 1.588156819343567, "learning_rate": 7.373813710646083e-05, "loss": 0.6249, "step": 840 }, { "epoch": 0.7068172312839816, "grad_norm": 0.6486766934394836, "learning_rate": 7.185894892681048e-05, "loss": 0.6308, "step": 845 }, { "epoch": 0.7109995817649519, "grad_norm": 0.7454473972320557, "learning_rate": 6.999644452302975e-05, "loss": 0.6267, "step": 850 }, { "epoch": 0.7151819322459222, "grad_norm": 0.6658061146736145, "learning_rate": 6.815102155557501e-05, "loss": 0.6162, "step": 855 }, { "epoch": 0.7193642827268926, "grad_norm": 1.0052908658981323, "learning_rate": 6.632307403787138e-05, "loss": 0.644, "step": 860 }, { "epoch": 0.7235466332078628, "grad_norm": 0.7472626566886902, "learning_rate": 6.451299225218754e-05, "loss": 0.616, "step": 865 }, { "epoch": 0.7277289836888331, "grad_norm": 0.587893009185791, "learning_rate": 6.27211626663071e-05, "loss": 0.6318, "step": 870 }, { "epoch": 0.7319113341698035, "grad_norm": 0.898607611656189, "learning_rate": 6.0947967851014405e-05, "loss": 0.6409, "step": 875 }, { "epoch": 0.7360936846507737, "grad_norm": 0.7444003224372864, "learning_rate": 5.919378639841281e-05, "loss": 0.6214, "step": 880 }, { "epoch": 0.740276035131744, "grad_norm": 1.199029564857483, "learning_rate": 5.745899284109154e-05, "loss": 0.6184, "step": 885 }, { "epoch": 0.7444583856127144, "grad_norm": 1.2874826192855835, "learning_rate": 5.57439575721601e-05, "loss": 0.6233, "step": 890 }, { "epoch": 0.7486407360936846, "grad_norm": 1.3848364353179932, "learning_rate": 5.4049046766166335e-05, "loss": 0.6043, "step": 895 }, { "epoch": 0.7528230865746549, "grad_norm": 0.6463631987571716, "learning_rate": 5.237462230091467e-05, "loss": 0.6361, "step": 900 }, { "epoch": 0.7570054370556253, "grad_norm": 1.0429089069366455, "learning_rate": 5.07210416802025e-05, "loss": 0.6206, "step": 905 }, { "epoch": 0.7611877875365956, "grad_norm": 0.7253705263137817, "learning_rate": 4.908865795748999e-05, "loss": 0.6312, "step": 910 }, { "epoch": 0.7653701380175659, "grad_norm": 0.498542457818985, "learning_rate": 4.74778196605201e-05, "loss": 0.6421, "step": 915 }, { "epoch": 0.7695524884985362, "grad_norm": 0.6464426517486572, "learning_rate": 4.58888707169049e-05, "loss": 0.6047, "step": 920 }, { "epoch": 0.7737348389795065, "grad_norm": 0.6658442616462708, "learning_rate": 4.432215038069449e-05, "loss": 0.623, "step": 925 }, { "epoch": 0.7779171894604768, "grad_norm": 0.6936389207839966, "learning_rate": 4.277799315994286e-05, "loss": 0.6226, "step": 930 }, { "epoch": 0.7820995399414471, "grad_norm": 0.5900949835777283, "learning_rate": 4.125672874528797e-05, "loss": 0.6314, "step": 935 }, { "epoch": 0.7862818904224174, "grad_norm": 0.9340611100196838, "learning_rate": 3.97586819395599e-05, "loss": 0.6252, "step": 940 }, { "epoch": 0.7904642409033877, "grad_norm": 1.272733211517334, "learning_rate": 3.8284172588432716e-05, "loss": 0.6236, "step": 945 }, { "epoch": 0.794646591384358, "grad_norm": 0.7782835364341736, "learning_rate": 3.6833515512134606e-05, "loss": 0.6096, "step": 950 }, { "epoch": 0.7988289418653283, "grad_norm": 0.6020464301109314, "learning_rate": 3.540702043823113e-05, "loss": 0.6124, "step": 955 }, { "epoch": 0.8030112923462986, "grad_norm": 0.675359845161438, "learning_rate": 3.4004991935496004e-05, "loss": 0.5955, "step": 960 }, { "epoch": 0.8071936428272689, "grad_norm": 0.5639395117759705, "learning_rate": 3.262772934888265e-05, "loss": 0.6069, "step": 965 }, { "epoch": 0.8113759933082393, "grad_norm": 0.4711320698261261, "learning_rate": 3.1275526735611896e-05, "loss": 0.6102, "step": 970 }, { "epoch": 0.8155583437892095, "grad_norm": 1.0711911916732788, "learning_rate": 2.9948672802388135e-05, "loss": 0.6391, "step": 975 }, { "epoch": 0.8197406942701798, "grad_norm": 0.4709874987602234, "learning_rate": 2.8647450843757897e-05, "loss": 0.6186, "step": 980 }, { "epoch": 0.8239230447511502, "grad_norm": 0.5402533411979675, "learning_rate": 2.7372138681624244e-05, "loss": 0.613, "step": 985 }, { "epoch": 0.8281053952321205, "grad_norm": 0.6864106059074402, "learning_rate": 2.6123008605929375e-05, "loss": 0.6215, "step": 990 }, { "epoch": 0.8322877457130907, "grad_norm": 0.5939123630523682, "learning_rate": 2.4900327316518326e-05, "loss": 0.6168, "step": 995 }, { "epoch": 0.8364700961940611, "grad_norm": 0.7122395038604736, "learning_rate": 2.3704355866196373e-05, "loss": 0.6053, "step": 1000 }, { "epoch": 0.8406524466750314, "grad_norm": 0.6920621395111084, "learning_rate": 2.2535349604992153e-05, "loss": 0.6097, "step": 1005 }, { "epoch": 0.8448347971560016, "grad_norm": 1.7864691019058228, "learning_rate": 2.1393558125638066e-05, "loss": 0.6382, "step": 1010 }, { "epoch": 0.849017147636972, "grad_norm": 0.7472600936889648, "learning_rate": 2.027922521028018e-05, "loss": 0.6159, "step": 1015 }, { "epoch": 0.8531994981179423, "grad_norm": 1.722579836845398, "learning_rate": 1.9192588778428842e-05, "loss": 0.6011, "step": 1020 }, { "epoch": 0.8573818485989126, "grad_norm": 0.5121240019798279, "learning_rate": 1.813388083616068e-05, "loss": 0.6031, "step": 1025 }, { "epoch": 0.8615641990798829, "grad_norm": 0.5083288550376892, "learning_rate": 1.7103327426583265e-05, "loss": 0.5845, "step": 1030 }, { "epoch": 0.8657465495608532, "grad_norm": 0.46453267335891724, "learning_rate": 1.6101148581573274e-05, "loss": 0.6031, "step": 1035 }, { "epoch": 0.8699289000418235, "grad_norm": 0.8352246880531311, "learning_rate": 1.5127558274797535e-05, "loss": 0.6024, "step": 1040 }, { "epoch": 0.8741112505227938, "grad_norm": 0.8040021061897278, "learning_rate": 1.4182764376028006e-05, "loss": 0.635, "step": 1045 }, { "epoch": 0.8782936010037641, "grad_norm": 0.7152721881866455, "learning_rate": 1.326696860675981e-05, "loss": 0.6162, "step": 1050 }, { "epoch": 0.8824759514847345, "grad_norm": 0.6833348274230957, "learning_rate": 1.2380366497141886e-05, "loss": 0.6217, "step": 1055 }, { "epoch": 0.8866583019657047, "grad_norm": 0.6803585886955261, "learning_rate": 1.1523147344229716e-05, "loss": 0.6218, "step": 1060 }, { "epoch": 0.890840652446675, "grad_norm": 0.9396490454673767, "learning_rate": 1.069549417156887e-05, "loss": 0.6176, "step": 1065 }, { "epoch": 0.8950230029276454, "grad_norm": 0.7006051540374756, "learning_rate": 9.89758369011781e-06, "loss": 0.6123, "step": 1070 }, { "epoch": 0.8992053534086156, "grad_norm": 0.5886880159378052, "learning_rate": 9.129586260518634e-06, "loss": 0.5923, "step": 1075 }, { "epoch": 0.903387703889586, "grad_norm": 0.5032349228858948, "learning_rate": 8.391665856723655e-06, "loss": 0.619, "step": 1080 }, { "epoch": 0.9075700543705563, "grad_norm": 0.51763916015625, "learning_rate": 7.683980030985654e-06, "loss": 0.6039, "step": 1085 }, { "epoch": 0.9117524048515265, "grad_norm": 0.588211715221405, "learning_rate": 7.006679880218974e-06, "loss": 0.6057, "step": 1090 }, { "epoch": 0.9159347553324969, "grad_norm": 0.6631506681442261, "learning_rate": 6.359910013739122e-06, "loss": 0.6106, "step": 1095 }, { "epoch": 0.9201171058134672, "grad_norm": 0.5523665547370911, "learning_rate": 5.743808522387544e-06, "loss": 0.6058, "step": 1100 }, { "epoch": 0.9242994562944374, "grad_norm": 0.5903011560440063, "learning_rate": 5.158506949047975e-06, "loss": 0.6321, "step": 1105 }, { "epoch": 0.9284818067754078, "grad_norm": 0.6151677966117859, "learning_rate": 4.604130260560873e-06, "loss": 0.6171, "step": 1110 }, { "epoch": 0.9326641572563781, "grad_norm": 0.5324861407279968, "learning_rate": 4.080796821042082e-06, "loss": 0.6184, "step": 1115 }, { "epoch": 0.9368465077373483, "grad_norm": 0.5442612171173096, "learning_rate": 3.5886183666109405e-06, "loss": 0.6069, "step": 1120 }, { "epoch": 0.9410288582183187, "grad_norm": 0.4981847405433655, "learning_rate": 3.1276999815337544e-06, "loss": 0.62, "step": 1125 }, { "epoch": 0.945211208699289, "grad_norm": 0.45795848965644836, "learning_rate": 2.6981400757874584e-06, "loss": 0.6027, "step": 1130 }, { "epoch": 0.9493935591802594, "grad_norm": 0.78538978099823, "learning_rate": 2.3000303640481386e-06, "loss": 0.6084, "step": 1135 }, { "epoch": 0.9535759096612296, "grad_norm": 0.5453292727470398, "learning_rate": 1.9334558461092663e-06, "loss": 0.6043, "step": 1140 }, { "epoch": 0.9577582601421999, "grad_norm": 0.5924518704414368, "learning_rate": 1.598494788733462e-06, "loss": 0.6033, "step": 1145 }, { "epoch": 0.9619406106231703, "grad_norm": 0.5484139323234558, "learning_rate": 1.2952187089419642e-06, "loss": 0.616, "step": 1150 }, { "epoch": 0.9661229611041405, "grad_norm": 0.5363529324531555, "learning_rate": 1.0236923587450263e-06, "loss": 0.6196, "step": 1155 }, { "epoch": 0.9703053115851108, "grad_norm": 0.6777392625808716, "learning_rate": 7.839737113168931e-07, "loss": 0.6102, "step": 1160 }, { "epoch": 0.9744876620660812, "grad_norm": 0.5920884609222412, "learning_rate": 5.761139486180178e-07, "loss": 0.6075, "step": 1165 }, { "epoch": 0.9786700125470514, "grad_norm": 0.4196658730506897, "learning_rate": 4.0015745046725336e-07, "loss": 0.6017, "step": 1170 }, { "epoch": 0.9828523630280217, "grad_norm": 0.6661626100540161, "learning_rate": 2.5614178506644934e-07, "loss": 0.5787, "step": 1175 }, { "epoch": 0.9870347135089921, "grad_norm": 0.6183582544326782, "learning_rate": 1.4409770097926765e-07, "loss": 0.6188, "step": 1180 }, { "epoch": 0.9912170639899623, "grad_norm": 0.5524072051048279, "learning_rate": 6.40491205661009e-08, "loss": 0.6089, "step": 1185 }, { "epoch": 0.9953994144709327, "grad_norm": 0.5690078735351562, "learning_rate": 1.6013134876491362e-08, "loss": 0.6079, "step": 1190 }, { "epoch": 0.999581764951903, "grad_norm": 0.4828049838542938, "learning_rate": 0.0, "loss": 0.5901, "step": 1195 }, { "epoch": 0.999581764951903, "eval_loss": 1.2799842357635498, "eval_runtime": 0.8401, "eval_samples_per_second": 5.951, "eval_steps_per_second": 1.19, "step": 1195 }, { "epoch": 0.999581764951903, "step": 1195, "total_flos": 9.109418934146171e+17, "train_loss": 1.440422640086218, "train_runtime": 6570.4384, "train_samples_per_second": 2.911, "train_steps_per_second": 0.182 } ], "logging_steps": 5, "max_steps": 1195, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.109418934146171e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }