{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.942375227311212, "eval_steps": 100000, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000942375227311212, "grad_norm": 14.25, "learning_rate": 1e-05, "loss": 0.46586317, "memory(GiB)": 64.76, "step": 1, "train_speed(iter/s)": 0.003324 }, { "epoch": 0.00471187613655606, "grad_norm": 2.46875, "learning_rate": 9.999648647603774e-06, "loss": 0.26192743, "memory(GiB)": 75.3, "step": 5, "train_speed(iter/s)": 0.003362 }, { "epoch": 0.00942375227311212, "grad_norm": 1.15625, "learning_rate": 9.998221363123425e-06, "loss": 0.10271888, "memory(GiB)": 75.3, "step": 10, "train_speed(iter/s)": 0.003359 }, { "epoch": 0.01413562840966818, "grad_norm": 1.1796875, "learning_rate": 9.995696500215899e-06, "loss": 0.09046092, "memory(GiB)": 75.3, "step": 15, "train_speed(iter/s)": 0.003358 }, { "epoch": 0.01884750454622424, "grad_norm": 1.140625, "learning_rate": 9.992074613325435e-06, "loss": 0.08653282, "memory(GiB)": 75.3, "step": 20, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.023559380682780302, "grad_norm": 1.078125, "learning_rate": 9.987356497795944e-06, "loss": 0.08451628, "memory(GiB)": 75.3, "step": 25, "train_speed(iter/s)": 0.003358 }, { "epoch": 0.02827125681933636, "grad_norm": 1.09375, "learning_rate": 9.981543189696349e-06, "loss": 0.0772208, "memory(GiB)": 75.3, "step": 30, "train_speed(iter/s)": 0.003356 }, { "epoch": 0.03298313295589242, "grad_norm": 1.125, "learning_rate": 9.97463596559307e-06, "loss": 0.08322463, "memory(GiB)": 75.3, "step": 35, "train_speed(iter/s)": 0.003356 }, { "epoch": 0.03769500909244848, "grad_norm": 1.09375, "learning_rate": 9.966636342269706e-06, "loss": 0.07725406, "memory(GiB)": 75.3, "step": 40, "train_speed(iter/s)": 0.003355 }, { "epoch": 0.04240688522900454, "grad_norm": 1.15625, "learning_rate": 9.957546076393944e-06, "loss": 0.07683957, "memory(GiB)": 75.3, "step": 45, "train_speed(iter/s)": 0.003356 }, { "epoch": 0.047118761365560605, "grad_norm": 1.1328125, "learning_rate": 9.947367164131823e-06, "loss": 0.07508552, "memory(GiB)": 75.3, "step": 50, "train_speed(iter/s)": 0.003355 }, { "epoch": 0.05183063750211667, "grad_norm": 1.0703125, "learning_rate": 9.936101840709373e-06, "loss": 0.07236413, "memory(GiB)": 75.3, "step": 55, "train_speed(iter/s)": 0.003353 }, { "epoch": 0.05654251363867272, "grad_norm": 1.0703125, "learning_rate": 9.923752579921787e-06, "loss": 0.07231579, "memory(GiB)": 75.3, "step": 60, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.06125438977522878, "grad_norm": 1.0234375, "learning_rate": 9.910322093590177e-06, "loss": 0.07145001, "memory(GiB)": 75.3, "step": 65, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.06596626591178484, "grad_norm": 1.0546875, "learning_rate": 9.895813330966086e-06, "loss": 0.07301619, "memory(GiB)": 75.3, "step": 70, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.0706781420483409, "grad_norm": 1.1015625, "learning_rate": 9.880229478083849e-06, "loss": 0.0724276, "memory(GiB)": 75.3, "step": 75, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.07539001818489696, "grad_norm": 1.0390625, "learning_rate": 9.863573957060953e-06, "loss": 0.06874905, "memory(GiB)": 75.3, "step": 80, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.08010189432145302, "grad_norm": 1.0859375, "learning_rate": 9.845850425346563e-06, "loss": 0.07212579, "memory(GiB)": 75.3, "step": 85, "train_speed(iter/s)": 0.003347 }, { "epoch": 0.08481377045800909, "grad_norm": 1.1171875, "learning_rate": 9.827062774918377e-06, "loss": 0.07294501, "memory(GiB)": 75.3, "step": 90, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.08952564659456515, "grad_norm": 0.98828125, "learning_rate": 9.807215131427966e-06, "loss": 0.06517277, "memory(GiB)": 75.3, "step": 95, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.09423752273112121, "grad_norm": 0.984375, "learning_rate": 9.786311853294799e-06, "loss": 0.06962139, "memory(GiB)": 75.3, "step": 100, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.09894939886767727, "grad_norm": 0.98828125, "learning_rate": 9.764357530749178e-06, "loss": 0.06724482, "memory(GiB)": 75.3, "step": 105, "train_speed(iter/s)": 0.003339 }, { "epoch": 0.10366127500423333, "grad_norm": 1.015625, "learning_rate": 9.741356984824234e-06, "loss": 0.06572815, "memory(GiB)": 75.3, "step": 110, "train_speed(iter/s)": 0.003339 }, { "epoch": 0.10837315114078938, "grad_norm": 1.0390625, "learning_rate": 9.717315266297277e-06, "loss": 0.06739124, "memory(GiB)": 75.3, "step": 115, "train_speed(iter/s)": 0.003342 }, { "epoch": 0.11308502727734544, "grad_norm": 0.9375, "learning_rate": 9.692237654580658e-06, "loss": 0.06834027, "memory(GiB)": 75.3, "step": 120, "train_speed(iter/s)": 0.003342 }, { "epoch": 0.1177969034139015, "grad_norm": 1.078125, "learning_rate": 9.66612965656245e-06, "loss": 0.0658385, "memory(GiB)": 75.3, "step": 125, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.12250877955045757, "grad_norm": 1.0859375, "learning_rate": 9.638997005397174e-06, "loss": 0.0717117, "memory(GiB)": 75.3, "step": 130, "train_speed(iter/s)": 0.003344 }, { "epoch": 0.12722065568701363, "grad_norm": 0.9765625, "learning_rate": 9.610845659246833e-06, "loss": 0.0667814, "memory(GiB)": 75.3, "step": 135, "train_speed(iter/s)": 0.003344 }, { "epoch": 0.13193253182356968, "grad_norm": 0.9140625, "learning_rate": 9.581681799972528e-06, "loss": 0.06573244, "memory(GiB)": 75.3, "step": 140, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.13664440796012575, "grad_norm": 1.03125, "learning_rate": 9.551511831776966e-06, "loss": 0.06967602, "memory(GiB)": 75.3, "step": 145, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.1413562840966818, "grad_norm": 0.90625, "learning_rate": 9.520342379798141e-06, "loss": 0.06216406, "memory(GiB)": 75.3, "step": 150, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.14606816023323788, "grad_norm": 1.0390625, "learning_rate": 9.488180288654485e-06, "loss": 0.06460171, "memory(GiB)": 75.3, "step": 155, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.15078003636979392, "grad_norm": 1.078125, "learning_rate": 9.45503262094184e-06, "loss": 0.06467786, "memory(GiB)": 75.3, "step": 160, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.15549191250635, "grad_norm": 1.0625, "learning_rate": 9.420906655682553e-06, "loss": 0.06358048, "memory(GiB)": 75.3, "step": 165, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.16020378864290605, "grad_norm": 1.015625, "learning_rate": 9.385809886727044e-06, "loss": 0.06778824, "memory(GiB)": 75.3, "step": 170, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.16491566477946212, "grad_norm": 1.046875, "learning_rate": 9.349750021108212e-06, "loss": 0.06321884, "memory(GiB)": 75.3, "step": 175, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.16962754091601817, "grad_norm": 0.97265625, "learning_rate": 9.31273497734901e-06, "loss": 0.06310185, "memory(GiB)": 75.3, "step": 180, "train_speed(iter/s)": 0.003344 }, { "epoch": 0.17433941705257422, "grad_norm": 0.9765625, "learning_rate": 9.274772883723587e-06, "loss": 0.06271737, "memory(GiB)": 75.3, "step": 185, "train_speed(iter/s)": 0.003344 }, { "epoch": 0.1790512931891303, "grad_norm": 0.97265625, "learning_rate": 9.235872076472378e-06, "loss": 0.06393245, "memory(GiB)": 75.3, "step": 190, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.18376316932568634, "grad_norm": 1.03125, "learning_rate": 9.196041097971509e-06, "loss": 0.06558744, "memory(GiB)": 75.3, "step": 195, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.18847504546224242, "grad_norm": 0.98046875, "learning_rate": 9.155288694856942e-06, "loss": 0.06127087, "memory(GiB)": 75.3, "step": 200, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.19318692159879847, "grad_norm": 0.875, "learning_rate": 9.113623816103775e-06, "loss": 0.06313071, "memory(GiB)": 75.3, "step": 205, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.19789879773535454, "grad_norm": 1.0, "learning_rate": 9.071055611061102e-06, "loss": 0.06330621, "memory(GiB)": 75.3, "step": 210, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.2026106738719106, "grad_norm": 0.9453125, "learning_rate": 9.027593427442867e-06, "loss": 0.06415906, "memory(GiB)": 75.3, "step": 215, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.20732255000846667, "grad_norm": 0.94140625, "learning_rate": 8.98324680927517e-06, "loss": 0.06299359, "memory(GiB)": 75.3, "step": 220, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.21203442614502271, "grad_norm": 0.9609375, "learning_rate": 8.938025494800454e-06, "loss": 0.06004124, "memory(GiB)": 75.3, "step": 225, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.21674630228157876, "grad_norm": 0.97265625, "learning_rate": 8.891939414339048e-06, "loss": 0.06477681, "memory(GiB)": 75.3, "step": 230, "train_speed(iter/s)": 0.003344 }, { "epoch": 0.22145817841813484, "grad_norm": 0.92578125, "learning_rate": 8.844998688108535e-06, "loss": 0.06010489, "memory(GiB)": 75.3, "step": 235, "train_speed(iter/s)": 0.003344 }, { "epoch": 0.22617005455469089, "grad_norm": 0.9609375, "learning_rate": 8.797213624001403e-06, "loss": 0.05960445, "memory(GiB)": 75.3, "step": 240, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.23088193069124696, "grad_norm": 1.0, "learning_rate": 8.748594715321512e-06, "loss": 0.06301316, "memory(GiB)": 75.3, "step": 245, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.235593806827803, "grad_norm": 0.94921875, "learning_rate": 8.699152638479817e-06, "loss": 0.06120233, "memory(GiB)": 75.3, "step": 250, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.24030568296435909, "grad_norm": 0.97265625, "learning_rate": 8.6488982506499e-06, "loss": 0.06014684, "memory(GiB)": 75.3, "step": 255, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.24501755910091513, "grad_norm": 1.0, "learning_rate": 8.597842587383797e-06, "loss": 0.05922247, "memory(GiB)": 75.3, "step": 260, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.24972943523747118, "grad_norm": 0.97265625, "learning_rate": 8.545996860188668e-06, "loss": 0.05851297, "memory(GiB)": 75.3, "step": 265, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.25444131137402726, "grad_norm": 1.0625, "learning_rate": 8.493372454064809e-06, "loss": 0.05934198, "memory(GiB)": 75.3, "step": 270, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.2591531875105833, "grad_norm": 0.90234375, "learning_rate": 8.439980925005587e-06, "loss": 0.06134464, "memory(GiB)": 75.3, "step": 275, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.26386506364713935, "grad_norm": 0.90234375, "learning_rate": 8.385833997459804e-06, "loss": 0.05825667, "memory(GiB)": 75.3, "step": 280, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.26857693978369546, "grad_norm": 0.8828125, "learning_rate": 8.330943561757092e-06, "loss": 0.06092241, "memory(GiB)": 75.3, "step": 285, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.2732888159202515, "grad_norm": 0.91796875, "learning_rate": 8.275321671496862e-06, "loss": 0.05940055, "memory(GiB)": 75.3, "step": 290, "train_speed(iter/s)": 0.003347 }, { "epoch": 0.27800069205680755, "grad_norm": 0.9140625, "learning_rate": 8.218980540901417e-06, "loss": 0.05920713, "memory(GiB)": 75.3, "step": 295, "train_speed(iter/s)": 0.003347 }, { "epoch": 0.2827125681933636, "grad_norm": 0.92578125, "learning_rate": 8.16193254213377e-06, "loss": 0.05777416, "memory(GiB)": 75.3, "step": 300, "train_speed(iter/s)": 0.003347 }, { "epoch": 0.2874244443299197, "grad_norm": 0.91015625, "learning_rate": 8.104190202580811e-06, "loss": 0.05302551, "memory(GiB)": 75.3, "step": 305, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.29213632046647575, "grad_norm": 0.91796875, "learning_rate": 8.045766202102358e-06, "loss": 0.05804279, "memory(GiB)": 75.3, "step": 310, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.2968481966030318, "grad_norm": 0.9375, "learning_rate": 7.986673370246743e-06, "loss": 0.05822692, "memory(GiB)": 75.3, "step": 315, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.30156007273958785, "grad_norm": 1.0078125, "learning_rate": 7.926924683433523e-06, "loss": 0.06007032, "memory(GiB)": 75.3, "step": 320, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.3062719488761439, "grad_norm": 0.921875, "learning_rate": 7.866533262103937e-06, "loss": 0.06018423, "memory(GiB)": 75.3, "step": 325, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.3109838250127, "grad_norm": 0.9375, "learning_rate": 7.805512367839742e-06, "loss": 0.05931915, "memory(GiB)": 75.3, "step": 330, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.31569570114925605, "grad_norm": 1.015625, "learning_rate": 7.743875400451047e-06, "loss": 0.0566447, "memory(GiB)": 75.3, "step": 335, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.3204075772858121, "grad_norm": 0.8203125, "learning_rate": 7.681635895033798e-06, "loss": 0.05161901, "memory(GiB)": 75.3, "step": 340, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.32511945342236814, "grad_norm": 1.0, "learning_rate": 7.6188075189975644e-06, "loss": 0.05694907, "memory(GiB)": 75.3, "step": 345, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.32983132955892425, "grad_norm": 1.0390625, "learning_rate": 7.555404069064245e-06, "loss": 0.05555046, "memory(GiB)": 75.3, "step": 350, "train_speed(iter/s)": 0.003347 }, { "epoch": 0.3345432056954803, "grad_norm": 0.97265625, "learning_rate": 7.491439468238404e-06, "loss": 0.05587023, "memory(GiB)": 75.3, "step": 355, "train_speed(iter/s)": 0.003347 }, { "epoch": 0.33925508183203634, "grad_norm": 0.96875, "learning_rate": 7.426927762749867e-06, "loss": 0.05913154, "memory(GiB)": 75.3, "step": 360, "train_speed(iter/s)": 0.003347 }, { "epoch": 0.3439669579685924, "grad_norm": 0.89453125, "learning_rate": 7.361883118969248e-06, "loss": 0.05830712, "memory(GiB)": 75.3, "step": 365, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.34867883410514844, "grad_norm": 0.89453125, "learning_rate": 7.2963198202971055e-06, "loss": 0.05937972, "memory(GiB)": 75.3, "step": 370, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.35339071024170454, "grad_norm": 0.9375, "learning_rate": 7.230252264027398e-06, "loss": 0.0565136, "memory(GiB)": 75.3, "step": 375, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.3581025863782606, "grad_norm": 0.96875, "learning_rate": 7.163694958185928e-06, "loss": 0.05636386, "memory(GiB)": 75.3, "step": 380, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.36281446251481664, "grad_norm": 0.96875, "learning_rate": 7.09666251834447e-06, "loss": 0.06038175, "memory(GiB)": 75.3, "step": 385, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.3675263386513727, "grad_norm": 0.92578125, "learning_rate": 7.0291696644112705e-06, "loss": 0.05833557, "memory(GiB)": 75.3, "step": 390, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.3722382147879288, "grad_norm": 0.8359375, "learning_rate": 6.9612312173986675e-06, "loss": 0.05632974, "memory(GiB)": 75.3, "step": 395, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.37695009092448484, "grad_norm": 0.921875, "learning_rate": 6.892862096168469e-06, "loss": 0.05656151, "memory(GiB)": 75.3, "step": 400, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.3816619670610409, "grad_norm": 0.98828125, "learning_rate": 6.824077314155877e-06, "loss": 0.05432441, "memory(GiB)": 75.3, "step": 405, "train_speed(iter/s)": 0.003347 }, { "epoch": 0.38637384319759693, "grad_norm": 0.9453125, "learning_rate": 6.75489197607262e-06, "loss": 0.05709869, "memory(GiB)": 75.3, "step": 410, "train_speed(iter/s)": 0.003347 }, { "epoch": 0.391085719334153, "grad_norm": 1.0546875, "learning_rate": 6.6853212745900585e-06, "loss": 0.05979726, "memory(GiB)": 75.3, "step": 415, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.3957975954707091, "grad_norm": 0.9140625, "learning_rate": 6.615380487002969e-06, "loss": 0.0600209, "memory(GiB)": 75.3, "step": 420, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.40050947160726513, "grad_norm": 0.94140625, "learning_rate": 6.545084971874738e-06, "loss": 0.0563777, "memory(GiB)": 75.3, "step": 425, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.4052213477438212, "grad_norm": 0.91796875, "learning_rate": 6.474450165664722e-06, "loss": 0.05698464, "memory(GiB)": 75.3, "step": 430, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.40993322388037723, "grad_norm": 0.890625, "learning_rate": 6.4034915793385e-06, "loss": 0.05311573, "memory(GiB)": 75.3, "step": 435, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.41464510001693333, "grad_norm": 0.97265625, "learning_rate": 6.332224794961752e-06, "loss": 0.05458606, "memory(GiB)": 75.3, "step": 440, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.4193569761534894, "grad_norm": 0.95703125, "learning_rate": 6.260665462278544e-06, "loss": 0.05579169, "memory(GiB)": 75.3, "step": 445, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.42406885229004543, "grad_norm": 0.99609375, "learning_rate": 6.18882929527473e-06, "loss": 0.06002288, "memory(GiB)": 75.3, "step": 450, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.4287807284266015, "grad_norm": 0.94140625, "learning_rate": 6.116732068727271e-06, "loss": 0.05494517, "memory(GiB)": 75.3, "step": 455, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.4334926045631575, "grad_norm": 0.953125, "learning_rate": 6.0443896147401856e-06, "loss": 0.0547879, "memory(GiB)": 75.3, "step": 460, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.4382044806997136, "grad_norm": 0.82421875, "learning_rate": 5.971817819267914e-06, "loss": 0.05363967, "memory(GiB)": 75.3, "step": 465, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.4429163568362697, "grad_norm": 0.91796875, "learning_rate": 5.8990326186268655e-06, "loss": 0.056594, "memory(GiB)": 75.3, "step": 470, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.4476282329728257, "grad_norm": 0.9765625, "learning_rate": 5.826049995995905e-06, "loss": 0.05898719, "memory(GiB)": 75.3, "step": 475, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.45234010910938177, "grad_norm": 1.3671875, "learning_rate": 5.752885977906539e-06, "loss": 0.05439388, "memory(GiB)": 75.3, "step": 480, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.4570519852459379, "grad_norm": 1.0390625, "learning_rate": 5.679556630723592e-06, "loss": 0.05334362, "memory(GiB)": 75.3, "step": 485, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.4617638613824939, "grad_norm": 0.9765625, "learning_rate": 5.606078057117136e-06, "loss": 0.06019425, "memory(GiB)": 75.3, "step": 490, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.46647573751904997, "grad_norm": 0.95703125, "learning_rate": 5.532466392526439e-06, "loss": 0.05597678, "memory(GiB)": 75.3, "step": 495, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.471187613655606, "grad_norm": 0.86328125, "learning_rate": 5.458737801616721e-06, "loss": 0.05094014, "memory(GiB)": 75.3, "step": 500, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.47589948979216207, "grad_norm": 0.875, "learning_rate": 5.384908474729501e-06, "loss": 0.0548723, "memory(GiB)": 75.3, "step": 505, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.48061136592871817, "grad_norm": 0.8984375, "learning_rate": 5.310994624327292e-06, "loss": 0.05574841, "memory(GiB)": 75.3, "step": 510, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.4853232420652742, "grad_norm": 0.8671875, "learning_rate": 5.23701248143345e-06, "loss": 0.05651059, "memory(GiB)": 75.3, "step": 515, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.49003511820183027, "grad_norm": 0.921875, "learning_rate": 5.162978292067933e-06, "loss": 0.05878415, "memory(GiB)": 75.3, "step": 520, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.4947469943383863, "grad_norm": 1.0234375, "learning_rate": 5.088908313679788e-06, "loss": 0.05620171, "memory(GiB)": 75.3, "step": 525, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.49945887047494236, "grad_norm": 0.86328125, "learning_rate": 5.014818811577104e-06, "loss": 0.05407885, "memory(GiB)": 75.3, "step": 530, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.5041707466114984, "grad_norm": 0.84375, "learning_rate": 4.940726055355259e-06, "loss": 0.05323058, "memory(GiB)": 75.3, "step": 535, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.5088826227480545, "grad_norm": 0.83984375, "learning_rate": 4.866646315324217e-06, "loss": 0.05346375, "memory(GiB)": 75.3, "step": 540, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.5135944988846106, "grad_norm": 0.828125, "learning_rate": 4.792595858935668e-06, "loss": 0.05774211, "memory(GiB)": 75.3, "step": 545, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.5183063750211666, "grad_norm": 0.9921875, "learning_rate": 4.718590947210788e-06, "loss": 0.05547717, "memory(GiB)": 75.3, "step": 550, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.5230182511577227, "grad_norm": 0.8046875, "learning_rate": 4.644647831169435e-06, "loss": 0.05536319, "memory(GiB)": 75.3, "step": 555, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.5277301272942787, "grad_norm": 1.015625, "learning_rate": 4.570782748261516e-06, "loss": 0.05369086, "memory(GiB)": 75.3, "step": 560, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.5324420034308348, "grad_norm": 0.94140625, "learning_rate": 4.497011918801347e-06, "loss": 0.05471834, "memory(GiB)": 75.3, "step": 565, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.5371538795673909, "grad_norm": 0.9140625, "learning_rate": 4.423351542405764e-06, "loss": 0.05114409, "memory(GiB)": 75.3, "step": 570, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.5418657557039469, "grad_norm": 0.9765625, "learning_rate": 4.349817794436805e-06, "loss": 0.05673685, "memory(GiB)": 75.3, "step": 575, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.546577631840503, "grad_norm": 0.88671875, "learning_rate": 4.276426822449682e-06, "loss": 0.05527523, "memory(GiB)": 75.3, "step": 580, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.551289507977059, "grad_norm": 0.90625, "learning_rate": 4.203194742646893e-06, "loss": 0.05317973, "memory(GiB)": 75.3, "step": 585, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.5560013841136151, "grad_norm": 1.0078125, "learning_rate": 4.130137636339191e-06, "loss": 0.05449303, "memory(GiB)": 75.3, "step": 590, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.5607132602501712, "grad_norm": 0.89453125, "learning_rate": 4.057271546414242e-06, "loss": 0.05341119, "memory(GiB)": 75.3, "step": 595, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.5654251363867272, "grad_norm": 0.8515625, "learning_rate": 3.984612473813689e-06, "loss": 0.05254069, "memory(GiB)": 75.3, "step": 600, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.5701370125232833, "grad_norm": 0.8984375, "learning_rate": 3.912176374019462e-06, "loss": 0.05324795, "memory(GiB)": 75.3, "step": 605, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.5748488886598394, "grad_norm": 0.8671875, "learning_rate": 3.839979153550039e-06, "loss": 0.05177047, "memory(GiB)": 75.3, "step": 610, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.5795607647963954, "grad_norm": 0.82421875, "learning_rate": 3.768036666467486e-06, "loss": 0.05265539, "memory(GiB)": 75.3, "step": 615, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.5842726409329515, "grad_norm": 0.88671875, "learning_rate": 3.6963647108959868e-06, "loss": 0.05418316, "memory(GiB)": 75.3, "step": 620, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.5889845170695075, "grad_norm": 0.93359375, "learning_rate": 3.6249790255526916e-06, "loss": 0.05562772, "memory(GiB)": 75.3, "step": 625, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.5936963932060636, "grad_norm": 0.90234375, "learning_rate": 3.553895286291577e-06, "loss": 0.05445199, "memory(GiB)": 75.3, "step": 630, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.5984082693426197, "grad_norm": 0.90234375, "learning_rate": 3.483129102661137e-06, "loss": 0.05333483, "memory(GiB)": 75.3, "step": 635, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6031201454791757, "grad_norm": 0.8515625, "learning_rate": 3.4126960144766107e-06, "loss": 0.05417204, "memory(GiB)": 75.3, "step": 640, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6078320216157318, "grad_norm": 0.91015625, "learning_rate": 3.3426114884075488e-06, "loss": 0.05412987, "memory(GiB)": 75.3, "step": 645, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6125438977522878, "grad_norm": 0.87109375, "learning_rate": 3.272890914581417e-06, "loss": 0.05388454, "memory(GiB)": 75.3, "step": 650, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.6172557738888439, "grad_norm": 0.85546875, "learning_rate": 3.2035496032040303e-06, "loss": 0.05097753, "memory(GiB)": 75.3, "step": 655, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.6219676500254, "grad_norm": 0.875, "learning_rate": 3.134602781197515e-06, "loss": 0.05341196, "memory(GiB)": 75.3, "step": 660, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.626679526161956, "grad_norm": 0.90625, "learning_rate": 3.0660655888565827e-06, "loss": 0.05016219, "memory(GiB)": 75.3, "step": 665, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6313914022985121, "grad_norm": 0.95703125, "learning_rate": 2.997953076523803e-06, "loss": 0.05216441, "memory(GiB)": 75.3, "step": 670, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6361032784350681, "grad_norm": 1.015625, "learning_rate": 2.930280201284654e-06, "loss": 0.05449665, "memory(GiB)": 75.3, "step": 675, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6408151545716242, "grad_norm": 0.921875, "learning_rate": 2.863061823683032e-06, "loss": 0.05129569, "memory(GiB)": 75.3, "step": 680, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6455270307081803, "grad_norm": 0.87890625, "learning_rate": 2.7963127044579697e-06, "loss": 0.05290835, "memory(GiB)": 75.3, "step": 685, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6502389068447363, "grad_norm": 0.87109375, "learning_rate": 2.7300475013022666e-06, "loss": 0.0528672, "memory(GiB)": 75.3, "step": 690, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6549507829812924, "grad_norm": 0.984375, "learning_rate": 2.6642807656437565e-06, "loss": 0.05229232, "memory(GiB)": 75.3, "step": 695, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6596626591178485, "grad_norm": 0.9609375, "learning_rate": 2.599026939449899e-06, "loss": 0.05371115, "memory(GiB)": 75.3, "step": 700, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6643745352544045, "grad_norm": 1.0703125, "learning_rate": 2.534300352056416e-06, "loss": 0.05234203, "memory(GiB)": 75.3, "step": 705, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6690864113909606, "grad_norm": 0.98828125, "learning_rate": 2.470115217020654e-06, "loss": 0.05360326, "memory(GiB)": 75.3, "step": 710, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6737982875275166, "grad_norm": 0.92578125, "learning_rate": 2.4064856290003863e-06, "loss": 0.05475932, "memory(GiB)": 75.3, "step": 715, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6785101636640727, "grad_norm": 1.0703125, "learning_rate": 2.3434255606586925e-06, "loss": 0.05548735, "memory(GiB)": 75.3, "step": 720, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6832220398006288, "grad_norm": 0.89453125, "learning_rate": 2.2809488595956746e-06, "loss": 0.05201564, "memory(GiB)": 75.3, "step": 725, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.6879339159371848, "grad_norm": 0.9140625, "learning_rate": 2.219069245307589e-06, "loss": 0.05408272, "memory(GiB)": 75.3, "step": 730, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.6926457920737409, "grad_norm": 1.1640625, "learning_rate": 2.157800306174139e-06, "loss": 0.05537663, "memory(GiB)": 75.3, "step": 735, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.6973576682102969, "grad_norm": 1.125, "learning_rate": 2.0971554964745476e-06, "loss": 0.05455139, "memory(GiB)": 75.3, "step": 740, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.702069544346853, "grad_norm": 0.87890625, "learning_rate": 2.0371481334330913e-06, "loss": 0.05394316, "memory(GiB)": 75.3, "step": 745, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.7067814204834091, "grad_norm": 0.828125, "learning_rate": 1.9777913942946987e-06, "loss": 0.05269849, "memory(GiB)": 75.3, "step": 750, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.7114932966199651, "grad_norm": 0.81640625, "learning_rate": 1.919098313431335e-06, "loss": 0.05057405, "memory(GiB)": 75.3, "step": 755, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.7162051727565212, "grad_norm": 0.9375, "learning_rate": 1.8610817794797164e-06, "loss": 0.05438253, "memory(GiB)": 75.3, "step": 760, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.7209170488930772, "grad_norm": 0.84375, "learning_rate": 1.8037545325110506e-06, "loss": 0.05222658, "memory(GiB)": 75.3, "step": 765, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.7256289250296333, "grad_norm": 0.88671875, "learning_rate": 1.7471291612333997e-06, "loss": 0.05131737, "memory(GiB)": 75.3, "step": 770, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.7303408011661894, "grad_norm": 0.93359375, "learning_rate": 1.6912181002272714e-06, "loss": 0.05391481, "memory(GiB)": 75.3, "step": 775, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.7350526773027454, "grad_norm": 0.8984375, "learning_rate": 1.6360336272150684e-06, "loss": 0.05078862, "memory(GiB)": 75.3, "step": 780, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.7397645534393015, "grad_norm": 0.953125, "learning_rate": 1.581587860364977e-06, "loss": 0.05192038, "memory(GiB)": 75.3, "step": 785, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.7444764295758576, "grad_norm": 0.96875, "learning_rate": 1.52789275562988e-06, "loss": 0.05364103, "memory(GiB)": 75.3, "step": 790, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.7491883057124136, "grad_norm": 1.0625, "learning_rate": 1.4749601041219246e-06, "loss": 0.0536845, "memory(GiB)": 75.3, "step": 795, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.7539001818489697, "grad_norm": 0.90234375, "learning_rate": 1.4228015295232484e-06, "loss": 0.05084696, "memory(GiB)": 75.3, "step": 800, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.7586120579855257, "grad_norm": 0.875, "learning_rate": 1.371428485533498e-06, "loss": 0.05773014, "memory(GiB)": 75.3, "step": 805, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.7633239341220818, "grad_norm": 0.91796875, "learning_rate": 1.3208522533546748e-06, "loss": 0.05219783, "memory(GiB)": 75.3, "step": 810, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.7680358102586379, "grad_norm": 0.96875, "learning_rate": 1.2710839392138386e-06, "loss": 0.05375321, "memory(GiB)": 75.3, "step": 815, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.7727476863951939, "grad_norm": 0.87890625, "learning_rate": 1.222134471924259e-06, "loss": 0.05204231, "memory(GiB)": 75.3, "step": 820, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.77745956253175, "grad_norm": 0.91796875, "learning_rate": 1.1740146004855141e-06, "loss": 0.0559127, "memory(GiB)": 75.3, "step": 825, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.782171438668306, "grad_norm": 0.89453125, "learning_rate": 1.1267348917230737e-06, "loss": 0.05298336, "memory(GiB)": 75.3, "step": 830, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.7868833148048621, "grad_norm": 0.90625, "learning_rate": 1.080305727967893e-06, "loss": 0.05347639, "memory(GiB)": 75.3, "step": 835, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.7915951909414182, "grad_norm": 0.84765625, "learning_rate": 1.0347373047765202e-06, "loss": 0.05329442, "memory(GiB)": 75.3, "step": 840, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.7963070670779742, "grad_norm": 0.8359375, "learning_rate": 9.900396286922025e-07, "loss": 0.0537856, "memory(GiB)": 75.3, "step": 845, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.8010189432145303, "grad_norm": 0.8125, "learning_rate": 9.462225150475296e-07, "loss": 0.05233877, "memory(GiB)": 75.3, "step": 850, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.8057308193510863, "grad_norm": 0.88671875, "learning_rate": 9.032955858090319e-07, "loss": 0.0549244, "memory(GiB)": 75.3, "step": 855, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8104426954876424, "grad_norm": 0.9140625, "learning_rate": 8.612682674642647e-07, "loss": 0.04935811, "memory(GiB)": 75.3, "step": 860, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8151545716241985, "grad_norm": 0.921875, "learning_rate": 8.201497889518073e-07, "loss": 0.05281691, "memory(GiB)": 75.3, "step": 865, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8198664477607545, "grad_norm": 0.90625, "learning_rate": 7.799491796346487e-07, "loss": 0.05795277, "memory(GiB)": 75.3, "step": 870, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8245783238973106, "grad_norm": 0.8046875, "learning_rate": 7.406752673173851e-07, "loss": 0.05225162, "memory(GiB)": 75.3, "step": 875, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8292902000338667, "grad_norm": 0.87890625, "learning_rate": 7.023366763077044e-07, "loss": 0.0509973, "memory(GiB)": 75.3, "step": 880, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8340020761704227, "grad_norm": 0.87109375, "learning_rate": 6.649418255225298e-07, "loss": 0.05142277, "memory(GiB)": 75.3, "step": 885, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8387139523069788, "grad_norm": 0.95703125, "learning_rate": 6.284989266392805e-07, "loss": 0.05023923, "memory(GiB)": 75.3, "step": 890, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8434258284435348, "grad_norm": 0.8828125, "learning_rate": 5.930159822926407e-07, "loss": 0.0534648, "memory(GiB)": 75.3, "step": 895, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8481377045800909, "grad_norm": 0.84375, "learning_rate": 5.585007843172286e-07, "loss": 0.05155768, "memory(GiB)": 75.3, "step": 900, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.852849580716647, "grad_norm": 0.9453125, "learning_rate": 5.249609120365579e-07, "loss": 0.05368913, "memory(GiB)": 75.3, "step": 905, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.857561456853203, "grad_norm": 0.859375, "learning_rate": 4.924037305986696e-07, "loss": 0.05452033, "memory(GiB)": 75.3, "step": 910, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8622733329897591, "grad_norm": 0.8515625, "learning_rate": 4.6083638935878025e-07, "loss": 0.05384221, "memory(GiB)": 75.3, "step": 915, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.866985209126315, "grad_norm": 0.828125, "learning_rate": 4.302658203093418e-07, "loss": 0.05272598, "memory(GiB)": 75.3, "step": 920, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8716970852628712, "grad_norm": 0.8671875, "learning_rate": 4.00698736557808e-07, "loss": 0.05447989, "memory(GiB)": 75.3, "step": 925, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8764089613994273, "grad_norm": 0.9453125, "learning_rate": 3.721416308524839e-07, "loss": 0.05123619, "memory(GiB)": 75.3, "step": 930, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8811208375359832, "grad_norm": 0.8515625, "learning_rate": 3.4460077415675473e-07, "loss": 0.05347574, "memory(GiB)": 75.3, "step": 935, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8858327136725394, "grad_norm": 0.7890625, "learning_rate": 3.1808221427202636e-07, "loss": 0.05334803, "memory(GiB)": 75.3, "step": 940, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8905445898090953, "grad_norm": 0.94921875, "learning_rate": 2.925917745096568e-07, "loss": 0.05249671, "memory(GiB)": 75.3, "step": 945, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8952564659456514, "grad_norm": 0.91015625, "learning_rate": 2.681350524122045e-07, "loss": 0.05494893, "memory(GiB)": 75.3, "step": 950, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.8999683420822076, "grad_norm": 0.828125, "learning_rate": 2.447174185242324e-07, "loss": 0.05149726, "memory(GiB)": 75.3, "step": 955, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.9046802182187635, "grad_norm": 1.0859375, "learning_rate": 2.2234401521297576e-07, "loss": 0.05425293, "memory(GiB)": 75.3, "step": 960, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.9093920943553196, "grad_norm": 0.87109375, "learning_rate": 2.01019755539108e-07, "loss": 0.0552171, "memory(GiB)": 75.3, "step": 965, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.9141039704918758, "grad_norm": 0.84375, "learning_rate": 1.8074932217786445e-07, "loss": 0.05237709, "memory(GiB)": 75.3, "step": 970, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.9188158466284317, "grad_norm": 0.86328125, "learning_rate": 1.6153716639075223e-07, "loss": 0.05221198, "memory(GiB)": 75.3, "step": 975, "train_speed(iter/s)": 0.003351 }, { "epoch": 0.9235277227649878, "grad_norm": 0.8125, "learning_rate": 1.433875070480878e-07, "loss": 0.05134506, "memory(GiB)": 75.3, "step": 980, "train_speed(iter/s)": 0.003352 }, { "epoch": 0.9282395989015438, "grad_norm": 0.890625, "learning_rate": 1.2630432970255014e-07, "loss": 0.05436495, "memory(GiB)": 75.3, "step": 985, "train_speed(iter/s)": 0.003352 }, { "epoch": 0.9329514750380999, "grad_norm": 0.921875, "learning_rate": 1.1029138571398645e-07, "loss": 0.05440986, "memory(GiB)": 75.3, "step": 990, "train_speed(iter/s)": 0.003352 }, { "epoch": 0.937663351174656, "grad_norm": 0.91796875, "learning_rate": 9.535219142563168e-08, "loss": 0.05418127, "memory(GiB)": 75.3, "step": 995, "train_speed(iter/s)": 0.003352 }, { "epoch": 0.942375227311212, "grad_norm": 0.8984375, "learning_rate": 8.149002739194222e-08, "loss": 0.05519557, "memory(GiB)": 75.3, "step": 1000, "train_speed(iter/s)": 0.003352 } ], "logging_steps": 5, "max_steps": 1061, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.440049406181114e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }