{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9902912621359223, "eval_steps": 500, "global_step": 462, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006472491909385114, "grad_norm": 0.8681851528027766, "learning_rate": 4.255319148936171e-06, "loss": 1.2864, "step": 1 }, { "epoch": 0.012944983818770227, "grad_norm": 0.8603766114064247, "learning_rate": 8.510638297872341e-06, "loss": 1.3029, "step": 2 }, { "epoch": 0.019417475728155338, "grad_norm": 0.8460211658943029, "learning_rate": 1.2765957446808511e-05, "loss": 1.2676, "step": 3 }, { "epoch": 0.025889967637540454, "grad_norm": 0.823879412927583, "learning_rate": 1.7021276595744682e-05, "loss": 1.2794, "step": 4 }, { "epoch": 0.032362459546925564, "grad_norm": 0.7999755689798974, "learning_rate": 2.1276595744680852e-05, "loss": 1.2632, "step": 5 }, { "epoch": 0.038834951456310676, "grad_norm": 0.779503875119536, "learning_rate": 2.5531914893617022e-05, "loss": 1.2356, "step": 6 }, { "epoch": 0.045307443365695796, "grad_norm": 0.7113356719326077, "learning_rate": 2.9787234042553192e-05, "loss": 1.2075, "step": 7 }, { "epoch": 0.05177993527508091, "grad_norm": 0.5733112055032882, "learning_rate": 3.4042553191489365e-05, "loss": 1.1078, "step": 8 }, { "epoch": 0.05825242718446602, "grad_norm": 0.4686147590427965, "learning_rate": 3.829787234042553e-05, "loss": 1.0108, "step": 9 }, { "epoch": 0.06472491909385113, "grad_norm": 0.4960364471309283, "learning_rate": 4.2553191489361704e-05, "loss": 0.9991, "step": 10 }, { "epoch": 0.07119741100323625, "grad_norm": 0.5462124995641008, "learning_rate": 4.680851063829788e-05, "loss": 0.9427, "step": 11 }, { "epoch": 0.07766990291262135, "grad_norm": 0.546681134551426, "learning_rate": 5.1063829787234044e-05, "loss": 0.8893, "step": 12 }, { "epoch": 0.08414239482200647, "grad_norm": 0.556093704754962, "learning_rate": 5.531914893617022e-05, "loss": 0.8444, "step": 13 }, { "epoch": 0.09061488673139159, "grad_norm": 0.4982225051601507, "learning_rate": 5.9574468085106384e-05, "loss": 0.7642, "step": 14 }, { "epoch": 0.0970873786407767, "grad_norm": 0.48253513519932356, "learning_rate": 6.382978723404256e-05, "loss": 0.7228, "step": 15 }, { "epoch": 0.10355987055016182, "grad_norm": 0.44162352908966596, "learning_rate": 6.808510638297873e-05, "loss": 0.6525, "step": 16 }, { "epoch": 0.11003236245954692, "grad_norm": 0.3918381633741909, "learning_rate": 7.23404255319149e-05, "loss": 0.604, "step": 17 }, { "epoch": 0.11650485436893204, "grad_norm": 0.2648043458799554, "learning_rate": 7.659574468085106e-05, "loss": 0.5693, "step": 18 }, { "epoch": 0.12297734627831715, "grad_norm": 0.243819913231238, "learning_rate": 8.085106382978723e-05, "loss": 0.5205, "step": 19 }, { "epoch": 0.12944983818770225, "grad_norm": 0.23168627433032857, "learning_rate": 8.510638297872341e-05, "loss": 0.5296, "step": 20 }, { "epoch": 0.13592233009708737, "grad_norm": 0.1994751857050266, "learning_rate": 8.936170212765958e-05, "loss": 0.5269, "step": 21 }, { "epoch": 0.1423948220064725, "grad_norm": 0.2530061947493179, "learning_rate": 9.361702127659576e-05, "loss": 0.5091, "step": 22 }, { "epoch": 0.1488673139158576, "grad_norm": 0.24049496280327698, "learning_rate": 9.787234042553192e-05, "loss": 0.5169, "step": 23 }, { "epoch": 0.1553398058252427, "grad_norm": 0.22948907942121893, "learning_rate": 0.00010212765957446809, "loss": 0.4954, "step": 24 }, { "epoch": 0.16181229773462782, "grad_norm": 0.17253285827605366, "learning_rate": 0.00010638297872340425, "loss": 0.4678, "step": 25 }, { "epoch": 0.16828478964401294, "grad_norm": 0.1938226429644451, "learning_rate": 0.00011063829787234043, "loss": 0.4857, "step": 26 }, { "epoch": 0.17475728155339806, "grad_norm": 0.13721502402963923, "learning_rate": 0.00011489361702127661, "loss": 0.466, "step": 27 }, { "epoch": 0.18122977346278318, "grad_norm": 0.12808136935864992, "learning_rate": 0.00011914893617021277, "loss": 0.4652, "step": 28 }, { "epoch": 0.18770226537216828, "grad_norm": 0.12041781390556437, "learning_rate": 0.00012340425531914893, "loss": 0.4527, "step": 29 }, { "epoch": 0.1941747572815534, "grad_norm": 0.11745727258563804, "learning_rate": 0.00012765957446808513, "loss": 0.4392, "step": 30 }, { "epoch": 0.20064724919093851, "grad_norm": 0.11168371739950012, "learning_rate": 0.00013191489361702127, "loss": 0.4578, "step": 31 }, { "epoch": 0.20711974110032363, "grad_norm": 0.11295507271229074, "learning_rate": 0.00013617021276595746, "loss": 0.4384, "step": 32 }, { "epoch": 0.21359223300970873, "grad_norm": 0.11084156792638247, "learning_rate": 0.00014042553191489363, "loss": 0.4292, "step": 33 }, { "epoch": 0.22006472491909385, "grad_norm": 0.10746985136176236, "learning_rate": 0.0001446808510638298, "loss": 0.4202, "step": 34 }, { "epoch": 0.22653721682847897, "grad_norm": 0.10723808557482935, "learning_rate": 0.00014893617021276596, "loss": 0.4345, "step": 35 }, { "epoch": 0.23300970873786409, "grad_norm": 0.10308761677232968, "learning_rate": 0.00015319148936170213, "loss": 0.4318, "step": 36 }, { "epoch": 0.23948220064724918, "grad_norm": 0.1044215437881733, "learning_rate": 0.00015744680851063832, "loss": 0.4289, "step": 37 }, { "epoch": 0.2459546925566343, "grad_norm": 0.10783517564137016, "learning_rate": 0.00016170212765957446, "loss": 0.4226, "step": 38 }, { "epoch": 0.2524271844660194, "grad_norm": 0.11058134845234333, "learning_rate": 0.00016595744680851065, "loss": 0.4392, "step": 39 }, { "epoch": 0.2588996763754045, "grad_norm": 0.10721298987840806, "learning_rate": 0.00017021276595744682, "loss": 0.4097, "step": 40 }, { "epoch": 0.26537216828478966, "grad_norm": 0.10203447619356035, "learning_rate": 0.00017446808510638298, "loss": 0.4371, "step": 41 }, { "epoch": 0.27184466019417475, "grad_norm": 0.09611299924461707, "learning_rate": 0.00017872340425531915, "loss": 0.4264, "step": 42 }, { "epoch": 0.2783171521035599, "grad_norm": 0.09911922773823158, "learning_rate": 0.00018297872340425532, "loss": 0.4177, "step": 43 }, { "epoch": 0.284789644012945, "grad_norm": 0.10195593280531762, "learning_rate": 0.0001872340425531915, "loss": 0.4322, "step": 44 }, { "epoch": 0.2912621359223301, "grad_norm": 0.0990363778451009, "learning_rate": 0.00019148936170212768, "loss": 0.4272, "step": 45 }, { "epoch": 0.2977346278317152, "grad_norm": 0.10312296274780797, "learning_rate": 0.00019574468085106384, "loss": 0.3997, "step": 46 }, { "epoch": 0.3042071197411003, "grad_norm": 0.09688182042517872, "learning_rate": 0.0002, "loss": 0.3955, "step": 47 }, { "epoch": 0.3106796116504854, "grad_norm": 0.09540799755823594, "learning_rate": 0.00019999713469087867, "loss": 0.4078, "step": 48 }, { "epoch": 0.31715210355987056, "grad_norm": 0.10596966214446712, "learning_rate": 0.00019998853892771453, "loss": 0.4102, "step": 49 }, { "epoch": 0.32362459546925565, "grad_norm": 0.10086883515116485, "learning_rate": 0.00019997421320309795, "loss": 0.4048, "step": 50 }, { "epoch": 0.3300970873786408, "grad_norm": 0.10400887250356652, "learning_rate": 0.00019995415833798158, "loss": 0.403, "step": 51 }, { "epoch": 0.3365695792880259, "grad_norm": 0.09067444994385492, "learning_rate": 0.00019992837548163316, "loss": 0.3867, "step": 52 }, { "epoch": 0.343042071197411, "grad_norm": 0.09109660691369531, "learning_rate": 0.00019989686611156972, "loss": 0.3956, "step": 53 }, { "epoch": 0.34951456310679613, "grad_norm": 0.09162094937144005, "learning_rate": 0.000199859632033473, "loss": 0.407, "step": 54 }, { "epoch": 0.3559870550161812, "grad_norm": 0.09041901984453071, "learning_rate": 0.00019981667538108587, "loss": 0.3913, "step": 55 }, { "epoch": 0.36245954692556637, "grad_norm": 0.0948520973493128, "learning_rate": 0.00019976799861609008, "loss": 0.3959, "step": 56 }, { "epoch": 0.36893203883495146, "grad_norm": 0.09239535228342881, "learning_rate": 0.00019971360452796522, "loss": 0.3976, "step": 57 }, { "epoch": 0.37540453074433655, "grad_norm": 0.09430664951378093, "learning_rate": 0.0001996534962338288, "loss": 0.3984, "step": 58 }, { "epoch": 0.3818770226537217, "grad_norm": 0.08478493523992439, "learning_rate": 0.0001995876771782577, "loss": 0.3862, "step": 59 }, { "epoch": 0.3883495145631068, "grad_norm": 0.08736157516174174, "learning_rate": 0.00019951615113309075, "loss": 0.3768, "step": 60 }, { "epoch": 0.3948220064724919, "grad_norm": 0.08833359616935596, "learning_rate": 0.00019943892219721253, "loss": 0.3832, "step": 61 }, { "epoch": 0.40129449838187703, "grad_norm": 0.09634861288945641, "learning_rate": 0.0001993559947963185, "loss": 0.3975, "step": 62 }, { "epoch": 0.4077669902912621, "grad_norm": 0.08474514858686731, "learning_rate": 0.00019926737368266144, "loss": 0.3803, "step": 63 }, { "epoch": 0.41423948220064727, "grad_norm": 0.0878556667338793, "learning_rate": 0.00019917306393477907, "loss": 0.3874, "step": 64 }, { "epoch": 0.42071197411003236, "grad_norm": 0.08906183743795007, "learning_rate": 0.00019907307095720303, "loss": 0.384, "step": 65 }, { "epoch": 0.42718446601941745, "grad_norm": 0.08738087160835142, "learning_rate": 0.00019896740048014908, "loss": 0.3883, "step": 66 }, { "epoch": 0.4336569579288026, "grad_norm": 0.09070551920354154, "learning_rate": 0.00019885605855918885, "loss": 0.38, "step": 67 }, { "epoch": 0.4401294498381877, "grad_norm": 0.08922144963205411, "learning_rate": 0.00019873905157490285, "loss": 0.3854, "step": 68 }, { "epoch": 0.44660194174757284, "grad_norm": 0.09304718374139792, "learning_rate": 0.0001986163862325146, "loss": 0.3804, "step": 69 }, { "epoch": 0.45307443365695793, "grad_norm": 0.09290101551147209, "learning_rate": 0.0001984880695615066, "loss": 0.3876, "step": 70 }, { "epoch": 0.459546925566343, "grad_norm": 0.08862476310706492, "learning_rate": 0.0001983541089152174, "loss": 0.3701, "step": 71 }, { "epoch": 0.46601941747572817, "grad_norm": 0.0864643410147614, "learning_rate": 0.00019821451197042026, "loss": 0.3625, "step": 72 }, { "epoch": 0.47249190938511326, "grad_norm": 0.0889683558231721, "learning_rate": 0.0001980692867268832, "loss": 0.3635, "step": 73 }, { "epoch": 0.47896440129449835, "grad_norm": 0.090198299336284, "learning_rate": 0.0001979184415069104, "loss": 0.3804, "step": 74 }, { "epoch": 0.4854368932038835, "grad_norm": 0.09214744360572043, "learning_rate": 0.00019776198495486565, "loss": 0.367, "step": 75 }, { "epoch": 0.4919093851132686, "grad_norm": 0.08545016444696638, "learning_rate": 0.00019759992603667667, "loss": 0.3671, "step": 76 }, { "epoch": 0.49838187702265374, "grad_norm": 0.08836432043627582, "learning_rate": 0.00019743227403932134, "loss": 0.3759, "step": 77 }, { "epoch": 0.5048543689320388, "grad_norm": 0.09220525179921107, "learning_rate": 0.00019725903857029564, "loss": 0.3759, "step": 78 }, { "epoch": 0.511326860841424, "grad_norm": 0.09021919927906268, "learning_rate": 0.00019708022955706292, "loss": 0.3748, "step": 79 }, { "epoch": 0.517799352750809, "grad_norm": 0.0857417469759507, "learning_rate": 0.00019689585724648516, "loss": 0.3778, "step": 80 }, { "epoch": 0.5242718446601942, "grad_norm": 0.08899327468436542, "learning_rate": 0.00019670593220423558, "loss": 0.3711, "step": 81 }, { "epoch": 0.5307443365695793, "grad_norm": 0.09420631205003693, "learning_rate": 0.00019651046531419332, "loss": 0.3678, "step": 82 }, { "epoch": 0.5372168284789643, "grad_norm": 0.09175386946113716, "learning_rate": 0.00019630946777781966, "loss": 0.3669, "step": 83 }, { "epoch": 0.5436893203883495, "grad_norm": 0.08372986124983309, "learning_rate": 0.0001961029511135161, "loss": 0.3593, "step": 84 }, { "epoch": 0.5501618122977346, "grad_norm": 0.08923405383972152, "learning_rate": 0.00019589092715596417, "loss": 0.3729, "step": 85 }, { "epoch": 0.5566343042071198, "grad_norm": 0.08853783615052746, "learning_rate": 0.00019567340805544758, "loss": 0.3512, "step": 86 }, { "epoch": 0.5631067961165048, "grad_norm": 0.09158487904428567, "learning_rate": 0.0001954504062771555, "loss": 0.3673, "step": 87 }, { "epoch": 0.56957928802589, "grad_norm": 0.09709095338695828, "learning_rate": 0.00019522193460046864, "loss": 0.3758, "step": 88 }, { "epoch": 0.5760517799352751, "grad_norm": 0.09252319787935548, "learning_rate": 0.00019498800611822645, "loss": 0.3588, "step": 89 }, { "epoch": 0.5825242718446602, "grad_norm": 0.10229380875651542, "learning_rate": 0.00019474863423597728, "loss": 0.3859, "step": 90 }, { "epoch": 0.5889967637540453, "grad_norm": 0.09361866653135192, "learning_rate": 0.00019450383267120982, "loss": 0.3739, "step": 91 }, { "epoch": 0.5954692556634305, "grad_norm": 0.0919566522897771, "learning_rate": 0.00019425361545256727, "loss": 0.3648, "step": 92 }, { "epoch": 0.6019417475728155, "grad_norm": 0.10007063382515365, "learning_rate": 0.0001939979969190432, "loss": 0.3628, "step": 93 }, { "epoch": 0.6084142394822006, "grad_norm": 0.08642815308708984, "learning_rate": 0.00019373699171915988, "loss": 0.3607, "step": 94 }, { "epoch": 0.6148867313915858, "grad_norm": 0.08898100138256641, "learning_rate": 0.00019347061481012894, "loss": 0.3649, "step": 95 }, { "epoch": 0.6213592233009708, "grad_norm": 0.09929344608921754, "learning_rate": 0.00019319888145699415, "loss": 0.3707, "step": 96 }, { "epoch": 0.627831715210356, "grad_norm": 0.08479565009317201, "learning_rate": 0.00019292180723175654, "loss": 0.3596, "step": 97 }, { "epoch": 0.6343042071197411, "grad_norm": 0.08536934965094412, "learning_rate": 0.00019263940801248226, "loss": 0.3693, "step": 98 }, { "epoch": 0.6407766990291263, "grad_norm": 0.09264353563725661, "learning_rate": 0.0001923516999823925, "loss": 0.3679, "step": 99 }, { "epoch": 0.6472491909385113, "grad_norm": 0.09075116775864867, "learning_rate": 0.00019205869962893605, "loss": 0.3594, "step": 100 }, { "epoch": 0.6537216828478964, "grad_norm": 0.08696915654679535, "learning_rate": 0.0001917604237428447, "loss": 0.3699, "step": 101 }, { "epoch": 0.6601941747572816, "grad_norm": 0.08387717136900205, "learning_rate": 0.00019145688941717075, "loss": 0.3569, "step": 102 }, { "epoch": 0.6666666666666666, "grad_norm": 0.09727169698788522, "learning_rate": 0.00019114811404630762, "loss": 0.3725, "step": 103 }, { "epoch": 0.6731391585760518, "grad_norm": 0.08785848039003973, "learning_rate": 0.0001908341153249931, "loss": 0.3625, "step": 104 }, { "epoch": 0.6796116504854369, "grad_norm": 0.09111196159253629, "learning_rate": 0.00019051491124729512, "loss": 0.3658, "step": 105 }, { "epoch": 0.686084142394822, "grad_norm": 0.09173527522438048, "learning_rate": 0.00019019052010558088, "loss": 0.3605, "step": 106 }, { "epoch": 0.6925566343042071, "grad_norm": 0.08349190333555147, "learning_rate": 0.00018986096048946824, "loss": 0.3714, "step": 107 }, { "epoch": 0.6990291262135923, "grad_norm": 0.08795475348576975, "learning_rate": 0.0001895262512847607, "loss": 0.3646, "step": 108 }, { "epoch": 0.7055016181229773, "grad_norm": 0.0834930063587362, "learning_rate": 0.00018918641167236505, "loss": 0.3518, "step": 109 }, { "epoch": 0.7119741100323624, "grad_norm": 0.08299225873111288, "learning_rate": 0.00018884146112719207, "loss": 0.3498, "step": 110 }, { "epoch": 0.7184466019417476, "grad_norm": 0.08676290151306544, "learning_rate": 0.00018849141941704067, "loss": 0.3612, "step": 111 }, { "epoch": 0.7249190938511327, "grad_norm": 0.09107846226354271, "learning_rate": 0.00018813630660146488, "loss": 0.3508, "step": 112 }, { "epoch": 0.7313915857605178, "grad_norm": 0.09040771142681107, "learning_rate": 0.00018777614303062457, "loss": 0.358, "step": 113 }, { "epoch": 0.7378640776699029, "grad_norm": 0.08473984809675678, "learning_rate": 0.000187410949344119, "loss": 0.3567, "step": 114 }, { "epoch": 0.7443365695792881, "grad_norm": 0.09166724031633931, "learning_rate": 0.00018704074646980415, "loss": 0.3474, "step": 115 }, { "epoch": 0.7508090614886731, "grad_norm": 0.09150633413591353, "learning_rate": 0.00018666555562259356, "loss": 0.3718, "step": 116 }, { "epoch": 0.7572815533980582, "grad_norm": 0.09139304143928584, "learning_rate": 0.00018628539830324229, "loss": 0.3583, "step": 117 }, { "epoch": 0.7637540453074434, "grad_norm": 0.08557332691225407, "learning_rate": 0.00018590029629711506, "loss": 0.3481, "step": 118 }, { "epoch": 0.7702265372168284, "grad_norm": 0.08670915390405, "learning_rate": 0.00018551027167293768, "loss": 0.3486, "step": 119 }, { "epoch": 0.7766990291262136, "grad_norm": 0.085015484261482, "learning_rate": 0.00018511534678153244, "loss": 0.367, "step": 120 }, { "epoch": 0.7831715210355987, "grad_norm": 0.09010500519447748, "learning_rate": 0.0001847155442545372, "loss": 0.3673, "step": 121 }, { "epoch": 0.7896440129449838, "grad_norm": 0.08749250339043911, "learning_rate": 0.00018431088700310844, "loss": 0.3581, "step": 122 }, { "epoch": 0.7961165048543689, "grad_norm": 0.09294869349273092, "learning_rate": 0.00018390139821660855, "loss": 0.3502, "step": 123 }, { "epoch": 0.8025889967637541, "grad_norm": 0.08999457767358941, "learning_rate": 0.00018348710136127655, "loss": 0.3604, "step": 124 }, { "epoch": 0.8090614886731392, "grad_norm": 0.08450140095669934, "learning_rate": 0.0001830680201788836, "loss": 0.3585, "step": 125 }, { "epoch": 0.8155339805825242, "grad_norm": 0.0885173201152873, "learning_rate": 0.00018264417868537244, "loss": 0.36, "step": 126 }, { "epoch": 0.8220064724919094, "grad_norm": 0.08249252827967349, "learning_rate": 0.00018221560116948103, "loss": 0.3456, "step": 127 }, { "epoch": 0.8284789644012945, "grad_norm": 0.09259010876584428, "learning_rate": 0.0001817823121913506, "loss": 0.3523, "step": 128 }, { "epoch": 0.8349514563106796, "grad_norm": 0.0829493165024249, "learning_rate": 0.00018134433658111845, "loss": 0.3502, "step": 129 }, { "epoch": 0.8414239482200647, "grad_norm": 0.0870233230162085, "learning_rate": 0.00018090169943749476, "loss": 0.3503, "step": 130 }, { "epoch": 0.8478964401294499, "grad_norm": 0.09013336960665569, "learning_rate": 0.00018045442612632444, "loss": 0.355, "step": 131 }, { "epoch": 0.8543689320388349, "grad_norm": 0.0887494126249745, "learning_rate": 0.00018000254227913348, "loss": 0.3492, "step": 132 }, { "epoch": 0.86084142394822, "grad_norm": 0.08245510607718151, "learning_rate": 0.00017954607379166, "loss": 0.3509, "step": 133 }, { "epoch": 0.8673139158576052, "grad_norm": 0.08652334474586301, "learning_rate": 0.00017908504682237047, "loss": 0.3464, "step": 134 }, { "epoch": 0.8737864077669902, "grad_norm": 0.0832923145834622, "learning_rate": 0.00017861948779096046, "loss": 0.3392, "step": 135 }, { "epoch": 0.8802588996763754, "grad_norm": 0.0812465660556935, "learning_rate": 0.0001781494233768408, "loss": 0.333, "step": 136 }, { "epoch": 0.8867313915857605, "grad_norm": 0.088984541060325, "learning_rate": 0.00017767488051760857, "loss": 0.3615, "step": 137 }, { "epoch": 0.8932038834951457, "grad_norm": 0.0867087441619348, "learning_rate": 0.00017719588640750336, "loss": 0.3553, "step": 138 }, { "epoch": 0.8996763754045307, "grad_norm": 0.08382301700407316, "learning_rate": 0.00017671246849584903, "loss": 0.3592, "step": 139 }, { "epoch": 0.9061488673139159, "grad_norm": 0.0865944397225724, "learning_rate": 0.0001762246544854807, "loss": 0.3605, "step": 140 }, { "epoch": 0.912621359223301, "grad_norm": 0.08628029323884819, "learning_rate": 0.00017573247233115694, "loss": 0.371, "step": 141 }, { "epoch": 0.919093851132686, "grad_norm": 0.08863123995414322, "learning_rate": 0.00017523595023795813, "loss": 0.3504, "step": 142 }, { "epoch": 0.9255663430420712, "grad_norm": 0.08627162896846262, "learning_rate": 0.00017473511665966993, "loss": 0.3545, "step": 143 }, { "epoch": 0.9320388349514563, "grad_norm": 0.08839402497347855, "learning_rate": 0.00017423000029715267, "loss": 0.3641, "step": 144 }, { "epoch": 0.9385113268608414, "grad_norm": 0.08651737654873981, "learning_rate": 0.00017372063009669686, "loss": 0.3431, "step": 145 }, { "epoch": 0.9449838187702265, "grad_norm": 0.07938521602972776, "learning_rate": 0.00017320703524836405, "loss": 0.3541, "step": 146 }, { "epoch": 0.9514563106796117, "grad_norm": 0.08679861689760528, "learning_rate": 0.00017268924518431438, "loss": 0.3492, "step": 147 }, { "epoch": 0.9579288025889967, "grad_norm": 0.08562044946225537, "learning_rate": 0.00017216728957711967, "loss": 0.3497, "step": 148 }, { "epoch": 0.9644012944983819, "grad_norm": 0.08302055814489633, "learning_rate": 0.0001716411983380632, "loss": 0.3472, "step": 149 }, { "epoch": 0.970873786407767, "grad_norm": 0.08275430206926206, "learning_rate": 0.00017111100161542545, "loss": 0.3383, "step": 150 }, { "epoch": 0.9773462783171522, "grad_norm": 0.08114036438803068, "learning_rate": 0.00017057672979275656, "loss": 0.36, "step": 151 }, { "epoch": 0.9838187702265372, "grad_norm": 0.0875761383748598, "learning_rate": 0.0001700384134871351, "loss": 0.3727, "step": 152 }, { "epoch": 0.9902912621359223, "grad_norm": 0.0826665996105745, "learning_rate": 0.0001694960835474134, "loss": 0.3443, "step": 153 }, { "epoch": 0.9967637540453075, "grad_norm": 0.08226814015705675, "learning_rate": 0.00016894977105244997, "loss": 0.3653, "step": 154 }, { "epoch": 0.9967637540453075, "eval_loss": 0.3545331358909607, "eval_runtime": 38.077, "eval_samples_per_second": 27.287, "eval_steps_per_second": 0.867, "step": 154 }, { "epoch": 1.0032362459546926, "grad_norm": 0.08335200234700724, "learning_rate": 0.0001683995073093283, "loss": 0.3492, "step": 155 }, { "epoch": 1.0097087378640777, "grad_norm": 0.0865080297629732, "learning_rate": 0.00016784532385156285, "loss": 0.3377, "step": 156 }, { "epoch": 1.0161812297734627, "grad_norm": 0.08441339355225498, "learning_rate": 0.0001672872524372919, "loss": 0.34, "step": 157 }, { "epoch": 1.022653721682848, "grad_norm": 0.0801171782758195, "learning_rate": 0.00016672532504745778, "loss": 0.3354, "step": 158 }, { "epoch": 1.029126213592233, "grad_norm": 0.08282538959029859, "learning_rate": 0.00016615957388397399, "loss": 0.3497, "step": 159 }, { "epoch": 1.035598705501618, "grad_norm": 0.0883355169428613, "learning_rate": 0.00016559003136787988, "loss": 0.338, "step": 160 }, { "epoch": 1.0420711974110033, "grad_norm": 0.08616766891600182, "learning_rate": 0.00016501673013748284, "loss": 0.3491, "step": 161 }, { "epoch": 1.0485436893203883, "grad_norm": 0.08188626612719885, "learning_rate": 0.0001644397030464877, "loss": 0.3359, "step": 162 }, { "epoch": 1.0550161812297734, "grad_norm": 0.08630887969447615, "learning_rate": 0.00016385898316211426, "loss": 0.3268, "step": 163 }, { "epoch": 1.0614886731391586, "grad_norm": 0.09109351353175275, "learning_rate": 0.0001632746037632021, "loss": 0.3356, "step": 164 }, { "epoch": 1.0679611650485437, "grad_norm": 0.08843718177405194, "learning_rate": 0.00016268659833830367, "loss": 0.3205, "step": 165 }, { "epoch": 1.074433656957929, "grad_norm": 0.09214475868602304, "learning_rate": 0.00016209500058376515, "loss": 0.3433, "step": 166 }, { "epoch": 1.080906148867314, "grad_norm": 0.08469945389831732, "learning_rate": 0.00016149984440179537, "loss": 0.3252, "step": 167 }, { "epoch": 1.087378640776699, "grad_norm": 0.09167891300803974, "learning_rate": 0.00016090116389852306, "loss": 0.3372, "step": 168 }, { "epoch": 1.0938511326860842, "grad_norm": 0.08439372223738191, "learning_rate": 0.00016029899338204233, "loss": 0.3339, "step": 169 }, { "epoch": 1.1003236245954693, "grad_norm": 0.09671189992628862, "learning_rate": 0.0001596933673604467, "loss": 0.331, "step": 170 }, { "epoch": 1.1067961165048543, "grad_norm": 0.08926808998386844, "learning_rate": 0.00015908432053985143, "loss": 0.3438, "step": 171 }, { "epoch": 1.1132686084142396, "grad_norm": 0.08651511861187883, "learning_rate": 0.0001584718878224047, "loss": 0.3199, "step": 172 }, { "epoch": 1.1197411003236246, "grad_norm": 0.09087672552517187, "learning_rate": 0.00015785610430428762, "loss": 0.3312, "step": 173 }, { "epoch": 1.1262135922330097, "grad_norm": 0.08473819565220535, "learning_rate": 0.00015723700527370268, "loss": 0.3265, "step": 174 }, { "epoch": 1.132686084142395, "grad_norm": 0.08959082008669542, "learning_rate": 0.00015661462620885199, "loss": 0.3361, "step": 175 }, { "epoch": 1.13915857605178, "grad_norm": 0.08820985748174735, "learning_rate": 0.0001559890027759037, "loss": 0.3352, "step": 176 }, { "epoch": 1.145631067961165, "grad_norm": 0.08959538258978733, "learning_rate": 0.00015536017082694846, "loss": 0.353, "step": 177 }, { "epoch": 1.1521035598705502, "grad_norm": 0.08556029404315754, "learning_rate": 0.0001547281663979446, "loss": 0.338, "step": 178 }, { "epoch": 1.1585760517799353, "grad_norm": 0.08651722009131164, "learning_rate": 0.00015409302570665325, "loss": 0.3428, "step": 179 }, { "epoch": 1.1650485436893203, "grad_norm": 0.09127001503693886, "learning_rate": 0.00015345478515056267, "loss": 0.3196, "step": 180 }, { "epoch": 1.1715210355987056, "grad_norm": 0.08856796253981307, "learning_rate": 0.00015281348130480272, "loss": 0.3309, "step": 181 }, { "epoch": 1.1779935275080906, "grad_norm": 0.08742855227811815, "learning_rate": 0.00015216915092004847, "loss": 0.3273, "step": 182 }, { "epoch": 1.1844660194174756, "grad_norm": 0.09076657129408205, "learning_rate": 0.0001515218309204145, "loss": 0.3324, "step": 183 }, { "epoch": 1.190938511326861, "grad_norm": 0.09296734178135176, "learning_rate": 0.00015087155840133888, "loss": 0.3443, "step": 184 }, { "epoch": 1.197411003236246, "grad_norm": 0.08431288630485151, "learning_rate": 0.00015021837062745714, "loss": 0.3383, "step": 185 }, { "epoch": 1.203883495145631, "grad_norm": 0.08374691953765048, "learning_rate": 0.00014956230503046703, "loss": 0.3368, "step": 186 }, { "epoch": 1.2103559870550162, "grad_norm": 0.08899017726880568, "learning_rate": 0.00014890339920698334, "loss": 0.3368, "step": 187 }, { "epoch": 1.2168284789644013, "grad_norm": 0.08562973948918146, "learning_rate": 0.00014824169091638337, "loss": 0.3397, "step": 188 }, { "epoch": 1.2233009708737863, "grad_norm": 0.08827756751106554, "learning_rate": 0.00014757721807864317, "loss": 0.3405, "step": 189 }, { "epoch": 1.2297734627831716, "grad_norm": 0.08736187981084419, "learning_rate": 0.0001469100187721644, "loss": 0.3262, "step": 190 }, { "epoch": 1.2362459546925566, "grad_norm": 0.09050203121679472, "learning_rate": 0.0001462401312315922, "loss": 0.3345, "step": 191 }, { "epoch": 1.2427184466019416, "grad_norm": 0.08534856926861663, "learning_rate": 0.00014556759384562416, "loss": 0.3277, "step": 192 }, { "epoch": 1.249190938511327, "grad_norm": 0.08611809442550349, "learning_rate": 0.00014489244515481046, "loss": 0.3253, "step": 193 }, { "epoch": 1.255663430420712, "grad_norm": 0.08688819258876039, "learning_rate": 0.0001442147238493451, "loss": 0.3349, "step": 194 }, { "epoch": 1.262135922330097, "grad_norm": 0.08416654255653569, "learning_rate": 0.00014353446876684892, "loss": 0.3308, "step": 195 }, { "epoch": 1.2686084142394822, "grad_norm": 0.09183514403063862, "learning_rate": 0.0001428517188901437, "loss": 0.3383, "step": 196 }, { "epoch": 1.2750809061488673, "grad_norm": 0.08749863979244098, "learning_rate": 0.0001421665133450184, "loss": 0.3294, "step": 197 }, { "epoch": 1.2815533980582523, "grad_norm": 0.08786155414487208, "learning_rate": 0.00014147889139798708, "loss": 0.332, "step": 198 }, { "epoch": 1.2880258899676376, "grad_norm": 0.08754391459720788, "learning_rate": 0.00014078889245403844, "loss": 0.3268, "step": 199 }, { "epoch": 1.2944983818770226, "grad_norm": 0.09026975504709552, "learning_rate": 0.0001400965560543778, "loss": 0.3344, "step": 200 }, { "epoch": 1.3009708737864076, "grad_norm": 0.08953266507881985, "learning_rate": 0.0001394019218741612, "loss": 0.3355, "step": 201 }, { "epoch": 1.307443365695793, "grad_norm": 0.08868590943641125, "learning_rate": 0.00013870502972022173, "loss": 0.3282, "step": 202 }, { "epoch": 1.313915857605178, "grad_norm": 0.09182148747745249, "learning_rate": 0.00013800591952878825, "loss": 0.3394, "step": 203 }, { "epoch": 1.3203883495145632, "grad_norm": 0.08835010573043636, "learning_rate": 0.00013730463136319692, "loss": 0.3316, "step": 204 }, { "epoch": 1.3268608414239482, "grad_norm": 0.08573944970518392, "learning_rate": 0.00013660120541159537, "loss": 0.337, "step": 205 }, { "epoch": 1.3333333333333333, "grad_norm": 0.08411384694221385, "learning_rate": 0.00013589568198463944, "loss": 0.3356, "step": 206 }, { "epoch": 1.3398058252427185, "grad_norm": 0.08762468166968049, "learning_rate": 0.0001351881015131833, "loss": 0.3388, "step": 207 }, { "epoch": 1.3462783171521036, "grad_norm": 0.08837885567018512, "learning_rate": 0.00013447850454596265, "loss": 0.3354, "step": 208 }, { "epoch": 1.3527508090614886, "grad_norm": 0.08823334999060412, "learning_rate": 0.00013376693174727065, "loss": 0.3341, "step": 209 }, { "epoch": 1.3592233009708738, "grad_norm": 0.08665353483827595, "learning_rate": 0.00013305342389462792, "loss": 0.3282, "step": 210 }, { "epoch": 1.3656957928802589, "grad_norm": 0.08780215390647159, "learning_rate": 0.00013233802187644566, "loss": 0.3245, "step": 211 }, { "epoch": 1.3721682847896441, "grad_norm": 0.0880005148626301, "learning_rate": 0.0001316207666896824, "loss": 0.3264, "step": 212 }, { "epoch": 1.3786407766990292, "grad_norm": 0.09350061568656749, "learning_rate": 0.00013090169943749476, "loss": 0.33, "step": 213 }, { "epoch": 1.3851132686084142, "grad_norm": 0.0869291546207145, "learning_rate": 0.00013018086132688184, "loss": 0.3219, "step": 214 }, { "epoch": 1.3915857605177995, "grad_norm": 0.0926767409167509, "learning_rate": 0.0001294582936663239, "loss": 0.3294, "step": 215 }, { "epoch": 1.3980582524271845, "grad_norm": 0.08850918948495869, "learning_rate": 0.00012873403786341513, "loss": 0.3362, "step": 216 }, { "epoch": 1.4045307443365695, "grad_norm": 0.08797553458327365, "learning_rate": 0.00012800813542249072, "loss": 0.3293, "step": 217 }, { "epoch": 1.4110032362459548, "grad_norm": 0.08816787591032364, "learning_rate": 0.00012728062794224832, "loss": 0.3297, "step": 218 }, { "epoch": 1.4174757281553398, "grad_norm": 0.08844464048062073, "learning_rate": 0.0001265515571133643, "loss": 0.3405, "step": 219 }, { "epoch": 1.4239482200647249, "grad_norm": 0.08777743550690727, "learning_rate": 0.00012582096471610467, "loss": 0.33, "step": 220 }, { "epoch": 1.4304207119741101, "grad_norm": 0.0841701096943823, "learning_rate": 0.00012508889261793059, "loss": 0.3243, "step": 221 }, { "epoch": 1.4368932038834952, "grad_norm": 0.08211677315785172, "learning_rate": 0.0001243553827710992, "loss": 0.3162, "step": 222 }, { "epoch": 1.4433656957928802, "grad_norm": 0.09229614871738043, "learning_rate": 0.00012362047721025968, "loss": 0.3214, "step": 223 }, { "epoch": 1.4498381877022655, "grad_norm": 0.08827293639891444, "learning_rate": 0.00012288421805004414, "loss": 0.3388, "step": 224 }, { "epoch": 1.4563106796116505, "grad_norm": 0.08650939834799051, "learning_rate": 0.0001221466474826543, "loss": 0.3336, "step": 225 }, { "epoch": 1.4627831715210355, "grad_norm": 0.08745414194563839, "learning_rate": 0.00012140780777544367, "loss": 0.3249, "step": 226 }, { "epoch": 1.4692556634304208, "grad_norm": 0.08966472430098649, "learning_rate": 0.00012066774126849529, "loss": 0.3358, "step": 227 }, { "epoch": 1.4757281553398058, "grad_norm": 0.08547479634425736, "learning_rate": 0.00011992649037219545, "loss": 0.3415, "step": 228 }, { "epoch": 1.4822006472491909, "grad_norm": 0.08630753448523301, "learning_rate": 0.0001191840975648032, "loss": 0.3369, "step": 229 }, { "epoch": 1.4886731391585761, "grad_norm": 0.08171192229188709, "learning_rate": 0.00011844060539001618, "loss": 0.3207, "step": 230 }, { "epoch": 1.4951456310679612, "grad_norm": 0.08413278558608812, "learning_rate": 0.00011769605645453265, "loss": 0.3219, "step": 231 }, { "epoch": 1.5016181229773462, "grad_norm": 0.08498283806676993, "learning_rate": 0.00011695049342560968, "loss": 0.3339, "step": 232 }, { "epoch": 1.5080906148867315, "grad_norm": 0.08365977359578973, "learning_rate": 0.00011620395902861822, "loss": 0.3297, "step": 233 }, { "epoch": 1.5145631067961165, "grad_norm": 0.0858081054599164, "learning_rate": 0.00011545649604459466, "loss": 0.3305, "step": 234 }, { "epoch": 1.5210355987055015, "grad_norm": 0.08916196136213667, "learning_rate": 0.00011470814730778905, "loss": 0.3352, "step": 235 }, { "epoch": 1.5275080906148868, "grad_norm": 0.08650692110960809, "learning_rate": 0.00011395895570321064, "loss": 0.3347, "step": 236 }, { "epoch": 1.5339805825242718, "grad_norm": 0.08839636357345756, "learning_rate": 0.00011320896416417026, "loss": 0.3284, "step": 237 }, { "epoch": 1.5404530744336569, "grad_norm": 0.08477612347310413, "learning_rate": 0.00011245821566981976, "loss": 0.3331, "step": 238 }, { "epoch": 1.5469255663430421, "grad_norm": 0.08158083954977735, "learning_rate": 0.00011170675324268942, "loss": 0.3284, "step": 239 }, { "epoch": 1.5533980582524272, "grad_norm": 0.08210674030802527, "learning_rate": 0.00011095461994622209, "loss": 0.3152, "step": 240 }, { "epoch": 1.5598705501618122, "grad_norm": 0.08164700760175722, "learning_rate": 0.00011020185888230571, "loss": 0.3212, "step": 241 }, { "epoch": 1.5663430420711975, "grad_norm": 0.08252073365071766, "learning_rate": 0.00010944851318880314, "loss": 0.3349, "step": 242 }, { "epoch": 1.5728155339805825, "grad_norm": 0.0847504452125806, "learning_rate": 0.00010869462603708011, "loss": 0.3355, "step": 243 }, { "epoch": 1.5792880258899675, "grad_norm": 0.08651133299846145, "learning_rate": 0.00010794024062953123, "loss": 0.3295, "step": 244 }, { "epoch": 1.5857605177993528, "grad_norm": 0.08848888104315868, "learning_rate": 0.00010718540019710432, "loss": 0.334, "step": 245 }, { "epoch": 1.5922330097087378, "grad_norm": 0.0838746079221116, "learning_rate": 0.00010643014799682296, "loss": 0.3236, "step": 246 }, { "epoch": 1.5987055016181229, "grad_norm": 0.0834887968620315, "learning_rate": 0.00010567452730930743, "loss": 0.3231, "step": 247 }, { "epoch": 1.6051779935275081, "grad_norm": 0.08903656754039865, "learning_rate": 0.00010491858143629469, "loss": 0.3303, "step": 248 }, { "epoch": 1.6116504854368932, "grad_norm": 0.08833888604572854, "learning_rate": 0.00010416235369815693, "loss": 0.3417, "step": 249 }, { "epoch": 1.6181229773462782, "grad_norm": 0.08946556956131413, "learning_rate": 0.00010340588743141879, "loss": 0.3341, "step": 250 }, { "epoch": 1.6245954692556634, "grad_norm": 0.09195837741425467, "learning_rate": 0.00010264922598627418, "loss": 0.3292, "step": 251 }, { "epoch": 1.6310679611650487, "grad_norm": 0.08669211913864286, "learning_rate": 0.0001018924127241019, "loss": 0.329, "step": 252 }, { "epoch": 1.6375404530744335, "grad_norm": 0.08493981781947878, "learning_rate": 0.00010113549101498086, "loss": 0.317, "step": 253 }, { "epoch": 1.6440129449838188, "grad_norm": 0.08859940503358235, "learning_rate": 0.00010037850423520454, "loss": 0.3239, "step": 254 }, { "epoch": 1.650485436893204, "grad_norm": 0.08691997093628756, "learning_rate": 9.962149576479545e-05, "loss": 0.3276, "step": 255 }, { "epoch": 1.6569579288025889, "grad_norm": 0.08182220008549855, "learning_rate": 9.886450898501917e-05, "loss": 0.322, "step": 256 }, { "epoch": 1.6634304207119741, "grad_norm": 0.08983867569693547, "learning_rate": 9.810758727589813e-05, "loss": 0.3289, "step": 257 }, { "epoch": 1.6699029126213594, "grad_norm": 0.08440426328661797, "learning_rate": 9.735077401372583e-05, "loss": 0.335, "step": 258 }, { "epoch": 1.6763754045307442, "grad_norm": 0.09081741368893546, "learning_rate": 9.659411256858122e-05, "loss": 0.3285, "step": 259 }, { "epoch": 1.6828478964401294, "grad_norm": 0.08638692257401422, "learning_rate": 9.583764630184311e-05, "loss": 0.3283, "step": 260 }, { "epoch": 1.6893203883495147, "grad_norm": 0.08661785904833329, "learning_rate": 9.508141856370532e-05, "loss": 0.3315, "step": 261 }, { "epoch": 1.6957928802588995, "grad_norm": 0.08867942067172753, "learning_rate": 9.432547269069261e-05, "loss": 0.3298, "step": 262 }, { "epoch": 1.7022653721682848, "grad_norm": 0.08121634636493791, "learning_rate": 9.356985200317709e-05, "loss": 0.3264, "step": 263 }, { "epoch": 1.70873786407767, "grad_norm": 0.0857656466474692, "learning_rate": 9.281459980289567e-05, "loss": 0.3265, "step": 264 }, { "epoch": 1.715210355987055, "grad_norm": 0.08325063625580009, "learning_rate": 9.205975937046879e-05, "loss": 0.3248, "step": 265 }, { "epoch": 1.72168284789644, "grad_norm": 0.09125541887572731, "learning_rate": 9.130537396291994e-05, "loss": 0.3336, "step": 266 }, { "epoch": 1.7281553398058254, "grad_norm": 0.0858478153556312, "learning_rate": 9.055148681119688e-05, "loss": 0.3335, "step": 267 }, { "epoch": 1.7346278317152104, "grad_norm": 0.08412291629199992, "learning_rate": 8.979814111769431e-05, "loss": 0.3247, "step": 268 }, { "epoch": 1.7411003236245954, "grad_norm": 0.08342610941802514, "learning_rate": 8.904538005377794e-05, "loss": 0.3271, "step": 269 }, { "epoch": 1.7475728155339807, "grad_norm": 0.08566344893624273, "learning_rate": 8.829324675731059e-05, "loss": 0.3342, "step": 270 }, { "epoch": 1.7540453074433657, "grad_norm": 0.08349328064040033, "learning_rate": 8.754178433018025e-05, "loss": 0.3169, "step": 271 }, { "epoch": 1.7605177993527508, "grad_norm": 0.08290650287411376, "learning_rate": 8.679103583582979e-05, "loss": 0.32, "step": 272 }, { "epoch": 1.766990291262136, "grad_norm": 0.08355664601674428, "learning_rate": 8.604104429678935e-05, "loss": 0.3284, "step": 273 }, { "epoch": 1.773462783171521, "grad_norm": 0.08088172666023426, "learning_rate": 8.529185269221097e-05, "loss": 0.3189, "step": 274 }, { "epoch": 1.779935275080906, "grad_norm": 0.08494057369308265, "learning_rate": 8.45435039554054e-05, "loss": 0.3175, "step": 275 }, { "epoch": 1.7864077669902914, "grad_norm": 0.08435898747241219, "learning_rate": 8.379604097138179e-05, "loss": 0.3359, "step": 276 }, { "epoch": 1.7928802588996764, "grad_norm": 0.08369793345019513, "learning_rate": 8.304950657439033e-05, "loss": 0.3277, "step": 277 }, { "epoch": 1.7993527508090614, "grad_norm": 0.08234674894169963, "learning_rate": 8.230394354546737e-05, "loss": 0.3106, "step": 278 }, { "epoch": 1.8058252427184467, "grad_norm": 0.08480659421208028, "learning_rate": 8.15593946099838e-05, "loss": 0.3219, "step": 279 }, { "epoch": 1.8122977346278317, "grad_norm": 0.087986920120639, "learning_rate": 8.08159024351968e-05, "loss": 0.3442, "step": 280 }, { "epoch": 1.8187702265372168, "grad_norm": 0.0839196545152189, "learning_rate": 8.007350962780456e-05, "loss": 0.3216, "step": 281 }, { "epoch": 1.825242718446602, "grad_norm": 0.08396185408224062, "learning_rate": 7.93322587315047e-05, "loss": 0.3249, "step": 282 }, { "epoch": 1.831715210355987, "grad_norm": 0.08739956374343912, "learning_rate": 7.859219222455634e-05, "loss": 0.3366, "step": 283 }, { "epoch": 1.838187702265372, "grad_norm": 0.08539428449034953, "learning_rate": 7.785335251734573e-05, "loss": 0.3399, "step": 284 }, { "epoch": 1.8446601941747574, "grad_norm": 0.08443259129123379, "learning_rate": 7.711578194995589e-05, "loss": 0.326, "step": 285 }, { "epoch": 1.8511326860841424, "grad_norm": 0.08226727888900415, "learning_rate": 7.637952278974034e-05, "loss": 0.3122, "step": 286 }, { "epoch": 1.8576051779935274, "grad_norm": 0.08664703990759018, "learning_rate": 7.564461722890081e-05, "loss": 0.3276, "step": 287 }, { "epoch": 1.8640776699029127, "grad_norm": 0.08284483357408984, "learning_rate": 7.491110738206942e-05, "loss": 0.3331, "step": 288 }, { "epoch": 1.8705501618122977, "grad_norm": 0.08547813142706377, "learning_rate": 7.417903528389534e-05, "loss": 0.3108, "step": 289 }, { "epoch": 1.8770226537216828, "grad_norm": 0.08375066019242061, "learning_rate": 7.344844288663571e-05, "loss": 0.3356, "step": 290 }, { "epoch": 1.883495145631068, "grad_norm": 0.08564167180521602, "learning_rate": 7.27193720577517e-05, "loss": 0.3264, "step": 291 }, { "epoch": 1.889967637540453, "grad_norm": 0.08560457179852256, "learning_rate": 7.19918645775093e-05, "loss": 0.3274, "step": 292 }, { "epoch": 1.896440129449838, "grad_norm": 0.08594335919666654, "learning_rate": 7.126596213658488e-05, "loss": 0.3268, "step": 293 }, { "epoch": 1.9029126213592233, "grad_norm": 0.086743773673858, "learning_rate": 7.05417063336761e-05, "loss": 0.3291, "step": 294 }, { "epoch": 1.9093851132686084, "grad_norm": 0.08512277813785797, "learning_rate": 6.981913867311819e-05, "loss": 0.3199, "step": 295 }, { "epoch": 1.9158576051779934, "grad_norm": 0.08482302988433711, "learning_rate": 6.909830056250527e-05, "loss": 0.3308, "step": 296 }, { "epoch": 1.9223300970873787, "grad_norm": 0.08336748744395403, "learning_rate": 6.83792333103176e-05, "loss": 0.3303, "step": 297 }, { "epoch": 1.9288025889967637, "grad_norm": 0.08685899308663286, "learning_rate": 6.766197812355438e-05, "loss": 0.3156, "step": 298 }, { "epoch": 1.9352750809061487, "grad_norm": 0.08670025171649215, "learning_rate": 6.69465761053721e-05, "loss": 0.3263, "step": 299 }, { "epoch": 1.941747572815534, "grad_norm": 0.08367058440741144, "learning_rate": 6.623306825272937e-05, "loss": 0.3184, "step": 300 }, { "epoch": 1.948220064724919, "grad_norm": 0.08265849338783704, "learning_rate": 6.552149545403739e-05, "loss": 0.3141, "step": 301 }, { "epoch": 1.954692556634304, "grad_norm": 0.08667064648503052, "learning_rate": 6.48118984868167e-05, "loss": 0.3176, "step": 302 }, { "epoch": 1.9611650485436893, "grad_norm": 0.08624477625370294, "learning_rate": 6.410431801536058e-05, "loss": 0.3245, "step": 303 }, { "epoch": 1.9676375404530746, "grad_norm": 0.0863914502801744, "learning_rate": 6.339879458840465e-05, "loss": 0.3345, "step": 304 }, { "epoch": 1.9741100323624594, "grad_norm": 0.0841627898701783, "learning_rate": 6.269536863680307e-05, "loss": 0.3194, "step": 305 }, { "epoch": 1.9805825242718447, "grad_norm": 0.0854832505200848, "learning_rate": 6.199408047121174e-05, "loss": 0.3396, "step": 306 }, { "epoch": 1.98705501618123, "grad_norm": 0.07970724191486213, "learning_rate": 6.129497027977829e-05, "loss": 0.3228, "step": 307 }, { "epoch": 1.9935275080906147, "grad_norm": 0.08216914347596244, "learning_rate": 6.059807812583883e-05, "loss": 0.3119, "step": 308 }, { "epoch": 2.0, "grad_norm": 0.08150138639289872, "learning_rate": 5.990344394562226e-05, "loss": 0.3166, "step": 309 }, { "epoch": 2.0, "eval_loss": 0.3374881148338318, "eval_runtime": 35.4418, "eval_samples_per_second": 29.316, "eval_steps_per_second": 0.931, "step": 309 }, { "epoch": 2.0064724919093853, "grad_norm": 0.08131373994961688, "learning_rate": 5.92111075459616e-05, "loss": 0.3175, "step": 310 }, { "epoch": 2.01294498381877, "grad_norm": 0.08591014053929732, "learning_rate": 5.852110860201294e-05, "loss": 0.3109, "step": 311 }, { "epoch": 2.0194174757281553, "grad_norm": 0.08486358862414822, "learning_rate": 5.7833486654981606e-05, "loss": 0.3056, "step": 312 }, { "epoch": 2.0258899676375406, "grad_norm": 0.08216589197196723, "learning_rate": 5.714828110985635e-05, "loss": 0.308, "step": 313 }, { "epoch": 2.0323624595469254, "grad_norm": 0.08472956112082734, "learning_rate": 5.6465531233151126e-05, "loss": 0.3063, "step": 314 }, { "epoch": 2.0388349514563107, "grad_norm": 0.08715839296976118, "learning_rate": 5.578527615065492e-05, "loss": 0.3146, "step": 315 }, { "epoch": 2.045307443365696, "grad_norm": 0.08426241524579575, "learning_rate": 5.510755484518955e-05, "loss": 0.305, "step": 316 }, { "epoch": 2.0517799352750807, "grad_norm": 0.08998957963984325, "learning_rate": 5.443240615437586e-05, "loss": 0.3082, "step": 317 }, { "epoch": 2.058252427184466, "grad_norm": 0.09157974577803293, "learning_rate": 5.375986876840784e-05, "loss": 0.3187, "step": 318 }, { "epoch": 2.0647249190938513, "grad_norm": 0.09110362940023115, "learning_rate": 5.30899812278356e-05, "loss": 0.3089, "step": 319 }, { "epoch": 2.071197411003236, "grad_norm": 0.08924632615443999, "learning_rate": 5.2422781921356826e-05, "loss": 0.3108, "step": 320 }, { "epoch": 2.0776699029126213, "grad_norm": 0.08653114959635026, "learning_rate": 5.1758309083616673e-05, "loss": 0.3033, "step": 321 }, { "epoch": 2.0841423948220066, "grad_norm": 0.0897639664776455, "learning_rate": 5.109660079301668e-05, "loss": 0.3044, "step": 322 }, { "epoch": 2.0906148867313914, "grad_norm": 0.08654171610929062, "learning_rate": 5.043769496953299e-05, "loss": 0.3033, "step": 323 }, { "epoch": 2.0970873786407767, "grad_norm": 0.0883439805695338, "learning_rate": 4.9781629372542895e-05, "loss": 0.3081, "step": 324 }, { "epoch": 2.103559870550162, "grad_norm": 0.08874070518690325, "learning_rate": 4.912844159866112e-05, "loss": 0.3029, "step": 325 }, { "epoch": 2.1100323624595467, "grad_norm": 0.08970836183938685, "learning_rate": 4.847816907958549e-05, "loss": 0.3051, "step": 326 }, { "epoch": 2.116504854368932, "grad_norm": 0.08702359256692323, "learning_rate": 4.783084907995156e-05, "loss": 0.3083, "step": 327 }, { "epoch": 2.1229773462783172, "grad_norm": 0.08620252985570646, "learning_rate": 4.718651869519731e-05, "loss": 0.3009, "step": 328 }, { "epoch": 2.129449838187702, "grad_norm": 0.09026535273366211, "learning_rate": 4.654521484943735e-05, "loss": 0.3095, "step": 329 }, { "epoch": 2.1359223300970873, "grad_norm": 0.08797218673185163, "learning_rate": 4.59069742933468e-05, "loss": 0.3037, "step": 330 }, { "epoch": 2.1423948220064726, "grad_norm": 0.08992206831884317, "learning_rate": 4.527183360205541e-05, "loss": 0.3058, "step": 331 }, { "epoch": 2.148867313915858, "grad_norm": 0.09019684013585973, "learning_rate": 4.4639829173051554e-05, "loss": 0.3091, "step": 332 }, { "epoch": 2.1553398058252426, "grad_norm": 0.08846989758042821, "learning_rate": 4.401099722409631e-05, "loss": 0.2968, "step": 333 }, { "epoch": 2.161812297734628, "grad_norm": 0.09000600047149172, "learning_rate": 4.338537379114801e-05, "loss": 0.322, "step": 334 }, { "epoch": 2.168284789644013, "grad_norm": 0.09084909315613375, "learning_rate": 4.2762994726297346e-05, "loss": 0.3104, "step": 335 }, { "epoch": 2.174757281553398, "grad_norm": 0.08897581163092526, "learning_rate": 4.2143895695712444e-05, "loss": 0.3029, "step": 336 }, { "epoch": 2.1812297734627832, "grad_norm": 0.08806995729841001, "learning_rate": 4.152811217759529e-05, "loss": 0.3127, "step": 337 }, { "epoch": 2.1877022653721685, "grad_norm": 0.0909392143685305, "learning_rate": 4.091567946014858e-05, "loss": 0.3077, "step": 338 }, { "epoch": 2.1941747572815533, "grad_norm": 0.08526830124486821, "learning_rate": 4.0306632639553323e-05, "loss": 0.3092, "step": 339 }, { "epoch": 2.2006472491909386, "grad_norm": 0.0889661858323667, "learning_rate": 3.970100661795766e-05, "loss": 0.306, "step": 340 }, { "epoch": 2.207119741100324, "grad_norm": 0.08742864887103546, "learning_rate": 3.909883610147696e-05, "loss": 0.3021, "step": 341 }, { "epoch": 2.2135922330097086, "grad_norm": 0.08736016824506748, "learning_rate": 3.8500155598204644e-05, "loss": 0.2974, "step": 342 }, { "epoch": 2.220064724919094, "grad_norm": 0.0895276470969271, "learning_rate": 3.7904999416234864e-05, "loss": 0.3121, "step": 343 }, { "epoch": 2.226537216828479, "grad_norm": 0.08707410830498415, "learning_rate": 3.731340166169635e-05, "loss": 0.304, "step": 344 }, { "epoch": 2.233009708737864, "grad_norm": 0.09084514125957084, "learning_rate": 3.6725396236797935e-05, "loss": 0.3139, "step": 345 }, { "epoch": 2.2394822006472492, "grad_norm": 0.08595616841814938, "learning_rate": 3.614101683788575e-05, "loss": 0.3034, "step": 346 }, { "epoch": 2.2459546925566345, "grad_norm": 0.08918897946715092, "learning_rate": 3.5560296953512295e-05, "loss": 0.2871, "step": 347 }, { "epoch": 2.2524271844660193, "grad_norm": 0.08951525733240658, "learning_rate": 3.498326986251717e-05, "loss": 0.3064, "step": 348 }, { "epoch": 2.2588996763754046, "grad_norm": 0.08959019506927517, "learning_rate": 3.4409968632120126e-05, "loss": 0.3062, "step": 349 }, { "epoch": 2.26537216828479, "grad_norm": 0.09072525944669337, "learning_rate": 3.3840426116026044e-05, "loss": 0.3113, "step": 350 }, { "epoch": 2.2718446601941746, "grad_norm": 0.09370535186596111, "learning_rate": 3.327467495254225e-05, "loss": 0.3195, "step": 351 }, { "epoch": 2.27831715210356, "grad_norm": 0.08681410960013289, "learning_rate": 3.2712747562708115e-05, "loss": 0.3039, "step": 352 }, { "epoch": 2.284789644012945, "grad_norm": 0.0884544764695178, "learning_rate": 3.215467614843719e-05, "loss": 0.3028, "step": 353 }, { "epoch": 2.29126213592233, "grad_norm": 0.08974826283064652, "learning_rate": 3.160049269067174e-05, "loss": 0.3073, "step": 354 }, { "epoch": 2.2977346278317152, "grad_norm": 0.08791055069767624, "learning_rate": 3.105022894755003e-05, "loss": 0.3011, "step": 355 }, { "epoch": 2.3042071197411005, "grad_norm": 0.08951734113600135, "learning_rate": 3.0503916452586612e-05, "loss": 0.3056, "step": 356 }, { "epoch": 2.3106796116504853, "grad_norm": 0.08762327614071365, "learning_rate": 2.9961586512864947e-05, "loss": 0.3138, "step": 357 }, { "epoch": 2.3171521035598706, "grad_norm": 0.09032122428603973, "learning_rate": 2.9423270207243437e-05, "loss": 0.2978, "step": 358 }, { "epoch": 2.323624595469256, "grad_norm": 0.0917821868399838, "learning_rate": 2.888899838457455e-05, "loss": 0.3084, "step": 359 }, { "epoch": 2.3300970873786406, "grad_norm": 0.09193242261010433, "learning_rate": 2.835880166193683e-05, "loss": 0.3064, "step": 360 }, { "epoch": 2.336569579288026, "grad_norm": 0.08883763145021951, "learning_rate": 2.7832710422880328e-05, "loss": 0.2993, "step": 361 }, { "epoch": 2.343042071197411, "grad_norm": 0.08991962234888566, "learning_rate": 2.7310754815685624e-05, "loss": 0.3034, "step": 362 }, { "epoch": 2.349514563106796, "grad_norm": 0.08994982579652418, "learning_rate": 2.679296475163595e-05, "loss": 0.3072, "step": 363 }, { "epoch": 2.355987055016181, "grad_norm": 0.08970808859923911, "learning_rate": 2.6279369903303175e-05, "loss": 0.3023, "step": 364 }, { "epoch": 2.3624595469255665, "grad_norm": 0.08588546055096836, "learning_rate": 2.5769999702847346e-05, "loss": 0.296, "step": 365 }, { "epoch": 2.3689320388349513, "grad_norm": 0.08990846870006015, "learning_rate": 2.5264883340330113e-05, "loss": 0.3022, "step": 366 }, { "epoch": 2.3754045307443366, "grad_norm": 0.09151580193749502, "learning_rate": 2.4764049762041874e-05, "loss": 0.2978, "step": 367 }, { "epoch": 2.381877022653722, "grad_norm": 0.08959471455140089, "learning_rate": 2.426752766884306e-05, "loss": 0.3099, "step": 368 }, { "epoch": 2.3883495145631066, "grad_norm": 0.0887364245299765, "learning_rate": 2.377534551451932e-05, "loss": 0.3016, "step": 369 }, { "epoch": 2.394822006472492, "grad_norm": 0.08882940905923181, "learning_rate": 2.328753150415094e-05, "loss": 0.3004, "step": 370 }, { "epoch": 2.401294498381877, "grad_norm": 0.08843646634108779, "learning_rate": 2.280411359249668e-05, "loss": 0.2933, "step": 371 }, { "epoch": 2.407766990291262, "grad_norm": 0.09301769304474337, "learning_rate": 2.2325119482391467e-05, "loss": 0.3134, "step": 372 }, { "epoch": 2.414239482200647, "grad_norm": 0.08707266390112682, "learning_rate": 2.185057662315918e-05, "loss": 0.3003, "step": 373 }, { "epoch": 2.4207119741100325, "grad_norm": 0.0868807473282313, "learning_rate": 2.1380512209039528e-05, "loss": 0.302, "step": 374 }, { "epoch": 2.4271844660194173, "grad_norm": 0.09363452018708772, "learning_rate": 2.0914953177629548e-05, "loss": 0.3148, "step": 375 }, { "epoch": 2.4336569579288025, "grad_norm": 0.08938831583372829, "learning_rate": 2.0453926208340003e-05, "loss": 0.3055, "step": 376 }, { "epoch": 2.440129449838188, "grad_norm": 0.09230125425366141, "learning_rate": 1.999745772086655e-05, "loss": 0.3041, "step": 377 }, { "epoch": 2.4466019417475726, "grad_norm": 0.08932842130151852, "learning_rate": 1.954557387367557e-05, "loss": 0.3037, "step": 378 }, { "epoch": 2.453074433656958, "grad_norm": 0.08923122998718774, "learning_rate": 1.9098300562505266e-05, "loss": 0.3055, "step": 379 }, { "epoch": 2.459546925566343, "grad_norm": 0.0917592218140953, "learning_rate": 1.8655663418881584e-05, "loss": 0.3006, "step": 380 }, { "epoch": 2.466019417475728, "grad_norm": 0.08856315870273591, "learning_rate": 1.821768780864943e-05, "loss": 0.3072, "step": 381 }, { "epoch": 2.472491909385113, "grad_norm": 0.08867770851651259, "learning_rate": 1.7784398830519e-05, "loss": 0.298, "step": 382 }, { "epoch": 2.4789644012944985, "grad_norm": 0.08895980547301872, "learning_rate": 1.7355821314627564e-05, "loss": 0.3087, "step": 383 }, { "epoch": 2.4854368932038833, "grad_norm": 0.09119431484259279, "learning_rate": 1.6931979821116418e-05, "loss": 0.305, "step": 384 }, { "epoch": 2.4919093851132685, "grad_norm": 0.09318125516492613, "learning_rate": 1.6512898638723497e-05, "loss": 0.3095, "step": 385 }, { "epoch": 2.498381877022654, "grad_norm": 0.08770991666106868, "learning_rate": 1.6098601783391487e-05, "loss": 0.292, "step": 386 }, { "epoch": 2.5048543689320386, "grad_norm": 0.08730384586077941, "learning_rate": 1.5689112996891576e-05, "loss": 0.2982, "step": 387 }, { "epoch": 2.511326860841424, "grad_norm": 0.09025336858704619, "learning_rate": 1.5284455745462834e-05, "loss": 0.3175, "step": 388 }, { "epoch": 2.517799352750809, "grad_norm": 0.09170439944127616, "learning_rate": 1.4884653218467571e-05, "loss": 0.3131, "step": 389 }, { "epoch": 2.524271844660194, "grad_norm": 0.08975472589683904, "learning_rate": 1.4489728327062324e-05, "loss": 0.3077, "step": 390 }, { "epoch": 2.530744336569579, "grad_norm": 0.09151247517360593, "learning_rate": 1.4099703702884936e-05, "loss": 0.2987, "step": 391 }, { "epoch": 2.5372168284789645, "grad_norm": 0.08707773026948659, "learning_rate": 1.3714601696757712e-05, "loss": 0.301, "step": 392 }, { "epoch": 2.5436893203883493, "grad_norm": 0.09236291075818845, "learning_rate": 1.3334444377406452e-05, "loss": 0.3095, "step": 393 }, { "epoch": 2.5501618122977345, "grad_norm": 0.09003080825117314, "learning_rate": 1.2959253530195836e-05, "loss": 0.301, "step": 394 }, { "epoch": 2.55663430420712, "grad_norm": 0.08899294818724815, "learning_rate": 1.258905065588103e-05, "loss": 0.2996, "step": 395 }, { "epoch": 2.5631067961165046, "grad_norm": 0.08642480774869472, "learning_rate": 1.2223856969375447e-05, "loss": 0.2972, "step": 396 }, { "epoch": 2.56957928802589, "grad_norm": 0.09069465990264014, "learning_rate": 1.1863693398535114e-05, "loss": 0.3028, "step": 397 }, { "epoch": 2.576051779935275, "grad_norm": 0.09007456341294559, "learning_rate": 1.1508580582959349e-05, "loss": 0.3026, "step": 398 }, { "epoch": 2.58252427184466, "grad_norm": 0.0874176261621062, "learning_rate": 1.1158538872807933e-05, "loss": 0.302, "step": 399 }, { "epoch": 2.588996763754045, "grad_norm": 0.08762089070699175, "learning_rate": 1.0813588327634961e-05, "loss": 0.2845, "step": 400 }, { "epoch": 2.5954692556634305, "grad_norm": 0.08875402737182131, "learning_rate": 1.0473748715239307e-05, "loss": 0.3016, "step": 401 }, { "epoch": 2.6019417475728153, "grad_norm": 0.09129343356719948, "learning_rate": 1.01390395105318e-05, "loss": 0.3154, "step": 402 }, { "epoch": 2.6084142394822005, "grad_norm": 0.08810297072389008, "learning_rate": 9.809479894419149e-06, "loss": 0.3033, "step": 403 }, { "epoch": 2.614886731391586, "grad_norm": 0.09018340343849121, "learning_rate": 9.485088752704885e-06, "loss": 0.3088, "step": 404 }, { "epoch": 2.6213592233009706, "grad_norm": 0.08968883260752107, "learning_rate": 9.16588467500693e-06, "loss": 0.2982, "step": 405 }, { "epoch": 2.627831715210356, "grad_norm": 0.08957666144874589, "learning_rate": 8.851885953692374e-06, "loss": 0.3082, "step": 406 }, { "epoch": 2.634304207119741, "grad_norm": 0.09198641057091544, "learning_rate": 8.543110582829272e-06, "loss": 0.3015, "step": 407 }, { "epoch": 2.6407766990291264, "grad_norm": 0.08915392372194866, "learning_rate": 8.239576257155334e-06, "loss": 0.3068, "step": 408 }, { "epoch": 2.647249190938511, "grad_norm": 0.09135989217201736, "learning_rate": 7.941300371063953e-06, "loss": 0.3118, "step": 409 }, { "epoch": 2.6537216828478964, "grad_norm": 0.09150707922550998, "learning_rate": 7.648300017607534e-06, "loss": 0.3049, "step": 410 }, { "epoch": 2.6601941747572817, "grad_norm": 0.08679197622255307, "learning_rate": 7.360591987517762e-06, "loss": 0.2973, "step": 411 }, { "epoch": 2.6666666666666665, "grad_norm": 0.09043247718192007, "learning_rate": 7.078192768243486e-06, "loss": 0.3016, "step": 412 }, { "epoch": 2.6731391585760518, "grad_norm": 0.09227814811133576, "learning_rate": 6.80111854300588e-06, "loss": 0.3109, "step": 413 }, { "epoch": 2.679611650485437, "grad_norm": 0.08752395500418549, "learning_rate": 6.5293851898710625e-06, "loss": 0.301, "step": 414 }, { "epoch": 2.686084142394822, "grad_norm": 0.08959180387680037, "learning_rate": 6.2630082808401326e-06, "loss": 0.2992, "step": 415 }, { "epoch": 2.692556634304207, "grad_norm": 0.08743171313620991, "learning_rate": 6.00200308095682e-06, "loss": 0.301, "step": 416 }, { "epoch": 2.6990291262135924, "grad_norm": 0.08980865803761802, "learning_rate": 5.746384547432737e-06, "loss": 0.3174, "step": 417 }, { "epoch": 2.705501618122977, "grad_norm": 0.08661713786701486, "learning_rate": 5.496167328790191e-06, "loss": 0.2939, "step": 418 }, { "epoch": 2.7119741100323624, "grad_norm": 0.08786309161255973, "learning_rate": 5.251365764022753e-06, "loss": 0.2978, "step": 419 }, { "epoch": 2.7184466019417477, "grad_norm": 0.08733419882477451, "learning_rate": 5.011993881773569e-06, "loss": 0.3002, "step": 420 }, { "epoch": 2.724919093851133, "grad_norm": 0.08926182705644836, "learning_rate": 4.778065399531395e-06, "loss": 0.2975, "step": 421 }, { "epoch": 2.7313915857605178, "grad_norm": 0.0894176370961602, "learning_rate": 4.549593722844492e-06, "loss": 0.3067, "step": 422 }, { "epoch": 2.737864077669903, "grad_norm": 0.0876923455017712, "learning_rate": 4.326591944552438e-06, "loss": 0.3097, "step": 423 }, { "epoch": 2.7443365695792883, "grad_norm": 0.08572253083705766, "learning_rate": 4.109072844035844e-06, "loss": 0.2911, "step": 424 }, { "epoch": 2.750809061488673, "grad_norm": 0.08883901045408762, "learning_rate": 3.8970488864839334e-06, "loss": 0.2963, "step": 425 }, { "epoch": 2.7572815533980584, "grad_norm": 0.09240186844770727, "learning_rate": 3.690532222180343e-06, "loss": 0.3024, "step": 426 }, { "epoch": 2.7637540453074436, "grad_norm": 0.09051220479105677, "learning_rate": 3.4895346858066724e-06, "loss": 0.3089, "step": 427 }, { "epoch": 2.7702265372168284, "grad_norm": 0.0891289977534799, "learning_rate": 3.2940677957644215e-06, "loss": 0.3047, "step": 428 }, { "epoch": 2.7766990291262137, "grad_norm": 0.08973918683676917, "learning_rate": 3.104142753514849e-06, "loss": 0.3151, "step": 429 }, { "epoch": 2.783171521035599, "grad_norm": 0.08839195406678332, "learning_rate": 2.9197704429370977e-06, "loss": 0.2995, "step": 430 }, { "epoch": 2.7896440129449838, "grad_norm": 0.08711643702101148, "learning_rate": 2.7409614297043806e-06, "loss": 0.2983, "step": 431 }, { "epoch": 2.796116504854369, "grad_norm": 0.08827751306740735, "learning_rate": 2.5677259606786684e-06, "loss": 0.3071, "step": 432 }, { "epoch": 2.8025889967637543, "grad_norm": 0.08989812034868257, "learning_rate": 2.4000739633233347e-06, "loss": 0.3185, "step": 433 }, { "epoch": 2.809061488673139, "grad_norm": 0.08814993018203572, "learning_rate": 2.238015045134334e-06, "loss": 0.303, "step": 434 }, { "epoch": 2.8155339805825244, "grad_norm": 0.08731684328369535, "learning_rate": 2.0815584930895972e-06, "loss": 0.3053, "step": 435 }, { "epoch": 2.8220064724919096, "grad_norm": 0.09248953078205228, "learning_rate": 1.9307132731168352e-06, "loss": 0.3048, "step": 436 }, { "epoch": 2.8284789644012944, "grad_norm": 0.08979246033823442, "learning_rate": 1.7854880295797405e-06, "loss": 0.3046, "step": 437 }, { "epoch": 2.8349514563106797, "grad_norm": 0.08615293987508164, "learning_rate": 1.6458910847826026e-06, "loss": 0.2991, "step": 438 }, { "epoch": 2.841423948220065, "grad_norm": 0.08795133012526277, "learning_rate": 1.5119304384934252e-06, "loss": 0.2922, "step": 439 }, { "epoch": 2.8478964401294498, "grad_norm": 0.0862166031371393, "learning_rate": 1.3836137674854255e-06, "loss": 0.2921, "step": 440 }, { "epoch": 2.854368932038835, "grad_norm": 0.08955236492173343, "learning_rate": 1.2609484250971749e-06, "loss": 0.2997, "step": 441 }, { "epoch": 2.8608414239482203, "grad_norm": 0.08843363251406085, "learning_rate": 1.143941440811147e-06, "loss": 0.3088, "step": 442 }, { "epoch": 2.867313915857605, "grad_norm": 0.09009839385408548, "learning_rate": 1.0325995198509409e-06, "loss": 0.3109, "step": 443 }, { "epoch": 2.8737864077669903, "grad_norm": 0.08785361574469933, "learning_rate": 9.269290427969868e-07, "loss": 0.2971, "step": 444 }, { "epoch": 2.8802588996763756, "grad_norm": 0.08841485850159143, "learning_rate": 8.26936065220929e-07, "loss": 0.3035, "step": 445 }, { "epoch": 2.8867313915857604, "grad_norm": 0.08779257157327024, "learning_rate": 7.326263173385584e-07, "loss": 0.3013, "step": 446 }, { "epoch": 2.8932038834951457, "grad_norm": 0.0883658852563661, "learning_rate": 6.440052036815081e-07, "loss": 0.305, "step": 447 }, { "epoch": 2.899676375404531, "grad_norm": 0.08862179969635353, "learning_rate": 5.610778027874908e-07, "loss": 0.2952, "step": 448 }, { "epoch": 2.9061488673139158, "grad_norm": 0.08863424887187872, "learning_rate": 4.838488669092534e-07, "loss": 0.2956, "step": 449 }, { "epoch": 2.912621359223301, "grad_norm": 0.08998809785580231, "learning_rate": 4.123228217422948e-07, "loss": 0.3015, "step": 450 }, { "epoch": 2.9190938511326863, "grad_norm": 0.08707387881008244, "learning_rate": 3.465037661712134e-07, "loss": 0.3095, "step": 451 }, { "epoch": 2.925566343042071, "grad_norm": 0.08740571796909039, "learning_rate": 2.86395472034795e-07, "loss": 0.3033, "step": 452 }, { "epoch": 2.9320388349514563, "grad_norm": 0.08599743041374118, "learning_rate": 2.3200138390993e-07, "loss": 0.2904, "step": 453 }, { "epoch": 2.9385113268608416, "grad_norm": 0.08726631646259621, "learning_rate": 1.83324618914138e-07, "loss": 0.3007, "step": 454 }, { "epoch": 2.9449838187702264, "grad_norm": 0.08898358262047706, "learning_rate": 1.4036796652701078e-07, "loss": 0.3017, "step": 455 }, { "epoch": 2.9514563106796117, "grad_norm": 0.08765396985812648, "learning_rate": 1.031338884302846e-07, "loss": 0.2887, "step": 456 }, { "epoch": 2.957928802588997, "grad_norm": 0.09001756499046232, "learning_rate": 7.162451836685291e-08, "loss": 0.309, "step": 457 }, { "epoch": 2.9644012944983817, "grad_norm": 0.09045952221990206, "learning_rate": 4.584166201841988e-08, "loss": 0.2996, "step": 458 }, { "epoch": 2.970873786407767, "grad_norm": 0.08709990251871128, "learning_rate": 2.578679690204977e-08, "loss": 0.2982, "step": 459 }, { "epoch": 2.9773462783171523, "grad_norm": 0.08844965304987025, "learning_rate": 1.1461072285490204e-08, "loss": 0.3136, "step": 460 }, { "epoch": 2.983818770226537, "grad_norm": 0.0892220185111147, "learning_rate": 2.865309121358184e-09, "loss": 0.3011, "step": 461 }, { "epoch": 2.9902912621359223, "grad_norm": 0.08562579597167659, "learning_rate": 0.0, "loss": 0.2951, "step": 462 }, { "epoch": 2.9902912621359223, "eval_loss": 0.3344117999076843, "eval_runtime": 35.3894, "eval_samples_per_second": 29.359, "eval_steps_per_second": 0.932, "step": 462 }, { "epoch": 2.9902912621359223, "step": 462, "total_flos": 1.8679451641472614e+17, "train_loss": 0.36376328850205325, "train_runtime": 5739.3163, "train_samples_per_second": 10.311, "train_steps_per_second": 0.08 } ], "logging_steps": 1, "max_steps": 462, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8679451641472614e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }