|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9902912621359223, |
|
"eval_steps": 500, |
|
"global_step": 462, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006472491909385114, |
|
"grad_norm": 0.8681851528027766, |
|
"learning_rate": 4.255319148936171e-06, |
|
"loss": 1.2864, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.012944983818770227, |
|
"grad_norm": 0.8603766114064247, |
|
"learning_rate": 8.510638297872341e-06, |
|
"loss": 1.3029, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.019417475728155338, |
|
"grad_norm": 0.8460211658943029, |
|
"learning_rate": 1.2765957446808511e-05, |
|
"loss": 1.2676, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.025889967637540454, |
|
"grad_norm": 0.823879412927583, |
|
"learning_rate": 1.7021276595744682e-05, |
|
"loss": 1.2794, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.032362459546925564, |
|
"grad_norm": 0.7999755689798974, |
|
"learning_rate": 2.1276595744680852e-05, |
|
"loss": 1.2632, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.038834951456310676, |
|
"grad_norm": 0.779503875119536, |
|
"learning_rate": 2.5531914893617022e-05, |
|
"loss": 1.2356, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.045307443365695796, |
|
"grad_norm": 0.7113356719326077, |
|
"learning_rate": 2.9787234042553192e-05, |
|
"loss": 1.2075, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.05177993527508091, |
|
"grad_norm": 0.5733112055032882, |
|
"learning_rate": 3.4042553191489365e-05, |
|
"loss": 1.1078, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.05825242718446602, |
|
"grad_norm": 0.4686147590427965, |
|
"learning_rate": 3.829787234042553e-05, |
|
"loss": 1.0108, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.06472491909385113, |
|
"grad_norm": 0.4960364471309283, |
|
"learning_rate": 4.2553191489361704e-05, |
|
"loss": 0.9991, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07119741100323625, |
|
"grad_norm": 0.5462124995641008, |
|
"learning_rate": 4.680851063829788e-05, |
|
"loss": 0.9427, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.07766990291262135, |
|
"grad_norm": 0.546681134551426, |
|
"learning_rate": 5.1063829787234044e-05, |
|
"loss": 0.8893, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.08414239482200647, |
|
"grad_norm": 0.556093704754962, |
|
"learning_rate": 5.531914893617022e-05, |
|
"loss": 0.8444, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.09061488673139159, |
|
"grad_norm": 0.4982225051601507, |
|
"learning_rate": 5.9574468085106384e-05, |
|
"loss": 0.7642, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0970873786407767, |
|
"grad_norm": 0.48253513519932356, |
|
"learning_rate": 6.382978723404256e-05, |
|
"loss": 0.7228, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.10355987055016182, |
|
"grad_norm": 0.44162352908966596, |
|
"learning_rate": 6.808510638297873e-05, |
|
"loss": 0.6525, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.11003236245954692, |
|
"grad_norm": 0.3918381633741909, |
|
"learning_rate": 7.23404255319149e-05, |
|
"loss": 0.604, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.11650485436893204, |
|
"grad_norm": 0.2648043458799554, |
|
"learning_rate": 7.659574468085106e-05, |
|
"loss": 0.5693, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.12297734627831715, |
|
"grad_norm": 0.243819913231238, |
|
"learning_rate": 8.085106382978723e-05, |
|
"loss": 0.5205, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.12944983818770225, |
|
"grad_norm": 0.23168627433032857, |
|
"learning_rate": 8.510638297872341e-05, |
|
"loss": 0.5296, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13592233009708737, |
|
"grad_norm": 0.1994751857050266, |
|
"learning_rate": 8.936170212765958e-05, |
|
"loss": 0.5269, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.1423948220064725, |
|
"grad_norm": 0.2530061947493179, |
|
"learning_rate": 9.361702127659576e-05, |
|
"loss": 0.5091, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.1488673139158576, |
|
"grad_norm": 0.24049496280327698, |
|
"learning_rate": 9.787234042553192e-05, |
|
"loss": 0.5169, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.1553398058252427, |
|
"grad_norm": 0.22948907942121893, |
|
"learning_rate": 0.00010212765957446809, |
|
"loss": 0.4954, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.16181229773462782, |
|
"grad_norm": 0.17253285827605366, |
|
"learning_rate": 0.00010638297872340425, |
|
"loss": 0.4678, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.16828478964401294, |
|
"grad_norm": 0.1938226429644451, |
|
"learning_rate": 0.00011063829787234043, |
|
"loss": 0.4857, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.17475728155339806, |
|
"grad_norm": 0.13721502402963923, |
|
"learning_rate": 0.00011489361702127661, |
|
"loss": 0.466, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.18122977346278318, |
|
"grad_norm": 0.12808136935864992, |
|
"learning_rate": 0.00011914893617021277, |
|
"loss": 0.4652, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.18770226537216828, |
|
"grad_norm": 0.12041781390556437, |
|
"learning_rate": 0.00012340425531914893, |
|
"loss": 0.4527, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.1941747572815534, |
|
"grad_norm": 0.11745727258563804, |
|
"learning_rate": 0.00012765957446808513, |
|
"loss": 0.4392, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.20064724919093851, |
|
"grad_norm": 0.11168371739950012, |
|
"learning_rate": 0.00013191489361702127, |
|
"loss": 0.4578, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.20711974110032363, |
|
"grad_norm": 0.11295507271229074, |
|
"learning_rate": 0.00013617021276595746, |
|
"loss": 0.4384, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.21359223300970873, |
|
"grad_norm": 0.11084156792638247, |
|
"learning_rate": 0.00014042553191489363, |
|
"loss": 0.4292, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.22006472491909385, |
|
"grad_norm": 0.10746985136176236, |
|
"learning_rate": 0.0001446808510638298, |
|
"loss": 0.4202, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.22653721682847897, |
|
"grad_norm": 0.10723808557482935, |
|
"learning_rate": 0.00014893617021276596, |
|
"loss": 0.4345, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.23300970873786409, |
|
"grad_norm": 0.10308761677232968, |
|
"learning_rate": 0.00015319148936170213, |
|
"loss": 0.4318, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.23948220064724918, |
|
"grad_norm": 0.1044215437881733, |
|
"learning_rate": 0.00015744680851063832, |
|
"loss": 0.4289, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2459546925566343, |
|
"grad_norm": 0.10783517564137016, |
|
"learning_rate": 0.00016170212765957446, |
|
"loss": 0.4226, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.2524271844660194, |
|
"grad_norm": 0.11058134845234333, |
|
"learning_rate": 0.00016595744680851065, |
|
"loss": 0.4392, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.2588996763754045, |
|
"grad_norm": 0.10721298987840806, |
|
"learning_rate": 0.00017021276595744682, |
|
"loss": 0.4097, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.26537216828478966, |
|
"grad_norm": 0.10203447619356035, |
|
"learning_rate": 0.00017446808510638298, |
|
"loss": 0.4371, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.27184466019417475, |
|
"grad_norm": 0.09611299924461707, |
|
"learning_rate": 0.00017872340425531915, |
|
"loss": 0.4264, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.2783171521035599, |
|
"grad_norm": 0.09911922773823158, |
|
"learning_rate": 0.00018297872340425532, |
|
"loss": 0.4177, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.284789644012945, |
|
"grad_norm": 0.10195593280531762, |
|
"learning_rate": 0.0001872340425531915, |
|
"loss": 0.4322, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.2912621359223301, |
|
"grad_norm": 0.0990363778451009, |
|
"learning_rate": 0.00019148936170212768, |
|
"loss": 0.4272, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2977346278317152, |
|
"grad_norm": 0.10312296274780797, |
|
"learning_rate": 0.00019574468085106384, |
|
"loss": 0.3997, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3042071197411003, |
|
"grad_norm": 0.09688182042517872, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3955, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3106796116504854, |
|
"grad_norm": 0.09540799755823594, |
|
"learning_rate": 0.00019999713469087867, |
|
"loss": 0.4078, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.31715210355987056, |
|
"grad_norm": 0.10596966214446712, |
|
"learning_rate": 0.00019998853892771453, |
|
"loss": 0.4102, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.32362459546925565, |
|
"grad_norm": 0.10086883515116485, |
|
"learning_rate": 0.00019997421320309795, |
|
"loss": 0.4048, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3300970873786408, |
|
"grad_norm": 0.10400887250356652, |
|
"learning_rate": 0.00019995415833798158, |
|
"loss": 0.403, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.3365695792880259, |
|
"grad_norm": 0.09067444994385492, |
|
"learning_rate": 0.00019992837548163316, |
|
"loss": 0.3867, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.343042071197411, |
|
"grad_norm": 0.09109660691369531, |
|
"learning_rate": 0.00019989686611156972, |
|
"loss": 0.3956, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.34951456310679613, |
|
"grad_norm": 0.09162094937144005, |
|
"learning_rate": 0.000199859632033473, |
|
"loss": 0.407, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.3559870550161812, |
|
"grad_norm": 0.09041901984453071, |
|
"learning_rate": 0.00019981667538108587, |
|
"loss": 0.3913, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.36245954692556637, |
|
"grad_norm": 0.0948520973493128, |
|
"learning_rate": 0.00019976799861609008, |
|
"loss": 0.3959, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.36893203883495146, |
|
"grad_norm": 0.09239535228342881, |
|
"learning_rate": 0.00019971360452796522, |
|
"loss": 0.3976, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.37540453074433655, |
|
"grad_norm": 0.09430664951378093, |
|
"learning_rate": 0.0001996534962338288, |
|
"loss": 0.3984, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.3818770226537217, |
|
"grad_norm": 0.08478493523992439, |
|
"learning_rate": 0.0001995876771782577, |
|
"loss": 0.3862, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.3883495145631068, |
|
"grad_norm": 0.08736157516174174, |
|
"learning_rate": 0.00019951615113309075, |
|
"loss": 0.3768, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3948220064724919, |
|
"grad_norm": 0.08833359616935596, |
|
"learning_rate": 0.00019943892219721253, |
|
"loss": 0.3832, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.40129449838187703, |
|
"grad_norm": 0.09634861288945641, |
|
"learning_rate": 0.0001993559947963185, |
|
"loss": 0.3975, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.4077669902912621, |
|
"grad_norm": 0.08474514858686731, |
|
"learning_rate": 0.00019926737368266144, |
|
"loss": 0.3803, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.41423948220064727, |
|
"grad_norm": 0.0878556667338793, |
|
"learning_rate": 0.00019917306393477907, |
|
"loss": 0.3874, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.42071197411003236, |
|
"grad_norm": 0.08906183743795007, |
|
"learning_rate": 0.00019907307095720303, |
|
"loss": 0.384, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.42718446601941745, |
|
"grad_norm": 0.08738087160835142, |
|
"learning_rate": 0.00019896740048014908, |
|
"loss": 0.3883, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.4336569579288026, |
|
"grad_norm": 0.09070551920354154, |
|
"learning_rate": 0.00019885605855918885, |
|
"loss": 0.38, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.4401294498381877, |
|
"grad_norm": 0.08922144963205411, |
|
"learning_rate": 0.00019873905157490285, |
|
"loss": 0.3854, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.44660194174757284, |
|
"grad_norm": 0.09304718374139792, |
|
"learning_rate": 0.0001986163862325146, |
|
"loss": 0.3804, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.45307443365695793, |
|
"grad_norm": 0.09290101551147209, |
|
"learning_rate": 0.0001984880695615066, |
|
"loss": 0.3876, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.459546925566343, |
|
"grad_norm": 0.08862476310706492, |
|
"learning_rate": 0.0001983541089152174, |
|
"loss": 0.3701, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.46601941747572817, |
|
"grad_norm": 0.0864643410147614, |
|
"learning_rate": 0.00019821451197042026, |
|
"loss": 0.3625, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.47249190938511326, |
|
"grad_norm": 0.0889683558231721, |
|
"learning_rate": 0.0001980692867268832, |
|
"loss": 0.3635, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.47896440129449835, |
|
"grad_norm": 0.090198299336284, |
|
"learning_rate": 0.0001979184415069104, |
|
"loss": 0.3804, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.4854368932038835, |
|
"grad_norm": 0.09214744360572043, |
|
"learning_rate": 0.00019776198495486565, |
|
"loss": 0.367, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.4919093851132686, |
|
"grad_norm": 0.08545016444696638, |
|
"learning_rate": 0.00019759992603667667, |
|
"loss": 0.3671, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.49838187702265374, |
|
"grad_norm": 0.08836432043627582, |
|
"learning_rate": 0.00019743227403932134, |
|
"loss": 0.3759, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.5048543689320388, |
|
"grad_norm": 0.09220525179921107, |
|
"learning_rate": 0.00019725903857029564, |
|
"loss": 0.3759, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.511326860841424, |
|
"grad_norm": 0.09021919927906268, |
|
"learning_rate": 0.00019708022955706292, |
|
"loss": 0.3748, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.517799352750809, |
|
"grad_norm": 0.0857417469759507, |
|
"learning_rate": 0.00019689585724648516, |
|
"loss": 0.3778, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5242718446601942, |
|
"grad_norm": 0.08899327468436542, |
|
"learning_rate": 0.00019670593220423558, |
|
"loss": 0.3711, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.5307443365695793, |
|
"grad_norm": 0.09420631205003693, |
|
"learning_rate": 0.00019651046531419332, |
|
"loss": 0.3678, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.5372168284789643, |
|
"grad_norm": 0.09175386946113716, |
|
"learning_rate": 0.00019630946777781966, |
|
"loss": 0.3669, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.5436893203883495, |
|
"grad_norm": 0.08372986124983309, |
|
"learning_rate": 0.0001961029511135161, |
|
"loss": 0.3593, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.5501618122977346, |
|
"grad_norm": 0.08923405383972152, |
|
"learning_rate": 0.00019589092715596417, |
|
"loss": 0.3729, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5566343042071198, |
|
"grad_norm": 0.08853783615052746, |
|
"learning_rate": 0.00019567340805544758, |
|
"loss": 0.3512, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.5631067961165048, |
|
"grad_norm": 0.09158487904428567, |
|
"learning_rate": 0.0001954504062771555, |
|
"loss": 0.3673, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.56957928802589, |
|
"grad_norm": 0.09709095338695828, |
|
"learning_rate": 0.00019522193460046864, |
|
"loss": 0.3758, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.5760517799352751, |
|
"grad_norm": 0.09252319787935548, |
|
"learning_rate": 0.00019498800611822645, |
|
"loss": 0.3588, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.5825242718446602, |
|
"grad_norm": 0.10229380875651542, |
|
"learning_rate": 0.00019474863423597728, |
|
"loss": 0.3859, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5889967637540453, |
|
"grad_norm": 0.09361866653135192, |
|
"learning_rate": 0.00019450383267120982, |
|
"loss": 0.3739, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.5954692556634305, |
|
"grad_norm": 0.0919566522897771, |
|
"learning_rate": 0.00019425361545256727, |
|
"loss": 0.3648, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.6019417475728155, |
|
"grad_norm": 0.10007063382515365, |
|
"learning_rate": 0.0001939979969190432, |
|
"loss": 0.3628, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.6084142394822006, |
|
"grad_norm": 0.08642815308708984, |
|
"learning_rate": 0.00019373699171915988, |
|
"loss": 0.3607, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.6148867313915858, |
|
"grad_norm": 0.08898100138256641, |
|
"learning_rate": 0.00019347061481012894, |
|
"loss": 0.3649, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6213592233009708, |
|
"grad_norm": 0.09929344608921754, |
|
"learning_rate": 0.00019319888145699415, |
|
"loss": 0.3707, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.627831715210356, |
|
"grad_norm": 0.08479565009317201, |
|
"learning_rate": 0.00019292180723175654, |
|
"loss": 0.3596, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.6343042071197411, |
|
"grad_norm": 0.08536934965094412, |
|
"learning_rate": 0.00019263940801248226, |
|
"loss": 0.3693, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6407766990291263, |
|
"grad_norm": 0.09264353563725661, |
|
"learning_rate": 0.0001923516999823925, |
|
"loss": 0.3679, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.6472491909385113, |
|
"grad_norm": 0.09075116775864867, |
|
"learning_rate": 0.00019205869962893605, |
|
"loss": 0.3594, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6537216828478964, |
|
"grad_norm": 0.08696915654679535, |
|
"learning_rate": 0.0001917604237428447, |
|
"loss": 0.3699, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.6601941747572816, |
|
"grad_norm": 0.08387717136900205, |
|
"learning_rate": 0.00019145688941717075, |
|
"loss": 0.3569, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.09727169698788522, |
|
"learning_rate": 0.00019114811404630762, |
|
"loss": 0.3725, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.6731391585760518, |
|
"grad_norm": 0.08785848039003973, |
|
"learning_rate": 0.0001908341153249931, |
|
"loss": 0.3625, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.6796116504854369, |
|
"grad_norm": 0.09111196159253629, |
|
"learning_rate": 0.00019051491124729512, |
|
"loss": 0.3658, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.686084142394822, |
|
"grad_norm": 0.09173527522438048, |
|
"learning_rate": 0.00019019052010558088, |
|
"loss": 0.3605, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.6925566343042071, |
|
"grad_norm": 0.08349190333555147, |
|
"learning_rate": 0.00018986096048946824, |
|
"loss": 0.3714, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.6990291262135923, |
|
"grad_norm": 0.08795475348576975, |
|
"learning_rate": 0.0001895262512847607, |
|
"loss": 0.3646, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.7055016181229773, |
|
"grad_norm": 0.0834930063587362, |
|
"learning_rate": 0.00018918641167236505, |
|
"loss": 0.3518, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.7119741100323624, |
|
"grad_norm": 0.08299225873111288, |
|
"learning_rate": 0.00018884146112719207, |
|
"loss": 0.3498, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7184466019417476, |
|
"grad_norm": 0.08676290151306544, |
|
"learning_rate": 0.00018849141941704067, |
|
"loss": 0.3612, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.7249190938511327, |
|
"grad_norm": 0.09107846226354271, |
|
"learning_rate": 0.00018813630660146488, |
|
"loss": 0.3508, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.7313915857605178, |
|
"grad_norm": 0.09040771142681107, |
|
"learning_rate": 0.00018777614303062457, |
|
"loss": 0.358, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.7378640776699029, |
|
"grad_norm": 0.08473984809675678, |
|
"learning_rate": 0.000187410949344119, |
|
"loss": 0.3567, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.7443365695792881, |
|
"grad_norm": 0.09166724031633931, |
|
"learning_rate": 0.00018704074646980415, |
|
"loss": 0.3474, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7508090614886731, |
|
"grad_norm": 0.09150633413591353, |
|
"learning_rate": 0.00018666555562259356, |
|
"loss": 0.3718, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.7572815533980582, |
|
"grad_norm": 0.09139304143928584, |
|
"learning_rate": 0.00018628539830324229, |
|
"loss": 0.3583, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.7637540453074434, |
|
"grad_norm": 0.08557332691225407, |
|
"learning_rate": 0.00018590029629711506, |
|
"loss": 0.3481, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7702265372168284, |
|
"grad_norm": 0.08670915390405, |
|
"learning_rate": 0.00018551027167293768, |
|
"loss": 0.3486, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.7766990291262136, |
|
"grad_norm": 0.085015484261482, |
|
"learning_rate": 0.00018511534678153244, |
|
"loss": 0.367, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7831715210355987, |
|
"grad_norm": 0.09010500519447748, |
|
"learning_rate": 0.0001847155442545372, |
|
"loss": 0.3673, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.7896440129449838, |
|
"grad_norm": 0.08749250339043911, |
|
"learning_rate": 0.00018431088700310844, |
|
"loss": 0.3581, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.7961165048543689, |
|
"grad_norm": 0.09294869349273092, |
|
"learning_rate": 0.00018390139821660855, |
|
"loss": 0.3502, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.8025889967637541, |
|
"grad_norm": 0.08999457767358941, |
|
"learning_rate": 0.00018348710136127655, |
|
"loss": 0.3604, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.8090614886731392, |
|
"grad_norm": 0.08450140095669934, |
|
"learning_rate": 0.0001830680201788836, |
|
"loss": 0.3585, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8155339805825242, |
|
"grad_norm": 0.0885173201152873, |
|
"learning_rate": 0.00018264417868537244, |
|
"loss": 0.36, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.8220064724919094, |
|
"grad_norm": 0.08249252827967349, |
|
"learning_rate": 0.00018221560116948103, |
|
"loss": 0.3456, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.8284789644012945, |
|
"grad_norm": 0.09259010876584428, |
|
"learning_rate": 0.0001817823121913506, |
|
"loss": 0.3523, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.8349514563106796, |
|
"grad_norm": 0.0829493165024249, |
|
"learning_rate": 0.00018134433658111845, |
|
"loss": 0.3502, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.8414239482200647, |
|
"grad_norm": 0.0870233230162085, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 0.3503, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8478964401294499, |
|
"grad_norm": 0.09013336960665569, |
|
"learning_rate": 0.00018045442612632444, |
|
"loss": 0.355, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.8543689320388349, |
|
"grad_norm": 0.0887494126249745, |
|
"learning_rate": 0.00018000254227913348, |
|
"loss": 0.3492, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.86084142394822, |
|
"grad_norm": 0.08245510607718151, |
|
"learning_rate": 0.00017954607379166, |
|
"loss": 0.3509, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.8673139158576052, |
|
"grad_norm": 0.08652334474586301, |
|
"learning_rate": 0.00017908504682237047, |
|
"loss": 0.3464, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.8737864077669902, |
|
"grad_norm": 0.0832923145834622, |
|
"learning_rate": 0.00017861948779096046, |
|
"loss": 0.3392, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8802588996763754, |
|
"grad_norm": 0.0812465660556935, |
|
"learning_rate": 0.0001781494233768408, |
|
"loss": 0.333, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.8867313915857605, |
|
"grad_norm": 0.088984541060325, |
|
"learning_rate": 0.00017767488051760857, |
|
"loss": 0.3615, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.8932038834951457, |
|
"grad_norm": 0.0867087441619348, |
|
"learning_rate": 0.00017719588640750336, |
|
"loss": 0.3553, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.8996763754045307, |
|
"grad_norm": 0.08382301700407316, |
|
"learning_rate": 0.00017671246849584903, |
|
"loss": 0.3592, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.9061488673139159, |
|
"grad_norm": 0.0865944397225724, |
|
"learning_rate": 0.0001762246544854807, |
|
"loss": 0.3605, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.912621359223301, |
|
"grad_norm": 0.08628029323884819, |
|
"learning_rate": 0.00017573247233115694, |
|
"loss": 0.371, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.919093851132686, |
|
"grad_norm": 0.08863123995414322, |
|
"learning_rate": 0.00017523595023795813, |
|
"loss": 0.3504, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.9255663430420712, |
|
"grad_norm": 0.08627162896846262, |
|
"learning_rate": 0.00017473511665966993, |
|
"loss": 0.3545, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.9320388349514563, |
|
"grad_norm": 0.08839402497347855, |
|
"learning_rate": 0.00017423000029715267, |
|
"loss": 0.3641, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.9385113268608414, |
|
"grad_norm": 0.08651737654873981, |
|
"learning_rate": 0.00017372063009669686, |
|
"loss": 0.3431, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.9449838187702265, |
|
"grad_norm": 0.07938521602972776, |
|
"learning_rate": 0.00017320703524836405, |
|
"loss": 0.3541, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.9514563106796117, |
|
"grad_norm": 0.08679861689760528, |
|
"learning_rate": 0.00017268924518431438, |
|
"loss": 0.3492, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.9579288025889967, |
|
"grad_norm": 0.08562044946225537, |
|
"learning_rate": 0.00017216728957711967, |
|
"loss": 0.3497, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.9644012944983819, |
|
"grad_norm": 0.08302055814489633, |
|
"learning_rate": 0.0001716411983380632, |
|
"loss": 0.3472, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.970873786407767, |
|
"grad_norm": 0.08275430206926206, |
|
"learning_rate": 0.00017111100161542545, |
|
"loss": 0.3383, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9773462783171522, |
|
"grad_norm": 0.08114036438803068, |
|
"learning_rate": 0.00017057672979275656, |
|
"loss": 0.36, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.9838187702265372, |
|
"grad_norm": 0.0875761383748598, |
|
"learning_rate": 0.0001700384134871351, |
|
"loss": 0.3727, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.9902912621359223, |
|
"grad_norm": 0.0826665996105745, |
|
"learning_rate": 0.0001694960835474134, |
|
"loss": 0.3443, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.9967637540453075, |
|
"grad_norm": 0.08226814015705675, |
|
"learning_rate": 0.00016894977105244997, |
|
"loss": 0.3653, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.9967637540453075, |
|
"eval_loss": 0.3545331358909607, |
|
"eval_runtime": 38.077, |
|
"eval_samples_per_second": 27.287, |
|
"eval_steps_per_second": 0.867, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.0032362459546926, |
|
"grad_norm": 0.08335200234700724, |
|
"learning_rate": 0.0001683995073093283, |
|
"loss": 0.3492, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.0097087378640777, |
|
"grad_norm": 0.0865080297629732, |
|
"learning_rate": 0.00016784532385156285, |
|
"loss": 0.3377, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.0161812297734627, |
|
"grad_norm": 0.08441339355225498, |
|
"learning_rate": 0.0001672872524372919, |
|
"loss": 0.34, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.022653721682848, |
|
"grad_norm": 0.0801171782758195, |
|
"learning_rate": 0.00016672532504745778, |
|
"loss": 0.3354, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.029126213592233, |
|
"grad_norm": 0.08282538959029859, |
|
"learning_rate": 0.00016615957388397399, |
|
"loss": 0.3497, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.035598705501618, |
|
"grad_norm": 0.0883355169428613, |
|
"learning_rate": 0.00016559003136787988, |
|
"loss": 0.338, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0420711974110033, |
|
"grad_norm": 0.08616766891600182, |
|
"learning_rate": 0.00016501673013748284, |
|
"loss": 0.3491, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.0485436893203883, |
|
"grad_norm": 0.08188626612719885, |
|
"learning_rate": 0.0001644397030464877, |
|
"loss": 0.3359, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.0550161812297734, |
|
"grad_norm": 0.08630887969447615, |
|
"learning_rate": 0.00016385898316211426, |
|
"loss": 0.3268, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.0614886731391586, |
|
"grad_norm": 0.09109351353175275, |
|
"learning_rate": 0.0001632746037632021, |
|
"loss": 0.3356, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.0679611650485437, |
|
"grad_norm": 0.08843718177405194, |
|
"learning_rate": 0.00016268659833830367, |
|
"loss": 0.3205, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.074433656957929, |
|
"grad_norm": 0.09214475868602304, |
|
"learning_rate": 0.00016209500058376515, |
|
"loss": 0.3433, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.080906148867314, |
|
"grad_norm": 0.08469945389831732, |
|
"learning_rate": 0.00016149984440179537, |
|
"loss": 0.3252, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.087378640776699, |
|
"grad_norm": 0.09167891300803974, |
|
"learning_rate": 0.00016090116389852306, |
|
"loss": 0.3372, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.0938511326860842, |
|
"grad_norm": 0.08439372223738191, |
|
"learning_rate": 0.00016029899338204233, |
|
"loss": 0.3339, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.1003236245954693, |
|
"grad_norm": 0.09671189992628862, |
|
"learning_rate": 0.0001596933673604467, |
|
"loss": 0.331, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.1067961165048543, |
|
"grad_norm": 0.08926808998386844, |
|
"learning_rate": 0.00015908432053985143, |
|
"loss": 0.3438, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.1132686084142396, |
|
"grad_norm": 0.08651511861187883, |
|
"learning_rate": 0.0001584718878224047, |
|
"loss": 0.3199, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.1197411003236246, |
|
"grad_norm": 0.09087672552517187, |
|
"learning_rate": 0.00015785610430428762, |
|
"loss": 0.3312, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.1262135922330097, |
|
"grad_norm": 0.08473819565220535, |
|
"learning_rate": 0.00015723700527370268, |
|
"loss": 0.3265, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.132686084142395, |
|
"grad_norm": 0.08959082008669542, |
|
"learning_rate": 0.00015661462620885199, |
|
"loss": 0.3361, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.13915857605178, |
|
"grad_norm": 0.08820985748174735, |
|
"learning_rate": 0.0001559890027759037, |
|
"loss": 0.3352, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.145631067961165, |
|
"grad_norm": 0.08959538258978733, |
|
"learning_rate": 0.00015536017082694846, |
|
"loss": 0.353, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.1521035598705502, |
|
"grad_norm": 0.08556029404315754, |
|
"learning_rate": 0.0001547281663979446, |
|
"loss": 0.338, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.1585760517799353, |
|
"grad_norm": 0.08651722009131164, |
|
"learning_rate": 0.00015409302570665325, |
|
"loss": 0.3428, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.1650485436893203, |
|
"grad_norm": 0.09127001503693886, |
|
"learning_rate": 0.00015345478515056267, |
|
"loss": 0.3196, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.1715210355987056, |
|
"grad_norm": 0.08856796253981307, |
|
"learning_rate": 0.00015281348130480272, |
|
"loss": 0.3309, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.1779935275080906, |
|
"grad_norm": 0.08742855227811815, |
|
"learning_rate": 0.00015216915092004847, |
|
"loss": 0.3273, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.1844660194174756, |
|
"grad_norm": 0.09076657129408205, |
|
"learning_rate": 0.0001515218309204145, |
|
"loss": 0.3324, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.190938511326861, |
|
"grad_norm": 0.09296734178135176, |
|
"learning_rate": 0.00015087155840133888, |
|
"loss": 0.3443, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.197411003236246, |
|
"grad_norm": 0.08431288630485151, |
|
"learning_rate": 0.00015021837062745714, |
|
"loss": 0.3383, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.203883495145631, |
|
"grad_norm": 0.08374691953765048, |
|
"learning_rate": 0.00014956230503046703, |
|
"loss": 0.3368, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.2103559870550162, |
|
"grad_norm": 0.08899017726880568, |
|
"learning_rate": 0.00014890339920698334, |
|
"loss": 0.3368, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.2168284789644013, |
|
"grad_norm": 0.08562973948918146, |
|
"learning_rate": 0.00014824169091638337, |
|
"loss": 0.3397, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.2233009708737863, |
|
"grad_norm": 0.08827756751106554, |
|
"learning_rate": 0.00014757721807864317, |
|
"loss": 0.3405, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.2297734627831716, |
|
"grad_norm": 0.08736187981084419, |
|
"learning_rate": 0.0001469100187721644, |
|
"loss": 0.3262, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.2362459546925566, |
|
"grad_norm": 0.09050203121679472, |
|
"learning_rate": 0.0001462401312315922, |
|
"loss": 0.3345, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.2427184466019416, |
|
"grad_norm": 0.08534856926861663, |
|
"learning_rate": 0.00014556759384562416, |
|
"loss": 0.3277, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.249190938511327, |
|
"grad_norm": 0.08611809442550349, |
|
"learning_rate": 0.00014489244515481046, |
|
"loss": 0.3253, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.255663430420712, |
|
"grad_norm": 0.08688819258876039, |
|
"learning_rate": 0.0001442147238493451, |
|
"loss": 0.3349, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.262135922330097, |
|
"grad_norm": 0.08416654255653569, |
|
"learning_rate": 0.00014353446876684892, |
|
"loss": 0.3308, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.2686084142394822, |
|
"grad_norm": 0.09183514403063862, |
|
"learning_rate": 0.0001428517188901437, |
|
"loss": 0.3383, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.2750809061488673, |
|
"grad_norm": 0.08749863979244098, |
|
"learning_rate": 0.0001421665133450184, |
|
"loss": 0.3294, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.2815533980582523, |
|
"grad_norm": 0.08786155414487208, |
|
"learning_rate": 0.00014147889139798708, |
|
"loss": 0.332, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.2880258899676376, |
|
"grad_norm": 0.08754391459720788, |
|
"learning_rate": 0.00014078889245403844, |
|
"loss": 0.3268, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.2944983818770226, |
|
"grad_norm": 0.09026975504709552, |
|
"learning_rate": 0.0001400965560543778, |
|
"loss": 0.3344, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.3009708737864076, |
|
"grad_norm": 0.08953266507881985, |
|
"learning_rate": 0.0001394019218741612, |
|
"loss": 0.3355, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.307443365695793, |
|
"grad_norm": 0.08868590943641125, |
|
"learning_rate": 0.00013870502972022173, |
|
"loss": 0.3282, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.313915857605178, |
|
"grad_norm": 0.09182148747745249, |
|
"learning_rate": 0.00013800591952878825, |
|
"loss": 0.3394, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.3203883495145632, |
|
"grad_norm": 0.08835010573043636, |
|
"learning_rate": 0.00013730463136319692, |
|
"loss": 0.3316, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.3268608414239482, |
|
"grad_norm": 0.08573944970518392, |
|
"learning_rate": 0.00013660120541159537, |
|
"loss": 0.337, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.08411384694221385, |
|
"learning_rate": 0.00013589568198463944, |
|
"loss": 0.3356, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.3398058252427185, |
|
"grad_norm": 0.08762468166968049, |
|
"learning_rate": 0.0001351881015131833, |
|
"loss": 0.3388, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.3462783171521036, |
|
"grad_norm": 0.08837885567018512, |
|
"learning_rate": 0.00013447850454596265, |
|
"loss": 0.3354, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.3527508090614886, |
|
"grad_norm": 0.08823334999060412, |
|
"learning_rate": 0.00013376693174727065, |
|
"loss": 0.3341, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.3592233009708738, |
|
"grad_norm": 0.08665353483827595, |
|
"learning_rate": 0.00013305342389462792, |
|
"loss": 0.3282, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.3656957928802589, |
|
"grad_norm": 0.08780215390647159, |
|
"learning_rate": 0.00013233802187644566, |
|
"loss": 0.3245, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.3721682847896441, |
|
"grad_norm": 0.0880005148626301, |
|
"learning_rate": 0.0001316207666896824, |
|
"loss": 0.3264, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.3786407766990292, |
|
"grad_norm": 0.09350061568656749, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 0.33, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.3851132686084142, |
|
"grad_norm": 0.0869291546207145, |
|
"learning_rate": 0.00013018086132688184, |
|
"loss": 0.3219, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.3915857605177995, |
|
"grad_norm": 0.0926767409167509, |
|
"learning_rate": 0.0001294582936663239, |
|
"loss": 0.3294, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.3980582524271845, |
|
"grad_norm": 0.08850918948495869, |
|
"learning_rate": 0.00012873403786341513, |
|
"loss": 0.3362, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.4045307443365695, |
|
"grad_norm": 0.08797553458327365, |
|
"learning_rate": 0.00012800813542249072, |
|
"loss": 0.3293, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.4110032362459548, |
|
"grad_norm": 0.08816787591032364, |
|
"learning_rate": 0.00012728062794224832, |
|
"loss": 0.3297, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.4174757281553398, |
|
"grad_norm": 0.08844464048062073, |
|
"learning_rate": 0.0001265515571133643, |
|
"loss": 0.3405, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.4239482200647249, |
|
"grad_norm": 0.08777743550690727, |
|
"learning_rate": 0.00012582096471610467, |
|
"loss": 0.33, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.4304207119741101, |
|
"grad_norm": 0.0841701096943823, |
|
"learning_rate": 0.00012508889261793059, |
|
"loss": 0.3243, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.4368932038834952, |
|
"grad_norm": 0.08211677315785172, |
|
"learning_rate": 0.0001243553827710992, |
|
"loss": 0.3162, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.4433656957928802, |
|
"grad_norm": 0.09229614871738043, |
|
"learning_rate": 0.00012362047721025968, |
|
"loss": 0.3214, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.4498381877022655, |
|
"grad_norm": 0.08827293639891444, |
|
"learning_rate": 0.00012288421805004414, |
|
"loss": 0.3388, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.4563106796116505, |
|
"grad_norm": 0.08650939834799051, |
|
"learning_rate": 0.0001221466474826543, |
|
"loss": 0.3336, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.4627831715210355, |
|
"grad_norm": 0.08745414194563839, |
|
"learning_rate": 0.00012140780777544367, |
|
"loss": 0.3249, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.4692556634304208, |
|
"grad_norm": 0.08966472430098649, |
|
"learning_rate": 0.00012066774126849529, |
|
"loss": 0.3358, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.4757281553398058, |
|
"grad_norm": 0.08547479634425736, |
|
"learning_rate": 0.00011992649037219545, |
|
"loss": 0.3415, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.4822006472491909, |
|
"grad_norm": 0.08630753448523301, |
|
"learning_rate": 0.0001191840975648032, |
|
"loss": 0.3369, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.4886731391585761, |
|
"grad_norm": 0.08171192229188709, |
|
"learning_rate": 0.00011844060539001618, |
|
"loss": 0.3207, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4951456310679612, |
|
"grad_norm": 0.08413278558608812, |
|
"learning_rate": 0.00011769605645453265, |
|
"loss": 0.3219, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.5016181229773462, |
|
"grad_norm": 0.08498283806676993, |
|
"learning_rate": 0.00011695049342560968, |
|
"loss": 0.3339, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.5080906148867315, |
|
"grad_norm": 0.08365977359578973, |
|
"learning_rate": 0.00011620395902861822, |
|
"loss": 0.3297, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.5145631067961165, |
|
"grad_norm": 0.0858081054599164, |
|
"learning_rate": 0.00011545649604459466, |
|
"loss": 0.3305, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.5210355987055015, |
|
"grad_norm": 0.08916196136213667, |
|
"learning_rate": 0.00011470814730778905, |
|
"loss": 0.3352, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.5275080906148868, |
|
"grad_norm": 0.08650692110960809, |
|
"learning_rate": 0.00011395895570321064, |
|
"loss": 0.3347, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.5339805825242718, |
|
"grad_norm": 0.08839636357345756, |
|
"learning_rate": 0.00011320896416417026, |
|
"loss": 0.3284, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.5404530744336569, |
|
"grad_norm": 0.08477612347310413, |
|
"learning_rate": 0.00011245821566981976, |
|
"loss": 0.3331, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.5469255663430421, |
|
"grad_norm": 0.08158083954977735, |
|
"learning_rate": 0.00011170675324268942, |
|
"loss": 0.3284, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.5533980582524272, |
|
"grad_norm": 0.08210674030802527, |
|
"learning_rate": 0.00011095461994622209, |
|
"loss": 0.3152, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.5598705501618122, |
|
"grad_norm": 0.08164700760175722, |
|
"learning_rate": 0.00011020185888230571, |
|
"loss": 0.3212, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.5663430420711975, |
|
"grad_norm": 0.08252073365071766, |
|
"learning_rate": 0.00010944851318880314, |
|
"loss": 0.3349, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.5728155339805825, |
|
"grad_norm": 0.0847504452125806, |
|
"learning_rate": 0.00010869462603708011, |
|
"loss": 0.3355, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.5792880258899675, |
|
"grad_norm": 0.08651133299846145, |
|
"learning_rate": 0.00010794024062953123, |
|
"loss": 0.3295, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.5857605177993528, |
|
"grad_norm": 0.08848888104315868, |
|
"learning_rate": 0.00010718540019710432, |
|
"loss": 0.334, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.5922330097087378, |
|
"grad_norm": 0.0838746079221116, |
|
"learning_rate": 0.00010643014799682296, |
|
"loss": 0.3236, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.5987055016181229, |
|
"grad_norm": 0.0834887968620315, |
|
"learning_rate": 0.00010567452730930743, |
|
"loss": 0.3231, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.6051779935275081, |
|
"grad_norm": 0.08903656754039865, |
|
"learning_rate": 0.00010491858143629469, |
|
"loss": 0.3303, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.6116504854368932, |
|
"grad_norm": 0.08833888604572854, |
|
"learning_rate": 0.00010416235369815693, |
|
"loss": 0.3417, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.6181229773462782, |
|
"grad_norm": 0.08946556956131413, |
|
"learning_rate": 0.00010340588743141879, |
|
"loss": 0.3341, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.6245954692556634, |
|
"grad_norm": 0.09195837741425467, |
|
"learning_rate": 0.00010264922598627418, |
|
"loss": 0.3292, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.6310679611650487, |
|
"grad_norm": 0.08669211913864286, |
|
"learning_rate": 0.0001018924127241019, |
|
"loss": 0.329, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.6375404530744335, |
|
"grad_norm": 0.08493981781947878, |
|
"learning_rate": 0.00010113549101498086, |
|
"loss": 0.317, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.6440129449838188, |
|
"grad_norm": 0.08859940503358235, |
|
"learning_rate": 0.00010037850423520454, |
|
"loss": 0.3239, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.650485436893204, |
|
"grad_norm": 0.08691997093628756, |
|
"learning_rate": 9.962149576479545e-05, |
|
"loss": 0.3276, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.6569579288025889, |
|
"grad_norm": 0.08182220008549855, |
|
"learning_rate": 9.886450898501917e-05, |
|
"loss": 0.322, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.6634304207119741, |
|
"grad_norm": 0.08983867569693547, |
|
"learning_rate": 9.810758727589813e-05, |
|
"loss": 0.3289, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.6699029126213594, |
|
"grad_norm": 0.08440426328661797, |
|
"learning_rate": 9.735077401372583e-05, |
|
"loss": 0.335, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.6763754045307442, |
|
"grad_norm": 0.09081741368893546, |
|
"learning_rate": 9.659411256858122e-05, |
|
"loss": 0.3285, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.6828478964401294, |
|
"grad_norm": 0.08638692257401422, |
|
"learning_rate": 9.583764630184311e-05, |
|
"loss": 0.3283, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.6893203883495147, |
|
"grad_norm": 0.08661785904833329, |
|
"learning_rate": 9.508141856370532e-05, |
|
"loss": 0.3315, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.6957928802588995, |
|
"grad_norm": 0.08867942067172753, |
|
"learning_rate": 9.432547269069261e-05, |
|
"loss": 0.3298, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.7022653721682848, |
|
"grad_norm": 0.08121634636493791, |
|
"learning_rate": 9.356985200317709e-05, |
|
"loss": 0.3264, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.70873786407767, |
|
"grad_norm": 0.0857656466474692, |
|
"learning_rate": 9.281459980289567e-05, |
|
"loss": 0.3265, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.715210355987055, |
|
"grad_norm": 0.08325063625580009, |
|
"learning_rate": 9.205975937046879e-05, |
|
"loss": 0.3248, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.72168284789644, |
|
"grad_norm": 0.09125541887572731, |
|
"learning_rate": 9.130537396291994e-05, |
|
"loss": 0.3336, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.7281553398058254, |
|
"grad_norm": 0.0858478153556312, |
|
"learning_rate": 9.055148681119688e-05, |
|
"loss": 0.3335, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.7346278317152104, |
|
"grad_norm": 0.08412291629199992, |
|
"learning_rate": 8.979814111769431e-05, |
|
"loss": 0.3247, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.7411003236245954, |
|
"grad_norm": 0.08342610941802514, |
|
"learning_rate": 8.904538005377794e-05, |
|
"loss": 0.3271, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.7475728155339807, |
|
"grad_norm": 0.08566344893624273, |
|
"learning_rate": 8.829324675731059e-05, |
|
"loss": 0.3342, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.7540453074433657, |
|
"grad_norm": 0.08349328064040033, |
|
"learning_rate": 8.754178433018025e-05, |
|
"loss": 0.3169, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.7605177993527508, |
|
"grad_norm": 0.08290650287411376, |
|
"learning_rate": 8.679103583582979e-05, |
|
"loss": 0.32, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.766990291262136, |
|
"grad_norm": 0.08355664601674428, |
|
"learning_rate": 8.604104429678935e-05, |
|
"loss": 0.3284, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.773462783171521, |
|
"grad_norm": 0.08088172666023426, |
|
"learning_rate": 8.529185269221097e-05, |
|
"loss": 0.3189, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.779935275080906, |
|
"grad_norm": 0.08494057369308265, |
|
"learning_rate": 8.45435039554054e-05, |
|
"loss": 0.3175, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.7864077669902914, |
|
"grad_norm": 0.08435898747241219, |
|
"learning_rate": 8.379604097138179e-05, |
|
"loss": 0.3359, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.7928802588996764, |
|
"grad_norm": 0.08369793345019513, |
|
"learning_rate": 8.304950657439033e-05, |
|
"loss": 0.3277, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.7993527508090614, |
|
"grad_norm": 0.08234674894169963, |
|
"learning_rate": 8.230394354546737e-05, |
|
"loss": 0.3106, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.8058252427184467, |
|
"grad_norm": 0.08480659421208028, |
|
"learning_rate": 8.15593946099838e-05, |
|
"loss": 0.3219, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.8122977346278317, |
|
"grad_norm": 0.087986920120639, |
|
"learning_rate": 8.08159024351968e-05, |
|
"loss": 0.3442, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.8187702265372168, |
|
"grad_norm": 0.0839196545152189, |
|
"learning_rate": 8.007350962780456e-05, |
|
"loss": 0.3216, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.825242718446602, |
|
"grad_norm": 0.08396185408224062, |
|
"learning_rate": 7.93322587315047e-05, |
|
"loss": 0.3249, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.831715210355987, |
|
"grad_norm": 0.08739956374343912, |
|
"learning_rate": 7.859219222455634e-05, |
|
"loss": 0.3366, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.838187702265372, |
|
"grad_norm": 0.08539428449034953, |
|
"learning_rate": 7.785335251734573e-05, |
|
"loss": 0.3399, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.8446601941747574, |
|
"grad_norm": 0.08443259129123379, |
|
"learning_rate": 7.711578194995589e-05, |
|
"loss": 0.326, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.8511326860841424, |
|
"grad_norm": 0.08226727888900415, |
|
"learning_rate": 7.637952278974034e-05, |
|
"loss": 0.3122, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.8576051779935274, |
|
"grad_norm": 0.08664703990759018, |
|
"learning_rate": 7.564461722890081e-05, |
|
"loss": 0.3276, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.8640776699029127, |
|
"grad_norm": 0.08284483357408984, |
|
"learning_rate": 7.491110738206942e-05, |
|
"loss": 0.3331, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.8705501618122977, |
|
"grad_norm": 0.08547813142706377, |
|
"learning_rate": 7.417903528389534e-05, |
|
"loss": 0.3108, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.8770226537216828, |
|
"grad_norm": 0.08375066019242061, |
|
"learning_rate": 7.344844288663571e-05, |
|
"loss": 0.3356, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.883495145631068, |
|
"grad_norm": 0.08564167180521602, |
|
"learning_rate": 7.27193720577517e-05, |
|
"loss": 0.3264, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.889967637540453, |
|
"grad_norm": 0.08560457179852256, |
|
"learning_rate": 7.19918645775093e-05, |
|
"loss": 0.3274, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.896440129449838, |
|
"grad_norm": 0.08594335919666654, |
|
"learning_rate": 7.126596213658488e-05, |
|
"loss": 0.3268, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.9029126213592233, |
|
"grad_norm": 0.086743773673858, |
|
"learning_rate": 7.05417063336761e-05, |
|
"loss": 0.3291, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.9093851132686084, |
|
"grad_norm": 0.08512277813785797, |
|
"learning_rate": 6.981913867311819e-05, |
|
"loss": 0.3199, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.9158576051779934, |
|
"grad_norm": 0.08482302988433711, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 0.3308, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.9223300970873787, |
|
"grad_norm": 0.08336748744395403, |
|
"learning_rate": 6.83792333103176e-05, |
|
"loss": 0.3303, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.9288025889967637, |
|
"grad_norm": 0.08685899308663286, |
|
"learning_rate": 6.766197812355438e-05, |
|
"loss": 0.3156, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.9352750809061487, |
|
"grad_norm": 0.08670025171649215, |
|
"learning_rate": 6.69465761053721e-05, |
|
"loss": 0.3263, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.941747572815534, |
|
"grad_norm": 0.08367058440741144, |
|
"learning_rate": 6.623306825272937e-05, |
|
"loss": 0.3184, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.948220064724919, |
|
"grad_norm": 0.08265849338783704, |
|
"learning_rate": 6.552149545403739e-05, |
|
"loss": 0.3141, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.954692556634304, |
|
"grad_norm": 0.08667064648503052, |
|
"learning_rate": 6.48118984868167e-05, |
|
"loss": 0.3176, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.9611650485436893, |
|
"grad_norm": 0.08624477625370294, |
|
"learning_rate": 6.410431801536058e-05, |
|
"loss": 0.3245, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.9676375404530746, |
|
"grad_norm": 0.0863914502801744, |
|
"learning_rate": 6.339879458840465e-05, |
|
"loss": 0.3345, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.9741100323624594, |
|
"grad_norm": 0.0841627898701783, |
|
"learning_rate": 6.269536863680307e-05, |
|
"loss": 0.3194, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.9805825242718447, |
|
"grad_norm": 0.0854832505200848, |
|
"learning_rate": 6.199408047121174e-05, |
|
"loss": 0.3396, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.98705501618123, |
|
"grad_norm": 0.07970724191486213, |
|
"learning_rate": 6.129497027977829e-05, |
|
"loss": 0.3228, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.9935275080906147, |
|
"grad_norm": 0.08216914347596244, |
|
"learning_rate": 6.059807812583883e-05, |
|
"loss": 0.3119, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.08150138639289872, |
|
"learning_rate": 5.990344394562226e-05, |
|
"loss": 0.3166, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.3374881148338318, |
|
"eval_runtime": 35.4418, |
|
"eval_samples_per_second": 29.316, |
|
"eval_steps_per_second": 0.931, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.0064724919093853, |
|
"grad_norm": 0.08131373994961688, |
|
"learning_rate": 5.92111075459616e-05, |
|
"loss": 0.3175, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.01294498381877, |
|
"grad_norm": 0.08591014053929732, |
|
"learning_rate": 5.852110860201294e-05, |
|
"loss": 0.3109, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.0194174757281553, |
|
"grad_norm": 0.08486358862414822, |
|
"learning_rate": 5.7833486654981606e-05, |
|
"loss": 0.3056, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.0258899676375406, |
|
"grad_norm": 0.08216589197196723, |
|
"learning_rate": 5.714828110985635e-05, |
|
"loss": 0.308, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.0323624595469254, |
|
"grad_norm": 0.08472956112082734, |
|
"learning_rate": 5.6465531233151126e-05, |
|
"loss": 0.3063, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.0388349514563107, |
|
"grad_norm": 0.08715839296976118, |
|
"learning_rate": 5.578527615065492e-05, |
|
"loss": 0.3146, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.045307443365696, |
|
"grad_norm": 0.08426241524579575, |
|
"learning_rate": 5.510755484518955e-05, |
|
"loss": 0.305, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.0517799352750807, |
|
"grad_norm": 0.08998957963984325, |
|
"learning_rate": 5.443240615437586e-05, |
|
"loss": 0.3082, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.058252427184466, |
|
"grad_norm": 0.09157974577803293, |
|
"learning_rate": 5.375986876840784e-05, |
|
"loss": 0.3187, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.0647249190938513, |
|
"grad_norm": 0.09110362940023115, |
|
"learning_rate": 5.30899812278356e-05, |
|
"loss": 0.3089, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.071197411003236, |
|
"grad_norm": 0.08924632615443999, |
|
"learning_rate": 5.2422781921356826e-05, |
|
"loss": 0.3108, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.0776699029126213, |
|
"grad_norm": 0.08653114959635026, |
|
"learning_rate": 5.1758309083616673e-05, |
|
"loss": 0.3033, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.0841423948220066, |
|
"grad_norm": 0.0897639664776455, |
|
"learning_rate": 5.109660079301668e-05, |
|
"loss": 0.3044, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.0906148867313914, |
|
"grad_norm": 0.08654171610929062, |
|
"learning_rate": 5.043769496953299e-05, |
|
"loss": 0.3033, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.0970873786407767, |
|
"grad_norm": 0.0883439805695338, |
|
"learning_rate": 4.9781629372542895e-05, |
|
"loss": 0.3081, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.103559870550162, |
|
"grad_norm": 0.08874070518690325, |
|
"learning_rate": 4.912844159866112e-05, |
|
"loss": 0.3029, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.1100323624595467, |
|
"grad_norm": 0.08970836183938685, |
|
"learning_rate": 4.847816907958549e-05, |
|
"loss": 0.3051, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.116504854368932, |
|
"grad_norm": 0.08702359256692323, |
|
"learning_rate": 4.783084907995156e-05, |
|
"loss": 0.3083, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.1229773462783172, |
|
"grad_norm": 0.08620252985570646, |
|
"learning_rate": 4.718651869519731e-05, |
|
"loss": 0.3009, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.129449838187702, |
|
"grad_norm": 0.09026535273366211, |
|
"learning_rate": 4.654521484943735e-05, |
|
"loss": 0.3095, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.1359223300970873, |
|
"grad_norm": 0.08797218673185163, |
|
"learning_rate": 4.59069742933468e-05, |
|
"loss": 0.3037, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.1423948220064726, |
|
"grad_norm": 0.08992206831884317, |
|
"learning_rate": 4.527183360205541e-05, |
|
"loss": 0.3058, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.148867313915858, |
|
"grad_norm": 0.09019684013585973, |
|
"learning_rate": 4.4639829173051554e-05, |
|
"loss": 0.3091, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.1553398058252426, |
|
"grad_norm": 0.08846989758042821, |
|
"learning_rate": 4.401099722409631e-05, |
|
"loss": 0.2968, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.161812297734628, |
|
"grad_norm": 0.09000600047149172, |
|
"learning_rate": 4.338537379114801e-05, |
|
"loss": 0.322, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.168284789644013, |
|
"grad_norm": 0.09084909315613375, |
|
"learning_rate": 4.2762994726297346e-05, |
|
"loss": 0.3104, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.174757281553398, |
|
"grad_norm": 0.08897581163092526, |
|
"learning_rate": 4.2143895695712444e-05, |
|
"loss": 0.3029, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.1812297734627832, |
|
"grad_norm": 0.08806995729841001, |
|
"learning_rate": 4.152811217759529e-05, |
|
"loss": 0.3127, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.1877022653721685, |
|
"grad_norm": 0.0909392143685305, |
|
"learning_rate": 4.091567946014858e-05, |
|
"loss": 0.3077, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.1941747572815533, |
|
"grad_norm": 0.08526830124486821, |
|
"learning_rate": 4.0306632639553323e-05, |
|
"loss": 0.3092, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.2006472491909386, |
|
"grad_norm": 0.0889661858323667, |
|
"learning_rate": 3.970100661795766e-05, |
|
"loss": 0.306, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.207119741100324, |
|
"grad_norm": 0.08742864887103546, |
|
"learning_rate": 3.909883610147696e-05, |
|
"loss": 0.3021, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.2135922330097086, |
|
"grad_norm": 0.08736016824506748, |
|
"learning_rate": 3.8500155598204644e-05, |
|
"loss": 0.2974, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.220064724919094, |
|
"grad_norm": 0.0895276470969271, |
|
"learning_rate": 3.7904999416234864e-05, |
|
"loss": 0.3121, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.226537216828479, |
|
"grad_norm": 0.08707410830498415, |
|
"learning_rate": 3.731340166169635e-05, |
|
"loss": 0.304, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.233009708737864, |
|
"grad_norm": 0.09084514125957084, |
|
"learning_rate": 3.6725396236797935e-05, |
|
"loss": 0.3139, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.2394822006472492, |
|
"grad_norm": 0.08595616841814938, |
|
"learning_rate": 3.614101683788575e-05, |
|
"loss": 0.3034, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.2459546925566345, |
|
"grad_norm": 0.08918897946715092, |
|
"learning_rate": 3.5560296953512295e-05, |
|
"loss": 0.2871, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.2524271844660193, |
|
"grad_norm": 0.08951525733240658, |
|
"learning_rate": 3.498326986251717e-05, |
|
"loss": 0.3064, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.2588996763754046, |
|
"grad_norm": 0.08959019506927517, |
|
"learning_rate": 3.4409968632120126e-05, |
|
"loss": 0.3062, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.26537216828479, |
|
"grad_norm": 0.09072525944669337, |
|
"learning_rate": 3.3840426116026044e-05, |
|
"loss": 0.3113, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.2718446601941746, |
|
"grad_norm": 0.09370535186596111, |
|
"learning_rate": 3.327467495254225e-05, |
|
"loss": 0.3195, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.27831715210356, |
|
"grad_norm": 0.08681410960013289, |
|
"learning_rate": 3.2712747562708115e-05, |
|
"loss": 0.3039, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.284789644012945, |
|
"grad_norm": 0.0884544764695178, |
|
"learning_rate": 3.215467614843719e-05, |
|
"loss": 0.3028, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.29126213592233, |
|
"grad_norm": 0.08974826283064652, |
|
"learning_rate": 3.160049269067174e-05, |
|
"loss": 0.3073, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.2977346278317152, |
|
"grad_norm": 0.08791055069767624, |
|
"learning_rate": 3.105022894755003e-05, |
|
"loss": 0.3011, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.3042071197411005, |
|
"grad_norm": 0.08951734113600135, |
|
"learning_rate": 3.0503916452586612e-05, |
|
"loss": 0.3056, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.3106796116504853, |
|
"grad_norm": 0.08762327614071365, |
|
"learning_rate": 2.9961586512864947e-05, |
|
"loss": 0.3138, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.3171521035598706, |
|
"grad_norm": 0.09032122428603973, |
|
"learning_rate": 2.9423270207243437e-05, |
|
"loss": 0.2978, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.323624595469256, |
|
"grad_norm": 0.0917821868399838, |
|
"learning_rate": 2.888899838457455e-05, |
|
"loss": 0.3084, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.3300970873786406, |
|
"grad_norm": 0.09193242261010433, |
|
"learning_rate": 2.835880166193683e-05, |
|
"loss": 0.3064, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.336569579288026, |
|
"grad_norm": 0.08883763145021951, |
|
"learning_rate": 2.7832710422880328e-05, |
|
"loss": 0.2993, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.343042071197411, |
|
"grad_norm": 0.08991962234888566, |
|
"learning_rate": 2.7310754815685624e-05, |
|
"loss": 0.3034, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.349514563106796, |
|
"grad_norm": 0.08994982579652418, |
|
"learning_rate": 2.679296475163595e-05, |
|
"loss": 0.3072, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.355987055016181, |
|
"grad_norm": 0.08970808859923911, |
|
"learning_rate": 2.6279369903303175e-05, |
|
"loss": 0.3023, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.3624595469255665, |
|
"grad_norm": 0.08588546055096836, |
|
"learning_rate": 2.5769999702847346e-05, |
|
"loss": 0.296, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.3689320388349513, |
|
"grad_norm": 0.08990846870006015, |
|
"learning_rate": 2.5264883340330113e-05, |
|
"loss": 0.3022, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.3754045307443366, |
|
"grad_norm": 0.09151580193749502, |
|
"learning_rate": 2.4764049762041874e-05, |
|
"loss": 0.2978, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.381877022653722, |
|
"grad_norm": 0.08959471455140089, |
|
"learning_rate": 2.426752766884306e-05, |
|
"loss": 0.3099, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.3883495145631066, |
|
"grad_norm": 0.0887364245299765, |
|
"learning_rate": 2.377534551451932e-05, |
|
"loss": 0.3016, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.394822006472492, |
|
"grad_norm": 0.08882940905923181, |
|
"learning_rate": 2.328753150415094e-05, |
|
"loss": 0.3004, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.401294498381877, |
|
"grad_norm": 0.08843646634108779, |
|
"learning_rate": 2.280411359249668e-05, |
|
"loss": 0.2933, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.407766990291262, |
|
"grad_norm": 0.09301769304474337, |
|
"learning_rate": 2.2325119482391467e-05, |
|
"loss": 0.3134, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.414239482200647, |
|
"grad_norm": 0.08707266390112682, |
|
"learning_rate": 2.185057662315918e-05, |
|
"loss": 0.3003, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.4207119741100325, |
|
"grad_norm": 0.0868807473282313, |
|
"learning_rate": 2.1380512209039528e-05, |
|
"loss": 0.302, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.4271844660194173, |
|
"grad_norm": 0.09363452018708772, |
|
"learning_rate": 2.0914953177629548e-05, |
|
"loss": 0.3148, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.4336569579288025, |
|
"grad_norm": 0.08938831583372829, |
|
"learning_rate": 2.0453926208340003e-05, |
|
"loss": 0.3055, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.440129449838188, |
|
"grad_norm": 0.09230125425366141, |
|
"learning_rate": 1.999745772086655e-05, |
|
"loss": 0.3041, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.4466019417475726, |
|
"grad_norm": 0.08932842130151852, |
|
"learning_rate": 1.954557387367557e-05, |
|
"loss": 0.3037, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.453074433656958, |
|
"grad_norm": 0.08923122998718774, |
|
"learning_rate": 1.9098300562505266e-05, |
|
"loss": 0.3055, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.459546925566343, |
|
"grad_norm": 0.0917592218140953, |
|
"learning_rate": 1.8655663418881584e-05, |
|
"loss": 0.3006, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.466019417475728, |
|
"grad_norm": 0.08856315870273591, |
|
"learning_rate": 1.821768780864943e-05, |
|
"loss": 0.3072, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.472491909385113, |
|
"grad_norm": 0.08867770851651259, |
|
"learning_rate": 1.7784398830519e-05, |
|
"loss": 0.298, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.4789644012944985, |
|
"grad_norm": 0.08895980547301872, |
|
"learning_rate": 1.7355821314627564e-05, |
|
"loss": 0.3087, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.4854368932038833, |
|
"grad_norm": 0.09119431484259279, |
|
"learning_rate": 1.6931979821116418e-05, |
|
"loss": 0.305, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.4919093851132685, |
|
"grad_norm": 0.09318125516492613, |
|
"learning_rate": 1.6512898638723497e-05, |
|
"loss": 0.3095, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.498381877022654, |
|
"grad_norm": 0.08770991666106868, |
|
"learning_rate": 1.6098601783391487e-05, |
|
"loss": 0.292, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.5048543689320386, |
|
"grad_norm": 0.08730384586077941, |
|
"learning_rate": 1.5689112996891576e-05, |
|
"loss": 0.2982, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.511326860841424, |
|
"grad_norm": 0.09025336858704619, |
|
"learning_rate": 1.5284455745462834e-05, |
|
"loss": 0.3175, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.517799352750809, |
|
"grad_norm": 0.09170439944127616, |
|
"learning_rate": 1.4884653218467571e-05, |
|
"loss": 0.3131, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.524271844660194, |
|
"grad_norm": 0.08975472589683904, |
|
"learning_rate": 1.4489728327062324e-05, |
|
"loss": 0.3077, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.530744336569579, |
|
"grad_norm": 0.09151247517360593, |
|
"learning_rate": 1.4099703702884936e-05, |
|
"loss": 0.2987, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.5372168284789645, |
|
"grad_norm": 0.08707773026948659, |
|
"learning_rate": 1.3714601696757712e-05, |
|
"loss": 0.301, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.5436893203883493, |
|
"grad_norm": 0.09236291075818845, |
|
"learning_rate": 1.3334444377406452e-05, |
|
"loss": 0.3095, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.5501618122977345, |
|
"grad_norm": 0.09003080825117314, |
|
"learning_rate": 1.2959253530195836e-05, |
|
"loss": 0.301, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.55663430420712, |
|
"grad_norm": 0.08899294818724815, |
|
"learning_rate": 1.258905065588103e-05, |
|
"loss": 0.2996, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.5631067961165046, |
|
"grad_norm": 0.08642480774869472, |
|
"learning_rate": 1.2223856969375447e-05, |
|
"loss": 0.2972, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.56957928802589, |
|
"grad_norm": 0.09069465990264014, |
|
"learning_rate": 1.1863693398535114e-05, |
|
"loss": 0.3028, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.576051779935275, |
|
"grad_norm": 0.09007456341294559, |
|
"learning_rate": 1.1508580582959349e-05, |
|
"loss": 0.3026, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.58252427184466, |
|
"grad_norm": 0.0874176261621062, |
|
"learning_rate": 1.1158538872807933e-05, |
|
"loss": 0.302, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.588996763754045, |
|
"grad_norm": 0.08762089070699175, |
|
"learning_rate": 1.0813588327634961e-05, |
|
"loss": 0.2845, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.5954692556634305, |
|
"grad_norm": 0.08875402737182131, |
|
"learning_rate": 1.0473748715239307e-05, |
|
"loss": 0.3016, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 2.6019417475728153, |
|
"grad_norm": 0.09129343356719948, |
|
"learning_rate": 1.01390395105318e-05, |
|
"loss": 0.3154, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 2.6084142394822005, |
|
"grad_norm": 0.08810297072389008, |
|
"learning_rate": 9.809479894419149e-06, |
|
"loss": 0.3033, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 2.614886731391586, |
|
"grad_norm": 0.09018340343849121, |
|
"learning_rate": 9.485088752704885e-06, |
|
"loss": 0.3088, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.6213592233009706, |
|
"grad_norm": 0.08968883260752107, |
|
"learning_rate": 9.16588467500693e-06, |
|
"loss": 0.2982, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.627831715210356, |
|
"grad_norm": 0.08957666144874589, |
|
"learning_rate": 8.851885953692374e-06, |
|
"loss": 0.3082, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.634304207119741, |
|
"grad_norm": 0.09198641057091544, |
|
"learning_rate": 8.543110582829272e-06, |
|
"loss": 0.3015, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.6407766990291264, |
|
"grad_norm": 0.08915392372194866, |
|
"learning_rate": 8.239576257155334e-06, |
|
"loss": 0.3068, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.647249190938511, |
|
"grad_norm": 0.09135989217201736, |
|
"learning_rate": 7.941300371063953e-06, |
|
"loss": 0.3118, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.6537216828478964, |
|
"grad_norm": 0.09150707922550998, |
|
"learning_rate": 7.648300017607534e-06, |
|
"loss": 0.3049, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.6601941747572817, |
|
"grad_norm": 0.08679197622255307, |
|
"learning_rate": 7.360591987517762e-06, |
|
"loss": 0.2973, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.09043247718192007, |
|
"learning_rate": 7.078192768243486e-06, |
|
"loss": 0.3016, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.6731391585760518, |
|
"grad_norm": 0.09227814811133576, |
|
"learning_rate": 6.80111854300588e-06, |
|
"loss": 0.3109, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.679611650485437, |
|
"grad_norm": 0.08752395500418549, |
|
"learning_rate": 6.5293851898710625e-06, |
|
"loss": 0.301, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.686084142394822, |
|
"grad_norm": 0.08959180387680037, |
|
"learning_rate": 6.2630082808401326e-06, |
|
"loss": 0.2992, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.692556634304207, |
|
"grad_norm": 0.08743171313620991, |
|
"learning_rate": 6.00200308095682e-06, |
|
"loss": 0.301, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.6990291262135924, |
|
"grad_norm": 0.08980865803761802, |
|
"learning_rate": 5.746384547432737e-06, |
|
"loss": 0.3174, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.705501618122977, |
|
"grad_norm": 0.08661713786701486, |
|
"learning_rate": 5.496167328790191e-06, |
|
"loss": 0.2939, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.7119741100323624, |
|
"grad_norm": 0.08786309161255973, |
|
"learning_rate": 5.251365764022753e-06, |
|
"loss": 0.2978, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.7184466019417477, |
|
"grad_norm": 0.08733419882477451, |
|
"learning_rate": 5.011993881773569e-06, |
|
"loss": 0.3002, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.724919093851133, |
|
"grad_norm": 0.08926182705644836, |
|
"learning_rate": 4.778065399531395e-06, |
|
"loss": 0.2975, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.7313915857605178, |
|
"grad_norm": 0.0894176370961602, |
|
"learning_rate": 4.549593722844492e-06, |
|
"loss": 0.3067, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.737864077669903, |
|
"grad_norm": 0.0876923455017712, |
|
"learning_rate": 4.326591944552438e-06, |
|
"loss": 0.3097, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.7443365695792883, |
|
"grad_norm": 0.08572253083705766, |
|
"learning_rate": 4.109072844035844e-06, |
|
"loss": 0.2911, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.750809061488673, |
|
"grad_norm": 0.08883901045408762, |
|
"learning_rate": 3.8970488864839334e-06, |
|
"loss": 0.2963, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.7572815533980584, |
|
"grad_norm": 0.09240186844770727, |
|
"learning_rate": 3.690532222180343e-06, |
|
"loss": 0.3024, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.7637540453074436, |
|
"grad_norm": 0.09051220479105677, |
|
"learning_rate": 3.4895346858066724e-06, |
|
"loss": 0.3089, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.7702265372168284, |
|
"grad_norm": 0.0891289977534799, |
|
"learning_rate": 3.2940677957644215e-06, |
|
"loss": 0.3047, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.7766990291262137, |
|
"grad_norm": 0.08973918683676917, |
|
"learning_rate": 3.104142753514849e-06, |
|
"loss": 0.3151, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.783171521035599, |
|
"grad_norm": 0.08839195406678332, |
|
"learning_rate": 2.9197704429370977e-06, |
|
"loss": 0.2995, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.7896440129449838, |
|
"grad_norm": 0.08711643702101148, |
|
"learning_rate": 2.7409614297043806e-06, |
|
"loss": 0.2983, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.796116504854369, |
|
"grad_norm": 0.08827751306740735, |
|
"learning_rate": 2.5677259606786684e-06, |
|
"loss": 0.3071, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.8025889967637543, |
|
"grad_norm": 0.08989812034868257, |
|
"learning_rate": 2.4000739633233347e-06, |
|
"loss": 0.3185, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.809061488673139, |
|
"grad_norm": 0.08814993018203572, |
|
"learning_rate": 2.238015045134334e-06, |
|
"loss": 0.303, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.8155339805825244, |
|
"grad_norm": 0.08731684328369535, |
|
"learning_rate": 2.0815584930895972e-06, |
|
"loss": 0.3053, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.8220064724919096, |
|
"grad_norm": 0.09248953078205228, |
|
"learning_rate": 1.9307132731168352e-06, |
|
"loss": 0.3048, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.8284789644012944, |
|
"grad_norm": 0.08979246033823442, |
|
"learning_rate": 1.7854880295797405e-06, |
|
"loss": 0.3046, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 2.8349514563106797, |
|
"grad_norm": 0.08615293987508164, |
|
"learning_rate": 1.6458910847826026e-06, |
|
"loss": 0.2991, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.841423948220065, |
|
"grad_norm": 0.08795133012526277, |
|
"learning_rate": 1.5119304384934252e-06, |
|
"loss": 0.2922, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 2.8478964401294498, |
|
"grad_norm": 0.0862166031371393, |
|
"learning_rate": 1.3836137674854255e-06, |
|
"loss": 0.2921, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.854368932038835, |
|
"grad_norm": 0.08955236492173343, |
|
"learning_rate": 1.2609484250971749e-06, |
|
"loss": 0.2997, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 2.8608414239482203, |
|
"grad_norm": 0.08843363251406085, |
|
"learning_rate": 1.143941440811147e-06, |
|
"loss": 0.3088, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.867313915857605, |
|
"grad_norm": 0.09009839385408548, |
|
"learning_rate": 1.0325995198509409e-06, |
|
"loss": 0.3109, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 2.8737864077669903, |
|
"grad_norm": 0.08785361574469933, |
|
"learning_rate": 9.269290427969868e-07, |
|
"loss": 0.2971, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 2.8802588996763756, |
|
"grad_norm": 0.08841485850159143, |
|
"learning_rate": 8.26936065220929e-07, |
|
"loss": 0.3035, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.8867313915857604, |
|
"grad_norm": 0.08779257157327024, |
|
"learning_rate": 7.326263173385584e-07, |
|
"loss": 0.3013, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 2.8932038834951457, |
|
"grad_norm": 0.0883658852563661, |
|
"learning_rate": 6.440052036815081e-07, |
|
"loss": 0.305, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.899676375404531, |
|
"grad_norm": 0.08862179969635353, |
|
"learning_rate": 5.610778027874908e-07, |
|
"loss": 0.2952, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.9061488673139158, |
|
"grad_norm": 0.08863424887187872, |
|
"learning_rate": 4.838488669092534e-07, |
|
"loss": 0.2956, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.912621359223301, |
|
"grad_norm": 0.08998809785580231, |
|
"learning_rate": 4.123228217422948e-07, |
|
"loss": 0.3015, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.9190938511326863, |
|
"grad_norm": 0.08707387881008244, |
|
"learning_rate": 3.465037661712134e-07, |
|
"loss": 0.3095, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.925566343042071, |
|
"grad_norm": 0.08740571796909039, |
|
"learning_rate": 2.86395472034795e-07, |
|
"loss": 0.3033, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.9320388349514563, |
|
"grad_norm": 0.08599743041374118, |
|
"learning_rate": 2.3200138390993e-07, |
|
"loss": 0.2904, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.9385113268608416, |
|
"grad_norm": 0.08726631646259621, |
|
"learning_rate": 1.83324618914138e-07, |
|
"loss": 0.3007, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.9449838187702264, |
|
"grad_norm": 0.08898358262047706, |
|
"learning_rate": 1.4036796652701078e-07, |
|
"loss": 0.3017, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.9514563106796117, |
|
"grad_norm": 0.08765396985812648, |
|
"learning_rate": 1.031338884302846e-07, |
|
"loss": 0.2887, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.957928802588997, |
|
"grad_norm": 0.09001756499046232, |
|
"learning_rate": 7.162451836685291e-08, |
|
"loss": 0.309, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 2.9644012944983817, |
|
"grad_norm": 0.09045952221990206, |
|
"learning_rate": 4.584166201841988e-08, |
|
"loss": 0.2996, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.970873786407767, |
|
"grad_norm": 0.08709990251871128, |
|
"learning_rate": 2.578679690204977e-08, |
|
"loss": 0.2982, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 2.9773462783171523, |
|
"grad_norm": 0.08844965304987025, |
|
"learning_rate": 1.1461072285490204e-08, |
|
"loss": 0.3136, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.983818770226537, |
|
"grad_norm": 0.0892220185111147, |
|
"learning_rate": 2.865309121358184e-09, |
|
"loss": 0.3011, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 2.9902912621359223, |
|
"grad_norm": 0.08562579597167659, |
|
"learning_rate": 0.0, |
|
"loss": 0.2951, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.9902912621359223, |
|
"eval_loss": 0.3344117999076843, |
|
"eval_runtime": 35.3894, |
|
"eval_samples_per_second": 29.359, |
|
"eval_steps_per_second": 0.932, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.9902912621359223, |
|
"step": 462, |
|
"total_flos": 1.8679451641472614e+17, |
|
"train_loss": 0.36376328850205325, |
|
"train_runtime": 5739.3163, |
|
"train_samples_per_second": 10.311, |
|
"train_steps_per_second": 0.08 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 462, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.8679451641472614e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|