{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.99353366298973, "eval_steps": 500, "global_step": 3285, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015214910612400151, "grad_norm": 62.0, "learning_rate": 5.050505050505051e-06, "loss": 6.7184, "step": 10 }, { "epoch": 0.030429821224800303, "grad_norm": 26.625, "learning_rate": 1.0101010101010101e-05, "loss": 5.0379, "step": 20 }, { "epoch": 0.045644731837200456, "grad_norm": 39.25, "learning_rate": 1.5151515151515153e-05, "loss": 4.5296, "step": 30 }, { "epoch": 0.060859642449600605, "grad_norm": 13.75, "learning_rate": 2.0202020202020203e-05, "loss": 4.5222, "step": 40 }, { "epoch": 0.07607455306200075, "grad_norm": 14.5, "learning_rate": 2.5252525252525256e-05, "loss": 4.4963, "step": 50 }, { "epoch": 0.09128946367440091, "grad_norm": 15.125, "learning_rate": 3.0303030303030306e-05, "loss": 4.7534, "step": 60 }, { "epoch": 0.10650437428680107, "grad_norm": 92.0, "learning_rate": 3.535353535353535e-05, "loss": 4.6209, "step": 70 }, { "epoch": 0.12171928489920121, "grad_norm": 40.5, "learning_rate": 4.0404040404040405e-05, "loss": 4.698, "step": 80 }, { "epoch": 0.13693419551160138, "grad_norm": 22.375, "learning_rate": 4.545454545454546e-05, "loss": 4.6103, "step": 90 }, { "epoch": 0.1521491061240015, "grad_norm": 45.0, "learning_rate": 4.9984306340238544e-05, "loss": 4.8199, "step": 100 }, { "epoch": 0.16736401673640167, "grad_norm": 12.125, "learning_rate": 4.982736974262398e-05, "loss": 4.7424, "step": 110 }, { "epoch": 0.18257892734880182, "grad_norm": 18.875, "learning_rate": 4.967043314500942e-05, "loss": 4.8093, "step": 120 }, { "epoch": 0.19779383796120198, "grad_norm": 14.1875, "learning_rate": 4.9513496547394854e-05, "loss": 4.9051, "step": 130 }, { "epoch": 0.21300874857360214, "grad_norm": 9.375, "learning_rate": 4.935655994978029e-05, "loss": 4.8232, "step": 140 }, { "epoch": 0.2282236591860023, "grad_norm": 13.75, "learning_rate": 4.919962335216573e-05, "loss": 4.7461, "step": 150 }, { "epoch": 0.24343856979840242, "grad_norm": 9.625, "learning_rate": 4.9042686754551165e-05, "loss": 4.7576, "step": 160 }, { "epoch": 0.2586534804108026, "grad_norm": 9.375, "learning_rate": 4.88857501569366e-05, "loss": 4.7951, "step": 170 }, { "epoch": 0.27386839102320276, "grad_norm": 9.125, "learning_rate": 4.8728813559322034e-05, "loss": 4.7547, "step": 180 }, { "epoch": 0.2890833016356029, "grad_norm": 9.125, "learning_rate": 4.8571876961707475e-05, "loss": 4.7108, "step": 190 }, { "epoch": 0.304298212248003, "grad_norm": 11.125, "learning_rate": 4.841494036409291e-05, "loss": 4.6421, "step": 200 }, { "epoch": 0.3195131228604032, "grad_norm": 82.5, "learning_rate": 4.8258003766478345e-05, "loss": 4.6211, "step": 210 }, { "epoch": 0.33472803347280333, "grad_norm": 96.5, "learning_rate": 4.8101067168863786e-05, "loss": 5.1034, "step": 220 }, { "epoch": 0.3499429440852035, "grad_norm": 62.25, "learning_rate": 4.794413057124922e-05, "loss": 6.1655, "step": 230 }, { "epoch": 0.36515785469760365, "grad_norm": 116.0, "learning_rate": 4.7787193973634655e-05, "loss": 5.6462, "step": 240 }, { "epoch": 0.3803727653100038, "grad_norm": 147.0, "learning_rate": 4.763025737602009e-05, "loss": 5.1807, "step": 250 }, { "epoch": 0.39558767592240396, "grad_norm": 19.5, "learning_rate": 4.747332077840553e-05, "loss": 4.7096, "step": 260 }, { "epoch": 0.4108025865348041, "grad_norm": 52.5, "learning_rate": 4.7316384180790966e-05, "loss": 4.5789, "step": 270 }, { "epoch": 0.42601749714720427, "grad_norm": 310.0, "learning_rate": 4.71594475831764e-05, "loss": 5.1039, "step": 280 }, { "epoch": 0.44123240775960443, "grad_norm": 153.0, "learning_rate": 4.7002510985561835e-05, "loss": 5.1985, "step": 290 }, { "epoch": 0.4564473183720046, "grad_norm": 39.5, "learning_rate": 4.684557438794727e-05, "loss": 5.1797, "step": 300 }, { "epoch": 0.47166222898440474, "grad_norm": 100.0, "learning_rate": 4.6688637790332704e-05, "loss": 5.1788, "step": 310 }, { "epoch": 0.48687713959680484, "grad_norm": 41.5, "learning_rate": 4.6531701192718145e-05, "loss": 5.046, "step": 320 }, { "epoch": 0.502092050209205, "grad_norm": 964.0, "learning_rate": 4.637476459510358e-05, "loss": 5.0876, "step": 330 }, { "epoch": 0.5173069608216052, "grad_norm": 192.0, "learning_rate": 4.6217827997489015e-05, "loss": 5.0314, "step": 340 }, { "epoch": 0.5325218714340053, "grad_norm": 41.75, "learning_rate": 4.606089139987445e-05, "loss": 5.0153, "step": 350 }, { "epoch": 0.5477367820464055, "grad_norm": 51.0, "learning_rate": 4.590395480225989e-05, "loss": 4.7087, "step": 360 }, { "epoch": 0.5629516926588056, "grad_norm": 101.0, "learning_rate": 4.5747018204645325e-05, "loss": 4.8249, "step": 370 }, { "epoch": 0.5781666032712058, "grad_norm": 32.25, "learning_rate": 4.559008160703076e-05, "loss": 4.9544, "step": 380 }, { "epoch": 0.5933815138836059, "grad_norm": 18.5, "learning_rate": 4.54331450094162e-05, "loss": 4.7081, "step": 390 }, { "epoch": 0.608596424496006, "grad_norm": 49.25, "learning_rate": 4.5276208411801636e-05, "loss": 4.9085, "step": 400 }, { "epoch": 0.6238113351084063, "grad_norm": 446.0, "learning_rate": 4.511927181418707e-05, "loss": 4.8538, "step": 410 }, { "epoch": 0.6390262457208064, "grad_norm": 59.25, "learning_rate": 4.4962335216572505e-05, "loss": 4.6111, "step": 420 }, { "epoch": 0.6542411563332066, "grad_norm": 60.0, "learning_rate": 4.4805398618957946e-05, "loss": 4.6564, "step": 430 }, { "epoch": 0.6694560669456067, "grad_norm": 37.5, "learning_rate": 4.464846202134338e-05, "loss": 4.6277, "step": 440 }, { "epoch": 0.6846709775580069, "grad_norm": 8.75, "learning_rate": 4.4491525423728816e-05, "loss": 4.4838, "step": 450 }, { "epoch": 0.699885888170407, "grad_norm": 20.375, "learning_rate": 4.433458882611426e-05, "loss": 4.6507, "step": 460 }, { "epoch": 0.7151007987828072, "grad_norm": 22.0, "learning_rate": 4.417765222849969e-05, "loss": 4.7077, "step": 470 }, { "epoch": 0.7303157093952073, "grad_norm": 12.25, "learning_rate": 4.4020715630885126e-05, "loss": 4.5839, "step": 480 }, { "epoch": 0.7455306200076075, "grad_norm": 9.1875, "learning_rate": 4.386377903327056e-05, "loss": 4.5522, "step": 490 }, { "epoch": 0.7607455306200076, "grad_norm": 13.3125, "learning_rate": 4.3706842435655995e-05, "loss": 4.686, "step": 500 }, { "epoch": 0.7759604412324077, "grad_norm": 14.5625, "learning_rate": 4.354990583804143e-05, "loss": 4.444, "step": 510 }, { "epoch": 0.7911753518448079, "grad_norm": 580.0, "learning_rate": 4.3392969240426864e-05, "loss": 5.0163, "step": 520 }, { "epoch": 0.806390262457208, "grad_norm": 54.25, "learning_rate": 4.3236032642812306e-05, "loss": 5.1205, "step": 530 }, { "epoch": 0.8216051730696082, "grad_norm": 43.5, "learning_rate": 4.307909604519774e-05, "loss": 4.7013, "step": 540 }, { "epoch": 0.8368200836820083, "grad_norm": 22.125, "learning_rate": 4.2922159447583175e-05, "loss": 4.5653, "step": 550 }, { "epoch": 0.8520349942944085, "grad_norm": 19.0, "learning_rate": 4.2765222849968616e-05, "loss": 4.465, "step": 560 }, { "epoch": 0.8672499049068086, "grad_norm": 72.0, "learning_rate": 4.260828625235405e-05, "loss": 4.6378, "step": 570 }, { "epoch": 0.8824648155192089, "grad_norm": 14.4375, "learning_rate": 4.2451349654739486e-05, "loss": 4.6684, "step": 580 }, { "epoch": 0.897679726131609, "grad_norm": 75.5, "learning_rate": 4.229441305712492e-05, "loss": 4.5414, "step": 590 }, { "epoch": 0.9128946367440092, "grad_norm": 808.0, "learning_rate": 4.213747645951036e-05, "loss": 4.476, "step": 600 }, { "epoch": 0.9281095473564093, "grad_norm": 18.625, "learning_rate": 4.1980539861895796e-05, "loss": 4.4283, "step": 610 }, { "epoch": 0.9433244579688095, "grad_norm": 20.25, "learning_rate": 4.182360326428123e-05, "loss": 4.4041, "step": 620 }, { "epoch": 0.9585393685812096, "grad_norm": 430.0, "learning_rate": 4.166666666666667e-05, "loss": 4.3725, "step": 630 }, { "epoch": 0.9737542791936097, "grad_norm": 246.0, "learning_rate": 4.150973006905211e-05, "loss": 4.591, "step": 640 }, { "epoch": 0.9889691898060099, "grad_norm": 15.25, "learning_rate": 4.135279347143754e-05, "loss": 4.684, "step": 650 }, { "epoch": 1.00304298212248, "grad_norm": 19.0, "learning_rate": 4.119585687382298e-05, "loss": 3.8939, "step": 660 }, { "epoch": 1.0182578927348802, "grad_norm": 16.5, "learning_rate": 4.103892027620842e-05, "loss": 3.9108, "step": 670 }, { "epoch": 1.0334728033472804, "grad_norm": 18.625, "learning_rate": 4.088198367859385e-05, "loss": 4.0937, "step": 680 }, { "epoch": 1.0486877139596804, "grad_norm": 109.0, "learning_rate": 4.0725047080979286e-05, "loss": 3.8888, "step": 690 }, { "epoch": 1.0639026245720806, "grad_norm": 14.6875, "learning_rate": 4.056811048336472e-05, "loss": 4.0143, "step": 700 }, { "epoch": 1.0791175351844808, "grad_norm": 22.25, "learning_rate": 4.0411173885750156e-05, "loss": 3.8516, "step": 710 }, { "epoch": 1.0943324457968808, "grad_norm": 23.0, "learning_rate": 4.025423728813559e-05, "loss": 3.95, "step": 720 }, { "epoch": 1.109547356409281, "grad_norm": 13.375, "learning_rate": 4.009730069052103e-05, "loss": 3.913, "step": 730 }, { "epoch": 1.1247622670216813, "grad_norm": 13.8125, "learning_rate": 3.9940364092906466e-05, "loss": 3.9559, "step": 740 }, { "epoch": 1.1399771776340815, "grad_norm": 11.125, "learning_rate": 3.97834274952919e-05, "loss": 3.9732, "step": 750 }, { "epoch": 1.1551920882464817, "grad_norm": 20.375, "learning_rate": 3.962649089767734e-05, "loss": 4.0201, "step": 760 }, { "epoch": 1.1704069988588817, "grad_norm": 23.5, "learning_rate": 3.946955430006278e-05, "loss": 3.9096, "step": 770 }, { "epoch": 1.1856219094712819, "grad_norm": 17.875, "learning_rate": 3.931261770244821e-05, "loss": 3.9626, "step": 780 }, { "epoch": 1.200836820083682, "grad_norm": 18.375, "learning_rate": 3.9155681104833646e-05, "loss": 3.9349, "step": 790 }, { "epoch": 1.216051730696082, "grad_norm": 36.0, "learning_rate": 3.899874450721909e-05, "loss": 3.8503, "step": 800 }, { "epoch": 1.2312666413084823, "grad_norm": 18.625, "learning_rate": 3.884180790960452e-05, "loss": 3.9523, "step": 810 }, { "epoch": 1.2464815519208825, "grad_norm": 13.6875, "learning_rate": 3.8684871311989956e-05, "loss": 3.7665, "step": 820 }, { "epoch": 1.2616964625332825, "grad_norm": 14.375, "learning_rate": 3.85279347143754e-05, "loss": 3.8204, "step": 830 }, { "epoch": 1.2769113731456827, "grad_norm": 14.625, "learning_rate": 3.837099811676083e-05, "loss": 3.8995, "step": 840 }, { "epoch": 1.292126283758083, "grad_norm": 20.375, "learning_rate": 3.821406151914627e-05, "loss": 3.9207, "step": 850 }, { "epoch": 1.3073411943704831, "grad_norm": 12.4375, "learning_rate": 3.80571249215317e-05, "loss": 3.9366, "step": 860 }, { "epoch": 1.3225561049828833, "grad_norm": 22.25, "learning_rate": 3.790018832391714e-05, "loss": 3.7521, "step": 870 }, { "epoch": 1.3377710155952833, "grad_norm": 20.625, "learning_rate": 3.774325172630258e-05, "loss": 3.8315, "step": 880 }, { "epoch": 1.3529859262076835, "grad_norm": 21.875, "learning_rate": 3.758631512868801e-05, "loss": 3.8439, "step": 890 }, { "epoch": 1.3682008368200838, "grad_norm": 23.75, "learning_rate": 3.7429378531073453e-05, "loss": 3.773, "step": 900 }, { "epoch": 1.3834157474324837, "grad_norm": 134.0, "learning_rate": 3.727244193345889e-05, "loss": 3.8877, "step": 910 }, { "epoch": 1.398630658044884, "grad_norm": 24.625, "learning_rate": 3.711550533584432e-05, "loss": 3.8059, "step": 920 }, { "epoch": 1.4138455686572842, "grad_norm": 103.0, "learning_rate": 3.695856873822976e-05, "loss": 3.8337, "step": 930 }, { "epoch": 1.4290604792696842, "grad_norm": 20.375, "learning_rate": 3.680163214061519e-05, "loss": 3.8493, "step": 940 }, { "epoch": 1.4442753898820844, "grad_norm": 23.625, "learning_rate": 3.6644695543000626e-05, "loss": 3.8309, "step": 950 }, { "epoch": 1.4594903004944846, "grad_norm": 17.625, "learning_rate": 3.648775894538606e-05, "loss": 3.7735, "step": 960 }, { "epoch": 1.4747052111068848, "grad_norm": 20.0, "learning_rate": 3.63308223477715e-05, "loss": 3.7216, "step": 970 }, { "epoch": 1.489920121719285, "grad_norm": 18.75, "learning_rate": 3.617388575015694e-05, "loss": 3.8008, "step": 980 }, { "epoch": 1.505135032331685, "grad_norm": 25.25, "learning_rate": 3.601694915254237e-05, "loss": 3.8333, "step": 990 }, { "epoch": 1.5203499429440852, "grad_norm": 16.375, "learning_rate": 3.586001255492781e-05, "loss": 3.8395, "step": 1000 }, { "epoch": 1.5355648535564854, "grad_norm": 16.5, "learning_rate": 3.570307595731325e-05, "loss": 3.8099, "step": 1010 }, { "epoch": 1.5507797641688854, "grad_norm": 29.875, "learning_rate": 3.554613935969868e-05, "loss": 3.8804, "step": 1020 }, { "epoch": 1.5659946747812856, "grad_norm": 211.0, "learning_rate": 3.538920276208412e-05, "loss": 4.9431, "step": 1030 }, { "epoch": 1.5812095853936858, "grad_norm": 39.0, "learning_rate": 3.523226616446956e-05, "loss": 4.2009, "step": 1040 }, { "epoch": 1.5964244960060858, "grad_norm": 34.5, "learning_rate": 3.507532956685499e-05, "loss": 3.7996, "step": 1050 }, { "epoch": 1.6116394066184863, "grad_norm": 136.0, "learning_rate": 3.491839296924043e-05, "loss": 3.7328, "step": 1060 }, { "epoch": 1.6268543172308862, "grad_norm": 70.5, "learning_rate": 3.476145637162587e-05, "loss": 3.8927, "step": 1070 }, { "epoch": 1.6420692278432865, "grad_norm": 185.0, "learning_rate": 3.46045197740113e-05, "loss": 4.4253, "step": 1080 }, { "epoch": 1.6572841384556867, "grad_norm": 25.25, "learning_rate": 3.444758317639674e-05, "loss": 4.2154, "step": 1090 }, { "epoch": 1.6724990490680867, "grad_norm": 40.0, "learning_rate": 3.429064657878218e-05, "loss": 3.9789, "step": 1100 }, { "epoch": 1.6877139596804869, "grad_norm": 41.5, "learning_rate": 3.4133709981167614e-05, "loss": 4.7068, "step": 1110 }, { "epoch": 1.702928870292887, "grad_norm": 21.625, "learning_rate": 3.397677338355305e-05, "loss": 4.134, "step": 1120 }, { "epoch": 1.718143780905287, "grad_norm": 60.25, "learning_rate": 3.381983678593848e-05, "loss": 4.0554, "step": 1130 }, { "epoch": 1.7333586915176873, "grad_norm": 36.25, "learning_rate": 3.366290018832392e-05, "loss": 4.0661, "step": 1140 }, { "epoch": 1.7485736021300875, "grad_norm": 135.0, "learning_rate": 3.350596359070935e-05, "loss": 4.1099, "step": 1150 }, { "epoch": 1.7637885127424875, "grad_norm": 17.375, "learning_rate": 3.334902699309479e-05, "loss": 4.1226, "step": 1160 }, { "epoch": 1.779003423354888, "grad_norm": 40.25, "learning_rate": 3.319209039548023e-05, "loss": 4.1996, "step": 1170 }, { "epoch": 1.794218333967288, "grad_norm": 54.25, "learning_rate": 3.303515379786566e-05, "loss": 4.2895, "step": 1180 }, { "epoch": 1.8094332445796881, "grad_norm": 82.5, "learning_rate": 3.28782172002511e-05, "loss": 4.0929, "step": 1190 }, { "epoch": 1.8246481551920883, "grad_norm": 49.75, "learning_rate": 3.272128060263653e-05, "loss": 4.1792, "step": 1200 }, { "epoch": 1.8398630658044883, "grad_norm": 34.0, "learning_rate": 3.256434400502197e-05, "loss": 4.0729, "step": 1210 }, { "epoch": 1.8550779764168885, "grad_norm": 33.5, "learning_rate": 3.240740740740741e-05, "loss": 3.9144, "step": 1220 }, { "epoch": 1.8702928870292888, "grad_norm": 24.125, "learning_rate": 3.225047080979284e-05, "loss": 3.9079, "step": 1230 }, { "epoch": 1.8855077976416887, "grad_norm": 25.25, "learning_rate": 3.2093534212178284e-05, "loss": 3.78, "step": 1240 }, { "epoch": 1.900722708254089, "grad_norm": 37.0, "learning_rate": 3.193659761456372e-05, "loss": 4.0513, "step": 1250 }, { "epoch": 1.9159376188664892, "grad_norm": 31.5, "learning_rate": 3.177966101694915e-05, "loss": 4.0164, "step": 1260 }, { "epoch": 1.9311525294788892, "grad_norm": 18.375, "learning_rate": 3.1622724419334594e-05, "loss": 4.0181, "step": 1270 }, { "epoch": 1.9463674400912896, "grad_norm": 38.5, "learning_rate": 3.146578782172003e-05, "loss": 4.104, "step": 1280 }, { "epoch": 1.9615823507036896, "grad_norm": 132.0, "learning_rate": 3.1308851224105464e-05, "loss": 4.1902, "step": 1290 }, { "epoch": 1.9767972613160898, "grad_norm": 46.75, "learning_rate": 3.11519146264909e-05, "loss": 4.318, "step": 1300 }, { "epoch": 1.99201217192849, "grad_norm": 28.125, "learning_rate": 3.099497802887634e-05, "loss": 4.0396, "step": 1310 }, { "epoch": 2.00608596424496, "grad_norm": 30.0, "learning_rate": 3.0838041431261774e-05, "loss": 3.4143, "step": 1320 }, { "epoch": 2.0213008748573604, "grad_norm": 29.875, "learning_rate": 3.068110483364721e-05, "loss": 3.3867, "step": 1330 }, { "epoch": 2.0365157854697604, "grad_norm": 34.25, "learning_rate": 3.052416823603264e-05, "loss": 3.3467, "step": 1340 }, { "epoch": 2.0517306960821604, "grad_norm": 64.0, "learning_rate": 3.036723163841808e-05, "loss": 3.403, "step": 1350 }, { "epoch": 2.066945606694561, "grad_norm": 19.0, "learning_rate": 3.0210295040803516e-05, "loss": 3.3264, "step": 1360 }, { "epoch": 2.082160517306961, "grad_norm": 21.5, "learning_rate": 3.005335844318895e-05, "loss": 3.3493, "step": 1370 }, { "epoch": 2.097375427919361, "grad_norm": 32.75, "learning_rate": 2.9896421845574392e-05, "loss": 3.1572, "step": 1380 }, { "epoch": 2.1125903385317613, "grad_norm": 50.25, "learning_rate": 2.9739485247959826e-05, "loss": 3.2606, "step": 1390 }, { "epoch": 2.1278052491441612, "grad_norm": 29.875, "learning_rate": 2.958254865034526e-05, "loss": 3.2852, "step": 1400 }, { "epoch": 2.1430201597565612, "grad_norm": 29.125, "learning_rate": 2.94256120527307e-05, "loss": 3.1659, "step": 1410 }, { "epoch": 2.1582350703689617, "grad_norm": 50.0, "learning_rate": 2.9268675455116134e-05, "loss": 3.1076, "step": 1420 }, { "epoch": 2.1734499809813617, "grad_norm": 28.375, "learning_rate": 2.9111738857501568e-05, "loss": 3.2414, "step": 1430 }, { "epoch": 2.1886648915937617, "grad_norm": 55.25, "learning_rate": 2.895480225988701e-05, "loss": 3.1755, "step": 1440 }, { "epoch": 2.203879802206162, "grad_norm": 20.0, "learning_rate": 2.8797865662272444e-05, "loss": 3.2081, "step": 1450 }, { "epoch": 2.219094712818562, "grad_norm": 31.5, "learning_rate": 2.864092906465788e-05, "loss": 3.1076, "step": 1460 }, { "epoch": 2.2343096234309625, "grad_norm": 24.25, "learning_rate": 2.8483992467043313e-05, "loss": 3.1507, "step": 1470 }, { "epoch": 2.2495245340433625, "grad_norm": 58.25, "learning_rate": 2.8327055869428755e-05, "loss": 3.2108, "step": 1480 }, { "epoch": 2.2647394446557625, "grad_norm": 65.0, "learning_rate": 2.817011927181419e-05, "loss": 3.2844, "step": 1490 }, { "epoch": 2.279954355268163, "grad_norm": 42.25, "learning_rate": 2.8013182674199624e-05, "loss": 3.2657, "step": 1500 }, { "epoch": 2.295169265880563, "grad_norm": 17.375, "learning_rate": 2.7856246076585062e-05, "loss": 3.267, "step": 1510 }, { "epoch": 2.3103841764929633, "grad_norm": 26.75, "learning_rate": 2.7699309478970496e-05, "loss": 3.2966, "step": 1520 }, { "epoch": 2.3255990871053633, "grad_norm": 30.875, "learning_rate": 2.754237288135593e-05, "loss": 3.217, "step": 1530 }, { "epoch": 2.3408139977177633, "grad_norm": 24.75, "learning_rate": 2.7385436283741372e-05, "loss": 3.1645, "step": 1540 }, { "epoch": 2.3560289083301633, "grad_norm": 35.0, "learning_rate": 2.7228499686126807e-05, "loss": 3.1515, "step": 1550 }, { "epoch": 2.3712438189425638, "grad_norm": 51.75, "learning_rate": 2.707156308851224e-05, "loss": 3.0929, "step": 1560 }, { "epoch": 2.3864587295549637, "grad_norm": 30.875, "learning_rate": 2.6914626490897676e-05, "loss": 3.1356, "step": 1570 }, { "epoch": 2.401673640167364, "grad_norm": 35.5, "learning_rate": 2.6757689893283118e-05, "loss": 3.1858, "step": 1580 }, { "epoch": 2.416888550779764, "grad_norm": 33.0, "learning_rate": 2.6600753295668552e-05, "loss": 3.1423, "step": 1590 }, { "epoch": 2.432103461392164, "grad_norm": 44.0, "learning_rate": 2.6443816698053987e-05, "loss": 3.0862, "step": 1600 }, { "epoch": 2.4473183720045646, "grad_norm": 23.125, "learning_rate": 2.6286880100439425e-05, "loss": 3.0965, "step": 1610 }, { "epoch": 2.4625332826169646, "grad_norm": 29.5, "learning_rate": 2.612994350282486e-05, "loss": 3.11, "step": 1620 }, { "epoch": 2.477748193229365, "grad_norm": 37.5, "learning_rate": 2.5973006905210294e-05, "loss": 3.0674, "step": 1630 }, { "epoch": 2.492963103841765, "grad_norm": 44.5, "learning_rate": 2.581607030759573e-05, "loss": 3.1255, "step": 1640 }, { "epoch": 2.508178014454165, "grad_norm": 44.5, "learning_rate": 2.565913370998117e-05, "loss": 3.0898, "step": 1650 }, { "epoch": 2.523392925066565, "grad_norm": 30.25, "learning_rate": 2.5502197112366604e-05, "loss": 3.0759, "step": 1660 }, { "epoch": 2.5386078356789654, "grad_norm": 41.0, "learning_rate": 2.534526051475204e-05, "loss": 3.2519, "step": 1670 }, { "epoch": 2.5538227462913654, "grad_norm": 22.25, "learning_rate": 2.518832391713748e-05, "loss": 3.1336, "step": 1680 }, { "epoch": 2.569037656903766, "grad_norm": 47.0, "learning_rate": 2.5031387319522915e-05, "loss": 3.1522, "step": 1690 }, { "epoch": 2.584252567516166, "grad_norm": 41.25, "learning_rate": 2.487445072190835e-05, "loss": 3.1463, "step": 1700 }, { "epoch": 2.599467478128566, "grad_norm": 33.75, "learning_rate": 2.4717514124293788e-05, "loss": 3.1115, "step": 1710 }, { "epoch": 2.6146823887409663, "grad_norm": 27.5, "learning_rate": 2.4560577526679222e-05, "loss": 3.1908, "step": 1720 }, { "epoch": 2.6298972993533662, "grad_norm": 59.5, "learning_rate": 2.4403640929064657e-05, "loss": 3.035, "step": 1730 }, { "epoch": 2.6451122099657667, "grad_norm": 34.75, "learning_rate": 2.4246704331450095e-05, "loss": 3.0231, "step": 1740 }, { "epoch": 2.6603271205781667, "grad_norm": 34.75, "learning_rate": 2.408976773383553e-05, "loss": 3.0962, "step": 1750 }, { "epoch": 2.6755420311905667, "grad_norm": 20.625, "learning_rate": 2.3932831136220967e-05, "loss": 3.1877, "step": 1760 }, { "epoch": 2.6907569418029667, "grad_norm": 38.25, "learning_rate": 2.3775894538606405e-05, "loss": 2.9851, "step": 1770 }, { "epoch": 2.705971852415367, "grad_norm": 24.125, "learning_rate": 2.361895794099184e-05, "loss": 3.1095, "step": 1780 }, { "epoch": 2.721186763027767, "grad_norm": 30.5, "learning_rate": 2.3462021343377278e-05, "loss": 3.0544, "step": 1790 }, { "epoch": 2.7364016736401675, "grad_norm": 32.0, "learning_rate": 2.3305084745762712e-05, "loss": 3.0336, "step": 1800 }, { "epoch": 2.7516165842525675, "grad_norm": 48.5, "learning_rate": 2.314814814814815e-05, "loss": 3.1227, "step": 1810 }, { "epoch": 2.7668314948649675, "grad_norm": 37.25, "learning_rate": 2.299121155053359e-05, "loss": 3.0989, "step": 1820 }, { "epoch": 2.782046405477368, "grad_norm": 20.25, "learning_rate": 2.2834274952919023e-05, "loss": 3.158, "step": 1830 }, { "epoch": 2.797261316089768, "grad_norm": 35.0, "learning_rate": 2.2677338355304458e-05, "loss": 3.1081, "step": 1840 }, { "epoch": 2.8124762267021683, "grad_norm": 49.0, "learning_rate": 2.2520401757689892e-05, "loss": 3.1436, "step": 1850 }, { "epoch": 2.8276911373145683, "grad_norm": 18.625, "learning_rate": 2.236346516007533e-05, "loss": 3.0876, "step": 1860 }, { "epoch": 2.8429060479269683, "grad_norm": 26.375, "learning_rate": 2.2206528562460768e-05, "loss": 3.0573, "step": 1870 }, { "epoch": 2.8581209585393683, "grad_norm": 44.0, "learning_rate": 2.2049591964846203e-05, "loss": 3.0853, "step": 1880 }, { "epoch": 2.8733358691517688, "grad_norm": 33.0, "learning_rate": 2.189265536723164e-05, "loss": 3.2242, "step": 1890 }, { "epoch": 2.8885507797641687, "grad_norm": 89.0, "learning_rate": 2.1735718769617075e-05, "loss": 3.1315, "step": 1900 }, { "epoch": 2.903765690376569, "grad_norm": 29.875, "learning_rate": 2.1578782172002513e-05, "loss": 3.1638, "step": 1910 }, { "epoch": 2.918980600988969, "grad_norm": 27.125, "learning_rate": 2.1421845574387948e-05, "loss": 3.0408, "step": 1920 }, { "epoch": 2.934195511601369, "grad_norm": 42.5, "learning_rate": 2.1264908976773386e-05, "loss": 3.0296, "step": 1930 }, { "epoch": 2.9494104222137696, "grad_norm": 37.5, "learning_rate": 2.110797237915882e-05, "loss": 3.1607, "step": 1940 }, { "epoch": 2.9646253328261696, "grad_norm": 31.5, "learning_rate": 2.0951035781544255e-05, "loss": 3.0746, "step": 1950 }, { "epoch": 2.97984024343857, "grad_norm": 40.0, "learning_rate": 2.0794099183929693e-05, "loss": 3.0749, "step": 1960 }, { "epoch": 2.99505515405097, "grad_norm": 21.375, "learning_rate": 2.0637162586315128e-05, "loss": 3.1096, "step": 1970 }, { "epoch": 3.00912894636744, "grad_norm": 37.25, "learning_rate": 2.0480225988700566e-05, "loss": 2.5168, "step": 1980 }, { "epoch": 3.0243438569798404, "grad_norm": 37.75, "learning_rate": 2.0323289391086004e-05, "loss": 2.4061, "step": 1990 }, { "epoch": 3.0395587675922404, "grad_norm": 28.125, "learning_rate": 2.0166352793471438e-05, "loss": 2.4859, "step": 2000 }, { "epoch": 3.0547736782046404, "grad_norm": 29.375, "learning_rate": 2.0009416195856876e-05, "loss": 2.3777, "step": 2010 }, { "epoch": 3.069988588817041, "grad_norm": 88.5, "learning_rate": 1.985247959824231e-05, "loss": 2.4563, "step": 2020 }, { "epoch": 3.085203499429441, "grad_norm": 33.0, "learning_rate": 1.969554300062775e-05, "loss": 2.393, "step": 2030 }, { "epoch": 3.100418410041841, "grad_norm": 25.5, "learning_rate": 1.9538606403013183e-05, "loss": 2.4262, "step": 2040 }, { "epoch": 3.1156333206542413, "grad_norm": 52.25, "learning_rate": 1.9381669805398618e-05, "loss": 2.394, "step": 2050 }, { "epoch": 3.1308482312666412, "grad_norm": 28.25, "learning_rate": 1.9224733207784056e-05, "loss": 2.4209, "step": 2060 }, { "epoch": 3.1460631418790417, "grad_norm": 24.125, "learning_rate": 1.906779661016949e-05, "loss": 2.4335, "step": 2070 }, { "epoch": 3.1612780524914417, "grad_norm": 35.5, "learning_rate": 1.891086001255493e-05, "loss": 2.4935, "step": 2080 }, { "epoch": 3.1764929631038417, "grad_norm": 42.5, "learning_rate": 1.8753923414940363e-05, "loss": 2.4604, "step": 2090 }, { "epoch": 3.191707873716242, "grad_norm": 29.25, "learning_rate": 1.85969868173258e-05, "loss": 2.4856, "step": 2100 }, { "epoch": 3.206922784328642, "grad_norm": 34.75, "learning_rate": 1.844005021971124e-05, "loss": 2.4311, "step": 2110 }, { "epoch": 3.222137694941042, "grad_norm": 56.0, "learning_rate": 1.8283113622096674e-05, "loss": 2.4334, "step": 2120 }, { "epoch": 3.2373526055534425, "grad_norm": 38.25, "learning_rate": 1.812617702448211e-05, "loss": 2.5363, "step": 2130 }, { "epoch": 3.2525675161658425, "grad_norm": 36.25, "learning_rate": 1.7969240426867546e-05, "loss": 2.4689, "step": 2140 }, { "epoch": 3.2677824267782425, "grad_norm": 27.375, "learning_rate": 1.7812303829252984e-05, "loss": 2.522, "step": 2150 }, { "epoch": 3.282997337390643, "grad_norm": 37.5, "learning_rate": 1.765536723163842e-05, "loss": 2.561, "step": 2160 }, { "epoch": 3.298212248003043, "grad_norm": 56.5, "learning_rate": 1.7498430634023853e-05, "loss": 2.5073, "step": 2170 }, { "epoch": 3.3134271586154433, "grad_norm": 31.875, "learning_rate": 1.734149403640929e-05, "loss": 2.6292, "step": 2180 }, { "epoch": 3.3286420692278433, "grad_norm": 66.5, "learning_rate": 1.7184557438794726e-05, "loss": 2.5683, "step": 2190 }, { "epoch": 3.3438569798402433, "grad_norm": 163.0, "learning_rate": 1.7027620841180164e-05, "loss": 2.4828, "step": 2200 }, { "epoch": 3.3590718904526438, "grad_norm": 28.625, "learning_rate": 1.6870684243565602e-05, "loss": 2.4985, "step": 2210 }, { "epoch": 3.3742868010650438, "grad_norm": 26.5, "learning_rate": 1.6713747645951036e-05, "loss": 2.5606, "step": 2220 }, { "epoch": 3.3895017116774437, "grad_norm": 29.0, "learning_rate": 1.6556811048336474e-05, "loss": 2.5204, "step": 2230 }, { "epoch": 3.404716622289844, "grad_norm": 97.5, "learning_rate": 1.639987445072191e-05, "loss": 2.5655, "step": 2240 }, { "epoch": 3.419931532902244, "grad_norm": 110.0, "learning_rate": 1.6242937853107347e-05, "loss": 2.5357, "step": 2250 }, { "epoch": 3.435146443514644, "grad_norm": 41.75, "learning_rate": 1.608600125549278e-05, "loss": 2.5904, "step": 2260 }, { "epoch": 3.4503613541270446, "grad_norm": 79.5, "learning_rate": 1.5929064657878216e-05, "loss": 2.5566, "step": 2270 }, { "epoch": 3.4655762647394446, "grad_norm": 51.25, "learning_rate": 1.5772128060263654e-05, "loss": 2.7638, "step": 2280 }, { "epoch": 3.480791175351845, "grad_norm": 45.5, "learning_rate": 1.561519146264909e-05, "loss": 2.6242, "step": 2290 }, { "epoch": 3.496006085964245, "grad_norm": 35.0, "learning_rate": 1.5458254865034527e-05, "loss": 2.5996, "step": 2300 }, { "epoch": 3.511220996576645, "grad_norm": 33.75, "learning_rate": 1.530131826741996e-05, "loss": 2.6636, "step": 2310 }, { "epoch": 3.5264359071890454, "grad_norm": 36.75, "learning_rate": 1.51443816698054e-05, "loss": 2.5936, "step": 2320 }, { "epoch": 3.5416508178014454, "grad_norm": 73.5, "learning_rate": 1.4987445072190837e-05, "loss": 2.6029, "step": 2330 }, { "epoch": 3.5568657284138454, "grad_norm": 33.25, "learning_rate": 1.4830508474576272e-05, "loss": 2.6107, "step": 2340 }, { "epoch": 3.572080639026246, "grad_norm": 31.0, "learning_rate": 1.4673571876961708e-05, "loss": 2.6401, "step": 2350 }, { "epoch": 3.587295549638646, "grad_norm": 46.75, "learning_rate": 1.4516635279347143e-05, "loss": 2.6327, "step": 2360 }, { "epoch": 3.602510460251046, "grad_norm": 2928.0, "learning_rate": 1.435969868173258e-05, "loss": 2.668, "step": 2370 }, { "epoch": 3.6177253708634463, "grad_norm": 31.375, "learning_rate": 1.4202762084118019e-05, "loss": 2.6043, "step": 2380 }, { "epoch": 3.6329402814758462, "grad_norm": 27.75, "learning_rate": 1.4045825486503453e-05, "loss": 2.6445, "step": 2390 }, { "epoch": 3.6481551920882467, "grad_norm": 44.0, "learning_rate": 1.388888888888889e-05, "loss": 2.5699, "step": 2400 }, { "epoch": 3.6633701027006467, "grad_norm": 29.875, "learning_rate": 1.3731952291274324e-05, "loss": 2.5164, "step": 2410 }, { "epoch": 3.6785850133130467, "grad_norm": 30.75, "learning_rate": 1.3575015693659762e-05, "loss": 2.6119, "step": 2420 }, { "epoch": 3.693799923925447, "grad_norm": 74.5, "learning_rate": 1.34180790960452e-05, "loss": 2.6144, "step": 2430 }, { "epoch": 3.709014834537847, "grad_norm": 38.5, "learning_rate": 1.3261142498430635e-05, "loss": 2.6119, "step": 2440 }, { "epoch": 3.7242297451502475, "grad_norm": 70.5, "learning_rate": 1.3104205900816071e-05, "loss": 2.5528, "step": 2450 }, { "epoch": 3.7394446557626475, "grad_norm": 31.5, "learning_rate": 1.2947269303201506e-05, "loss": 2.6334, "step": 2460 }, { "epoch": 3.7546595663750475, "grad_norm": 41.5, "learning_rate": 1.2790332705586944e-05, "loss": 2.6298, "step": 2470 }, { "epoch": 3.7698744769874475, "grad_norm": 46.5, "learning_rate": 1.2633396107972378e-05, "loss": 2.563, "step": 2480 }, { "epoch": 3.785089387599848, "grad_norm": 29.125, "learning_rate": 1.2476459510357816e-05, "loss": 2.5884, "step": 2490 }, { "epoch": 3.800304298212248, "grad_norm": 35.25, "learning_rate": 1.2319522912743252e-05, "loss": 2.6457, "step": 2500 }, { "epoch": 3.8155192088246483, "grad_norm": 22.375, "learning_rate": 1.2162586315128689e-05, "loss": 2.5394, "step": 2510 }, { "epoch": 3.8307341194370483, "grad_norm": 48.75, "learning_rate": 1.2005649717514125e-05, "loss": 2.5998, "step": 2520 }, { "epoch": 3.8459490300494483, "grad_norm": 30.375, "learning_rate": 1.1848713119899561e-05, "loss": 2.6238, "step": 2530 }, { "epoch": 3.8611639406618488, "grad_norm": 38.0, "learning_rate": 1.1691776522284998e-05, "loss": 2.6113, "step": 2540 }, { "epoch": 3.8763788512742487, "grad_norm": 39.0, "learning_rate": 1.1534839924670434e-05, "loss": 2.7116, "step": 2550 }, { "epoch": 3.891593761886649, "grad_norm": 44.75, "learning_rate": 1.137790332705587e-05, "loss": 2.6315, "step": 2560 }, { "epoch": 3.906808672499049, "grad_norm": 27.0, "learning_rate": 1.1220966729441306e-05, "loss": 2.6728, "step": 2570 }, { "epoch": 3.922023583111449, "grad_norm": 37.25, "learning_rate": 1.1064030131826743e-05, "loss": 2.7066, "step": 2580 }, { "epoch": 3.937238493723849, "grad_norm": 75.5, "learning_rate": 1.0907093534212179e-05, "loss": 2.5966, "step": 2590 }, { "epoch": 3.9524534043362496, "grad_norm": 37.5, "learning_rate": 1.0750156936597615e-05, "loss": 2.6743, "step": 2600 }, { "epoch": 3.9676683149486496, "grad_norm": 22.5, "learning_rate": 1.0593220338983052e-05, "loss": 2.5749, "step": 2610 }, { "epoch": 3.98288322556105, "grad_norm": 27.125, "learning_rate": 1.0436283741368488e-05, "loss": 2.4513, "step": 2620 }, { "epoch": 3.99809813617345, "grad_norm": 32.25, "learning_rate": 1.0279347143753924e-05, "loss": 2.5954, "step": 2630 }, { "epoch": 4.01217192848992, "grad_norm": 27.75, "learning_rate": 1.012241054613936e-05, "loss": 2.1217, "step": 2640 }, { "epoch": 4.02738683910232, "grad_norm": 37.0, "learning_rate": 9.965473948524797e-06, "loss": 2.4047, "step": 2650 }, { "epoch": 4.042601749714721, "grad_norm": 190.0, "learning_rate": 9.808537350910233e-06, "loss": 2.4063, "step": 2660 }, { "epoch": 4.057816660327121, "grad_norm": 32.0, "learning_rate": 9.65160075329567e-06, "loss": 2.2504, "step": 2670 }, { "epoch": 4.073031570939521, "grad_norm": 30.375, "learning_rate": 9.494664155681106e-06, "loss": 2.4439, "step": 2680 }, { "epoch": 4.088246481551921, "grad_norm": 44.75, "learning_rate": 9.337727558066542e-06, "loss": 2.3362, "step": 2690 }, { "epoch": 4.103461392164321, "grad_norm": 26.375, "learning_rate": 9.180790960451978e-06, "loss": 2.263, "step": 2700 }, { "epoch": 4.118676302776721, "grad_norm": 38.5, "learning_rate": 9.023854362837414e-06, "loss": 2.3142, "step": 2710 }, { "epoch": 4.133891213389122, "grad_norm": 31.125, "learning_rate": 8.86691776522285e-06, "loss": 2.4242, "step": 2720 }, { "epoch": 4.149106124001522, "grad_norm": 30.625, "learning_rate": 8.709981167608287e-06, "loss": 2.3795, "step": 2730 }, { "epoch": 4.164321034613922, "grad_norm": 240.0, "learning_rate": 8.553044569993723e-06, "loss": 2.4871, "step": 2740 }, { "epoch": 4.179535945226322, "grad_norm": 326.0, "learning_rate": 8.39610797237916e-06, "loss": 2.3286, "step": 2750 }, { "epoch": 4.194750855838722, "grad_norm": 61.25, "learning_rate": 8.239171374764596e-06, "loss": 2.3469, "step": 2760 }, { "epoch": 4.2099657664511225, "grad_norm": 30.0, "learning_rate": 8.082234777150032e-06, "loss": 2.4634, "step": 2770 }, { "epoch": 4.2251806770635225, "grad_norm": 43.25, "learning_rate": 7.925298179535467e-06, "loss": 2.5155, "step": 2780 }, { "epoch": 4.2403955876759225, "grad_norm": 63.75, "learning_rate": 7.768361581920905e-06, "loss": 2.5562, "step": 2790 }, { "epoch": 4.2556104982883225, "grad_norm": 180.0, "learning_rate": 7.611424984306341e-06, "loss": 2.856, "step": 2800 }, { "epoch": 4.2708254089007225, "grad_norm": 57.25, "learning_rate": 7.454488386691777e-06, "loss": 2.8994, "step": 2810 }, { "epoch": 4.2860403195131225, "grad_norm": 38.0, "learning_rate": 7.297551789077213e-06, "loss": 2.7051, "step": 2820 }, { "epoch": 4.301255230125523, "grad_norm": 206.0, "learning_rate": 7.140615191462649e-06, "loss": 2.7805, "step": 2830 }, { "epoch": 4.316470140737923, "grad_norm": 74.0, "learning_rate": 6.983678593848085e-06, "loss": 2.722, "step": 2840 }, { "epoch": 4.331685051350323, "grad_norm": 58.0, "learning_rate": 6.826741996233522e-06, "loss": 2.5927, "step": 2850 }, { "epoch": 4.346899961962723, "grad_norm": 49.75, "learning_rate": 6.669805398618959e-06, "loss": 2.5875, "step": 2860 }, { "epoch": 4.362114872575123, "grad_norm": 312.0, "learning_rate": 6.512868801004394e-06, "loss": 2.5255, "step": 2870 }, { "epoch": 4.377329783187523, "grad_norm": 33.5, "learning_rate": 6.3559322033898304e-06, "loss": 2.551, "step": 2880 }, { "epoch": 4.392544693799924, "grad_norm": 51.5, "learning_rate": 6.1989956057752676e-06, "loss": 2.5554, "step": 2890 }, { "epoch": 4.407759604412324, "grad_norm": 43.75, "learning_rate": 6.042059008160703e-06, "loss": 2.601, "step": 2900 }, { "epoch": 4.422974515024724, "grad_norm": 28.5, "learning_rate": 5.885122410546139e-06, "loss": 2.4956, "step": 2910 }, { "epoch": 4.438189425637124, "grad_norm": 30.5, "learning_rate": 5.728185812931576e-06, "loss": 2.5699, "step": 2920 }, { "epoch": 4.453404336249524, "grad_norm": 68.5, "learning_rate": 5.571249215317012e-06, "loss": 2.5997, "step": 2930 }, { "epoch": 4.468619246861925, "grad_norm": 42.5, "learning_rate": 5.414312617702449e-06, "loss": 2.5853, "step": 2940 }, { "epoch": 4.483834157474325, "grad_norm": 124.0, "learning_rate": 5.2573760200878844e-06, "loss": 2.6104, "step": 2950 }, { "epoch": 4.499049068086725, "grad_norm": 860.0, "learning_rate": 5.100439422473321e-06, "loss": 2.5993, "step": 2960 }, { "epoch": 4.514263978699125, "grad_norm": 42.0, "learning_rate": 4.943502824858758e-06, "loss": 2.6001, "step": 2970 }, { "epoch": 4.529478889311525, "grad_norm": 56.25, "learning_rate": 4.786566227244193e-06, "loss": 2.4628, "step": 2980 }, { "epoch": 4.544693799923925, "grad_norm": 45.0, "learning_rate": 4.6296296296296296e-06, "loss": 2.5943, "step": 2990 }, { "epoch": 4.559908710536326, "grad_norm": 64.0, "learning_rate": 4.472693032015067e-06, "loss": 2.6277, "step": 3000 }, { "epoch": 4.575123621148726, "grad_norm": 157.0, "learning_rate": 4.315756434400502e-06, "loss": 2.5966, "step": 3010 }, { "epoch": 4.590338531761126, "grad_norm": 208.0, "learning_rate": 4.1588198367859384e-06, "loss": 2.5342, "step": 3020 }, { "epoch": 4.605553442373526, "grad_norm": 44.75, "learning_rate": 4.001883239171375e-06, "loss": 2.5117, "step": 3030 }, { "epoch": 4.620768352985927, "grad_norm": 159.0, "learning_rate": 3.844946641556811e-06, "loss": 2.4587, "step": 3040 }, { "epoch": 4.635983263598327, "grad_norm": 49.75, "learning_rate": 3.6880100439422477e-06, "loss": 2.5882, "step": 3050 }, { "epoch": 4.651198174210727, "grad_norm": 31.125, "learning_rate": 3.531073446327684e-06, "loss": 2.4809, "step": 3060 }, { "epoch": 4.666413084823127, "grad_norm": 26.625, "learning_rate": 3.37413684871312e-06, "loss": 2.5864, "step": 3070 }, { "epoch": 4.681627995435527, "grad_norm": 45.0, "learning_rate": 3.2172002510985566e-06, "loss": 2.4663, "step": 3080 }, { "epoch": 4.696842906047927, "grad_norm": 81.0, "learning_rate": 3.0602636534839924e-06, "loss": 2.4853, "step": 3090 }, { "epoch": 4.712057816660327, "grad_norm": 976.0, "learning_rate": 2.903327055869429e-06, "loss": 2.5038, "step": 3100 }, { "epoch": 4.7272727272727275, "grad_norm": 50.25, "learning_rate": 2.746390458254865e-06, "loss": 2.5078, "step": 3110 }, { "epoch": 4.7424876378851275, "grad_norm": 23.375, "learning_rate": 2.5894538606403013e-06, "loss": 2.434, "step": 3120 }, { "epoch": 4.7577025484975275, "grad_norm": 69.5, "learning_rate": 2.4325172630257376e-06, "loss": 2.5189, "step": 3130 }, { "epoch": 4.7729174591099275, "grad_norm": 22.5, "learning_rate": 2.2755806654111743e-06, "loss": 2.5092, "step": 3140 }, { "epoch": 4.788132369722328, "grad_norm": 4192.0, "learning_rate": 2.11864406779661e-06, "loss": 2.5068, "step": 3150 }, { "epoch": 4.803347280334728, "grad_norm": 36.0, "learning_rate": 1.9617074701820464e-06, "loss": 2.5104, "step": 3160 }, { "epoch": 4.818562190947128, "grad_norm": 55.5, "learning_rate": 1.804770872567483e-06, "loss": 2.4943, "step": 3170 }, { "epoch": 4.833777101559528, "grad_norm": 23.0, "learning_rate": 1.647834274952919e-06, "loss": 2.4493, "step": 3180 }, { "epoch": 4.848992012171928, "grad_norm": 71.0, "learning_rate": 1.4908976773383553e-06, "loss": 2.5597, "step": 3190 }, { "epoch": 4.864206922784328, "grad_norm": 83.0, "learning_rate": 1.3339610797237918e-06, "loss": 2.5889, "step": 3200 }, { "epoch": 4.879421833396728, "grad_norm": 49.25, "learning_rate": 1.1770244821092279e-06, "loss": 2.5008, "step": 3210 }, { "epoch": 4.894636744009129, "grad_norm": 34.75, "learning_rate": 1.0200878844946644e-06, "loss": 2.5614, "step": 3220 }, { "epoch": 4.909851654621529, "grad_norm": 34.25, "learning_rate": 8.631512868801004e-07, "loss": 2.5362, "step": 3230 }, { "epoch": 4.925066565233929, "grad_norm": 114.5, "learning_rate": 7.062146892655367e-07, "loss": 2.5529, "step": 3240 }, { "epoch": 4.940281475846329, "grad_norm": 444.0, "learning_rate": 5.49278091650973e-07, "loss": 2.4373, "step": 3250 }, { "epoch": 4.95549638645873, "grad_norm": 208.0, "learning_rate": 3.9234149403640934e-07, "loss": 2.5953, "step": 3260 }, { "epoch": 4.97071129707113, "grad_norm": 101.0, "learning_rate": 2.3540489642184557e-07, "loss": 2.5719, "step": 3270 }, { "epoch": 4.98592620768353, "grad_norm": 55.5, "learning_rate": 7.846829880728186e-08, "loss": 2.5252, "step": 3280 } ], "logging_steps": 10, "max_steps": 3285, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.590478085395579e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }