{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.99353366298973,
"eval_steps": 500,
"global_step": 3285,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015214910612400151,
"grad_norm": 62.0,
"learning_rate": 5.050505050505051e-06,
"loss": 6.7184,
"step": 10
},
{
"epoch": 0.030429821224800303,
"grad_norm": 26.625,
"learning_rate": 1.0101010101010101e-05,
"loss": 5.0379,
"step": 20
},
{
"epoch": 0.045644731837200456,
"grad_norm": 39.25,
"learning_rate": 1.5151515151515153e-05,
"loss": 4.5296,
"step": 30
},
{
"epoch": 0.060859642449600605,
"grad_norm": 13.75,
"learning_rate": 2.0202020202020203e-05,
"loss": 4.5222,
"step": 40
},
{
"epoch": 0.07607455306200075,
"grad_norm": 14.5,
"learning_rate": 2.5252525252525256e-05,
"loss": 4.4963,
"step": 50
},
{
"epoch": 0.09128946367440091,
"grad_norm": 15.125,
"learning_rate": 3.0303030303030306e-05,
"loss": 4.7534,
"step": 60
},
{
"epoch": 0.10650437428680107,
"grad_norm": 92.0,
"learning_rate": 3.535353535353535e-05,
"loss": 4.6209,
"step": 70
},
{
"epoch": 0.12171928489920121,
"grad_norm": 40.5,
"learning_rate": 4.0404040404040405e-05,
"loss": 4.698,
"step": 80
},
{
"epoch": 0.13693419551160138,
"grad_norm": 22.375,
"learning_rate": 4.545454545454546e-05,
"loss": 4.6103,
"step": 90
},
{
"epoch": 0.1521491061240015,
"grad_norm": 45.0,
"learning_rate": 4.9984306340238544e-05,
"loss": 4.8199,
"step": 100
},
{
"epoch": 0.16736401673640167,
"grad_norm": 12.125,
"learning_rate": 4.982736974262398e-05,
"loss": 4.7424,
"step": 110
},
{
"epoch": 0.18257892734880182,
"grad_norm": 18.875,
"learning_rate": 4.967043314500942e-05,
"loss": 4.8093,
"step": 120
},
{
"epoch": 0.19779383796120198,
"grad_norm": 14.1875,
"learning_rate": 4.9513496547394854e-05,
"loss": 4.9051,
"step": 130
},
{
"epoch": 0.21300874857360214,
"grad_norm": 9.375,
"learning_rate": 4.935655994978029e-05,
"loss": 4.8232,
"step": 140
},
{
"epoch": 0.2282236591860023,
"grad_norm": 13.75,
"learning_rate": 4.919962335216573e-05,
"loss": 4.7461,
"step": 150
},
{
"epoch": 0.24343856979840242,
"grad_norm": 9.625,
"learning_rate": 4.9042686754551165e-05,
"loss": 4.7576,
"step": 160
},
{
"epoch": 0.2586534804108026,
"grad_norm": 9.375,
"learning_rate": 4.88857501569366e-05,
"loss": 4.7951,
"step": 170
},
{
"epoch": 0.27386839102320276,
"grad_norm": 9.125,
"learning_rate": 4.8728813559322034e-05,
"loss": 4.7547,
"step": 180
},
{
"epoch": 0.2890833016356029,
"grad_norm": 9.125,
"learning_rate": 4.8571876961707475e-05,
"loss": 4.7108,
"step": 190
},
{
"epoch": 0.304298212248003,
"grad_norm": 11.125,
"learning_rate": 4.841494036409291e-05,
"loss": 4.6421,
"step": 200
},
{
"epoch": 0.3195131228604032,
"grad_norm": 82.5,
"learning_rate": 4.8258003766478345e-05,
"loss": 4.6211,
"step": 210
},
{
"epoch": 0.33472803347280333,
"grad_norm": 96.5,
"learning_rate": 4.8101067168863786e-05,
"loss": 5.1034,
"step": 220
},
{
"epoch": 0.3499429440852035,
"grad_norm": 62.25,
"learning_rate": 4.794413057124922e-05,
"loss": 6.1655,
"step": 230
},
{
"epoch": 0.36515785469760365,
"grad_norm": 116.0,
"learning_rate": 4.7787193973634655e-05,
"loss": 5.6462,
"step": 240
},
{
"epoch": 0.3803727653100038,
"grad_norm": 147.0,
"learning_rate": 4.763025737602009e-05,
"loss": 5.1807,
"step": 250
},
{
"epoch": 0.39558767592240396,
"grad_norm": 19.5,
"learning_rate": 4.747332077840553e-05,
"loss": 4.7096,
"step": 260
},
{
"epoch": 0.4108025865348041,
"grad_norm": 52.5,
"learning_rate": 4.7316384180790966e-05,
"loss": 4.5789,
"step": 270
},
{
"epoch": 0.42601749714720427,
"grad_norm": 310.0,
"learning_rate": 4.71594475831764e-05,
"loss": 5.1039,
"step": 280
},
{
"epoch": 0.44123240775960443,
"grad_norm": 153.0,
"learning_rate": 4.7002510985561835e-05,
"loss": 5.1985,
"step": 290
},
{
"epoch": 0.4564473183720046,
"grad_norm": 39.5,
"learning_rate": 4.684557438794727e-05,
"loss": 5.1797,
"step": 300
},
{
"epoch": 0.47166222898440474,
"grad_norm": 100.0,
"learning_rate": 4.6688637790332704e-05,
"loss": 5.1788,
"step": 310
},
{
"epoch": 0.48687713959680484,
"grad_norm": 41.5,
"learning_rate": 4.6531701192718145e-05,
"loss": 5.046,
"step": 320
},
{
"epoch": 0.502092050209205,
"grad_norm": 964.0,
"learning_rate": 4.637476459510358e-05,
"loss": 5.0876,
"step": 330
},
{
"epoch": 0.5173069608216052,
"grad_norm": 192.0,
"learning_rate": 4.6217827997489015e-05,
"loss": 5.0314,
"step": 340
},
{
"epoch": 0.5325218714340053,
"grad_norm": 41.75,
"learning_rate": 4.606089139987445e-05,
"loss": 5.0153,
"step": 350
},
{
"epoch": 0.5477367820464055,
"grad_norm": 51.0,
"learning_rate": 4.590395480225989e-05,
"loss": 4.7087,
"step": 360
},
{
"epoch": 0.5629516926588056,
"grad_norm": 101.0,
"learning_rate": 4.5747018204645325e-05,
"loss": 4.8249,
"step": 370
},
{
"epoch": 0.5781666032712058,
"grad_norm": 32.25,
"learning_rate": 4.559008160703076e-05,
"loss": 4.9544,
"step": 380
},
{
"epoch": 0.5933815138836059,
"grad_norm": 18.5,
"learning_rate": 4.54331450094162e-05,
"loss": 4.7081,
"step": 390
},
{
"epoch": 0.608596424496006,
"grad_norm": 49.25,
"learning_rate": 4.5276208411801636e-05,
"loss": 4.9085,
"step": 400
},
{
"epoch": 0.6238113351084063,
"grad_norm": 446.0,
"learning_rate": 4.511927181418707e-05,
"loss": 4.8538,
"step": 410
},
{
"epoch": 0.6390262457208064,
"grad_norm": 59.25,
"learning_rate": 4.4962335216572505e-05,
"loss": 4.6111,
"step": 420
},
{
"epoch": 0.6542411563332066,
"grad_norm": 60.0,
"learning_rate": 4.4805398618957946e-05,
"loss": 4.6564,
"step": 430
},
{
"epoch": 0.6694560669456067,
"grad_norm": 37.5,
"learning_rate": 4.464846202134338e-05,
"loss": 4.6277,
"step": 440
},
{
"epoch": 0.6846709775580069,
"grad_norm": 8.75,
"learning_rate": 4.4491525423728816e-05,
"loss": 4.4838,
"step": 450
},
{
"epoch": 0.699885888170407,
"grad_norm": 20.375,
"learning_rate": 4.433458882611426e-05,
"loss": 4.6507,
"step": 460
},
{
"epoch": 0.7151007987828072,
"grad_norm": 22.0,
"learning_rate": 4.417765222849969e-05,
"loss": 4.7077,
"step": 470
},
{
"epoch": 0.7303157093952073,
"grad_norm": 12.25,
"learning_rate": 4.4020715630885126e-05,
"loss": 4.5839,
"step": 480
},
{
"epoch": 0.7455306200076075,
"grad_norm": 9.1875,
"learning_rate": 4.386377903327056e-05,
"loss": 4.5522,
"step": 490
},
{
"epoch": 0.7607455306200076,
"grad_norm": 13.3125,
"learning_rate": 4.3706842435655995e-05,
"loss": 4.686,
"step": 500
},
{
"epoch": 0.7759604412324077,
"grad_norm": 14.5625,
"learning_rate": 4.354990583804143e-05,
"loss": 4.444,
"step": 510
},
{
"epoch": 0.7911753518448079,
"grad_norm": 580.0,
"learning_rate": 4.3392969240426864e-05,
"loss": 5.0163,
"step": 520
},
{
"epoch": 0.806390262457208,
"grad_norm": 54.25,
"learning_rate": 4.3236032642812306e-05,
"loss": 5.1205,
"step": 530
},
{
"epoch": 0.8216051730696082,
"grad_norm": 43.5,
"learning_rate": 4.307909604519774e-05,
"loss": 4.7013,
"step": 540
},
{
"epoch": 0.8368200836820083,
"grad_norm": 22.125,
"learning_rate": 4.2922159447583175e-05,
"loss": 4.5653,
"step": 550
},
{
"epoch": 0.8520349942944085,
"grad_norm": 19.0,
"learning_rate": 4.2765222849968616e-05,
"loss": 4.465,
"step": 560
},
{
"epoch": 0.8672499049068086,
"grad_norm": 72.0,
"learning_rate": 4.260828625235405e-05,
"loss": 4.6378,
"step": 570
},
{
"epoch": 0.8824648155192089,
"grad_norm": 14.4375,
"learning_rate": 4.2451349654739486e-05,
"loss": 4.6684,
"step": 580
},
{
"epoch": 0.897679726131609,
"grad_norm": 75.5,
"learning_rate": 4.229441305712492e-05,
"loss": 4.5414,
"step": 590
},
{
"epoch": 0.9128946367440092,
"grad_norm": 808.0,
"learning_rate": 4.213747645951036e-05,
"loss": 4.476,
"step": 600
},
{
"epoch": 0.9281095473564093,
"grad_norm": 18.625,
"learning_rate": 4.1980539861895796e-05,
"loss": 4.4283,
"step": 610
},
{
"epoch": 0.9433244579688095,
"grad_norm": 20.25,
"learning_rate": 4.182360326428123e-05,
"loss": 4.4041,
"step": 620
},
{
"epoch": 0.9585393685812096,
"grad_norm": 430.0,
"learning_rate": 4.166666666666667e-05,
"loss": 4.3725,
"step": 630
},
{
"epoch": 0.9737542791936097,
"grad_norm": 246.0,
"learning_rate": 4.150973006905211e-05,
"loss": 4.591,
"step": 640
},
{
"epoch": 0.9889691898060099,
"grad_norm": 15.25,
"learning_rate": 4.135279347143754e-05,
"loss": 4.684,
"step": 650
},
{
"epoch": 1.00304298212248,
"grad_norm": 19.0,
"learning_rate": 4.119585687382298e-05,
"loss": 3.8939,
"step": 660
},
{
"epoch": 1.0182578927348802,
"grad_norm": 16.5,
"learning_rate": 4.103892027620842e-05,
"loss": 3.9108,
"step": 670
},
{
"epoch": 1.0334728033472804,
"grad_norm": 18.625,
"learning_rate": 4.088198367859385e-05,
"loss": 4.0937,
"step": 680
},
{
"epoch": 1.0486877139596804,
"grad_norm": 109.0,
"learning_rate": 4.0725047080979286e-05,
"loss": 3.8888,
"step": 690
},
{
"epoch": 1.0639026245720806,
"grad_norm": 14.6875,
"learning_rate": 4.056811048336472e-05,
"loss": 4.0143,
"step": 700
},
{
"epoch": 1.0791175351844808,
"grad_norm": 22.25,
"learning_rate": 4.0411173885750156e-05,
"loss": 3.8516,
"step": 710
},
{
"epoch": 1.0943324457968808,
"grad_norm": 23.0,
"learning_rate": 4.025423728813559e-05,
"loss": 3.95,
"step": 720
},
{
"epoch": 1.109547356409281,
"grad_norm": 13.375,
"learning_rate": 4.009730069052103e-05,
"loss": 3.913,
"step": 730
},
{
"epoch": 1.1247622670216813,
"grad_norm": 13.8125,
"learning_rate": 3.9940364092906466e-05,
"loss": 3.9559,
"step": 740
},
{
"epoch": 1.1399771776340815,
"grad_norm": 11.125,
"learning_rate": 3.97834274952919e-05,
"loss": 3.9732,
"step": 750
},
{
"epoch": 1.1551920882464817,
"grad_norm": 20.375,
"learning_rate": 3.962649089767734e-05,
"loss": 4.0201,
"step": 760
},
{
"epoch": 1.1704069988588817,
"grad_norm": 23.5,
"learning_rate": 3.946955430006278e-05,
"loss": 3.9096,
"step": 770
},
{
"epoch": 1.1856219094712819,
"grad_norm": 17.875,
"learning_rate": 3.931261770244821e-05,
"loss": 3.9626,
"step": 780
},
{
"epoch": 1.200836820083682,
"grad_norm": 18.375,
"learning_rate": 3.9155681104833646e-05,
"loss": 3.9349,
"step": 790
},
{
"epoch": 1.216051730696082,
"grad_norm": 36.0,
"learning_rate": 3.899874450721909e-05,
"loss": 3.8503,
"step": 800
},
{
"epoch": 1.2312666413084823,
"grad_norm": 18.625,
"learning_rate": 3.884180790960452e-05,
"loss": 3.9523,
"step": 810
},
{
"epoch": 1.2464815519208825,
"grad_norm": 13.6875,
"learning_rate": 3.8684871311989956e-05,
"loss": 3.7665,
"step": 820
},
{
"epoch": 1.2616964625332825,
"grad_norm": 14.375,
"learning_rate": 3.85279347143754e-05,
"loss": 3.8204,
"step": 830
},
{
"epoch": 1.2769113731456827,
"grad_norm": 14.625,
"learning_rate": 3.837099811676083e-05,
"loss": 3.8995,
"step": 840
},
{
"epoch": 1.292126283758083,
"grad_norm": 20.375,
"learning_rate": 3.821406151914627e-05,
"loss": 3.9207,
"step": 850
},
{
"epoch": 1.3073411943704831,
"grad_norm": 12.4375,
"learning_rate": 3.80571249215317e-05,
"loss": 3.9366,
"step": 860
},
{
"epoch": 1.3225561049828833,
"grad_norm": 22.25,
"learning_rate": 3.790018832391714e-05,
"loss": 3.7521,
"step": 870
},
{
"epoch": 1.3377710155952833,
"grad_norm": 20.625,
"learning_rate": 3.774325172630258e-05,
"loss": 3.8315,
"step": 880
},
{
"epoch": 1.3529859262076835,
"grad_norm": 21.875,
"learning_rate": 3.758631512868801e-05,
"loss": 3.8439,
"step": 890
},
{
"epoch": 1.3682008368200838,
"grad_norm": 23.75,
"learning_rate": 3.7429378531073453e-05,
"loss": 3.773,
"step": 900
},
{
"epoch": 1.3834157474324837,
"grad_norm": 134.0,
"learning_rate": 3.727244193345889e-05,
"loss": 3.8877,
"step": 910
},
{
"epoch": 1.398630658044884,
"grad_norm": 24.625,
"learning_rate": 3.711550533584432e-05,
"loss": 3.8059,
"step": 920
},
{
"epoch": 1.4138455686572842,
"grad_norm": 103.0,
"learning_rate": 3.695856873822976e-05,
"loss": 3.8337,
"step": 930
},
{
"epoch": 1.4290604792696842,
"grad_norm": 20.375,
"learning_rate": 3.680163214061519e-05,
"loss": 3.8493,
"step": 940
},
{
"epoch": 1.4442753898820844,
"grad_norm": 23.625,
"learning_rate": 3.6644695543000626e-05,
"loss": 3.8309,
"step": 950
},
{
"epoch": 1.4594903004944846,
"grad_norm": 17.625,
"learning_rate": 3.648775894538606e-05,
"loss": 3.7735,
"step": 960
},
{
"epoch": 1.4747052111068848,
"grad_norm": 20.0,
"learning_rate": 3.63308223477715e-05,
"loss": 3.7216,
"step": 970
},
{
"epoch": 1.489920121719285,
"grad_norm": 18.75,
"learning_rate": 3.617388575015694e-05,
"loss": 3.8008,
"step": 980
},
{
"epoch": 1.505135032331685,
"grad_norm": 25.25,
"learning_rate": 3.601694915254237e-05,
"loss": 3.8333,
"step": 990
},
{
"epoch": 1.5203499429440852,
"grad_norm": 16.375,
"learning_rate": 3.586001255492781e-05,
"loss": 3.8395,
"step": 1000
},
{
"epoch": 1.5355648535564854,
"grad_norm": 16.5,
"learning_rate": 3.570307595731325e-05,
"loss": 3.8099,
"step": 1010
},
{
"epoch": 1.5507797641688854,
"grad_norm": 29.875,
"learning_rate": 3.554613935969868e-05,
"loss": 3.8804,
"step": 1020
},
{
"epoch": 1.5659946747812856,
"grad_norm": 211.0,
"learning_rate": 3.538920276208412e-05,
"loss": 4.9431,
"step": 1030
},
{
"epoch": 1.5812095853936858,
"grad_norm": 39.0,
"learning_rate": 3.523226616446956e-05,
"loss": 4.2009,
"step": 1040
},
{
"epoch": 1.5964244960060858,
"grad_norm": 34.5,
"learning_rate": 3.507532956685499e-05,
"loss": 3.7996,
"step": 1050
},
{
"epoch": 1.6116394066184863,
"grad_norm": 136.0,
"learning_rate": 3.491839296924043e-05,
"loss": 3.7328,
"step": 1060
},
{
"epoch": 1.6268543172308862,
"grad_norm": 70.5,
"learning_rate": 3.476145637162587e-05,
"loss": 3.8927,
"step": 1070
},
{
"epoch": 1.6420692278432865,
"grad_norm": 185.0,
"learning_rate": 3.46045197740113e-05,
"loss": 4.4253,
"step": 1080
},
{
"epoch": 1.6572841384556867,
"grad_norm": 25.25,
"learning_rate": 3.444758317639674e-05,
"loss": 4.2154,
"step": 1090
},
{
"epoch": 1.6724990490680867,
"grad_norm": 40.0,
"learning_rate": 3.429064657878218e-05,
"loss": 3.9789,
"step": 1100
},
{
"epoch": 1.6877139596804869,
"grad_norm": 41.5,
"learning_rate": 3.4133709981167614e-05,
"loss": 4.7068,
"step": 1110
},
{
"epoch": 1.702928870292887,
"grad_norm": 21.625,
"learning_rate": 3.397677338355305e-05,
"loss": 4.134,
"step": 1120
},
{
"epoch": 1.718143780905287,
"grad_norm": 60.25,
"learning_rate": 3.381983678593848e-05,
"loss": 4.0554,
"step": 1130
},
{
"epoch": 1.7333586915176873,
"grad_norm": 36.25,
"learning_rate": 3.366290018832392e-05,
"loss": 4.0661,
"step": 1140
},
{
"epoch": 1.7485736021300875,
"grad_norm": 135.0,
"learning_rate": 3.350596359070935e-05,
"loss": 4.1099,
"step": 1150
},
{
"epoch": 1.7637885127424875,
"grad_norm": 17.375,
"learning_rate": 3.334902699309479e-05,
"loss": 4.1226,
"step": 1160
},
{
"epoch": 1.779003423354888,
"grad_norm": 40.25,
"learning_rate": 3.319209039548023e-05,
"loss": 4.1996,
"step": 1170
},
{
"epoch": 1.794218333967288,
"grad_norm": 54.25,
"learning_rate": 3.303515379786566e-05,
"loss": 4.2895,
"step": 1180
},
{
"epoch": 1.8094332445796881,
"grad_norm": 82.5,
"learning_rate": 3.28782172002511e-05,
"loss": 4.0929,
"step": 1190
},
{
"epoch": 1.8246481551920883,
"grad_norm": 49.75,
"learning_rate": 3.272128060263653e-05,
"loss": 4.1792,
"step": 1200
},
{
"epoch": 1.8398630658044883,
"grad_norm": 34.0,
"learning_rate": 3.256434400502197e-05,
"loss": 4.0729,
"step": 1210
},
{
"epoch": 1.8550779764168885,
"grad_norm": 33.5,
"learning_rate": 3.240740740740741e-05,
"loss": 3.9144,
"step": 1220
},
{
"epoch": 1.8702928870292888,
"grad_norm": 24.125,
"learning_rate": 3.225047080979284e-05,
"loss": 3.9079,
"step": 1230
},
{
"epoch": 1.8855077976416887,
"grad_norm": 25.25,
"learning_rate": 3.2093534212178284e-05,
"loss": 3.78,
"step": 1240
},
{
"epoch": 1.900722708254089,
"grad_norm": 37.0,
"learning_rate": 3.193659761456372e-05,
"loss": 4.0513,
"step": 1250
},
{
"epoch": 1.9159376188664892,
"grad_norm": 31.5,
"learning_rate": 3.177966101694915e-05,
"loss": 4.0164,
"step": 1260
},
{
"epoch": 1.9311525294788892,
"grad_norm": 18.375,
"learning_rate": 3.1622724419334594e-05,
"loss": 4.0181,
"step": 1270
},
{
"epoch": 1.9463674400912896,
"grad_norm": 38.5,
"learning_rate": 3.146578782172003e-05,
"loss": 4.104,
"step": 1280
},
{
"epoch": 1.9615823507036896,
"grad_norm": 132.0,
"learning_rate": 3.1308851224105464e-05,
"loss": 4.1902,
"step": 1290
},
{
"epoch": 1.9767972613160898,
"grad_norm": 46.75,
"learning_rate": 3.11519146264909e-05,
"loss": 4.318,
"step": 1300
},
{
"epoch": 1.99201217192849,
"grad_norm": 28.125,
"learning_rate": 3.099497802887634e-05,
"loss": 4.0396,
"step": 1310
},
{
"epoch": 2.00608596424496,
"grad_norm": 30.0,
"learning_rate": 3.0838041431261774e-05,
"loss": 3.4143,
"step": 1320
},
{
"epoch": 2.0213008748573604,
"grad_norm": 29.875,
"learning_rate": 3.068110483364721e-05,
"loss": 3.3867,
"step": 1330
},
{
"epoch": 2.0365157854697604,
"grad_norm": 34.25,
"learning_rate": 3.052416823603264e-05,
"loss": 3.3467,
"step": 1340
},
{
"epoch": 2.0517306960821604,
"grad_norm": 64.0,
"learning_rate": 3.036723163841808e-05,
"loss": 3.403,
"step": 1350
},
{
"epoch": 2.066945606694561,
"grad_norm": 19.0,
"learning_rate": 3.0210295040803516e-05,
"loss": 3.3264,
"step": 1360
},
{
"epoch": 2.082160517306961,
"grad_norm": 21.5,
"learning_rate": 3.005335844318895e-05,
"loss": 3.3493,
"step": 1370
},
{
"epoch": 2.097375427919361,
"grad_norm": 32.75,
"learning_rate": 2.9896421845574392e-05,
"loss": 3.1572,
"step": 1380
},
{
"epoch": 2.1125903385317613,
"grad_norm": 50.25,
"learning_rate": 2.9739485247959826e-05,
"loss": 3.2606,
"step": 1390
},
{
"epoch": 2.1278052491441612,
"grad_norm": 29.875,
"learning_rate": 2.958254865034526e-05,
"loss": 3.2852,
"step": 1400
},
{
"epoch": 2.1430201597565612,
"grad_norm": 29.125,
"learning_rate": 2.94256120527307e-05,
"loss": 3.1659,
"step": 1410
},
{
"epoch": 2.1582350703689617,
"grad_norm": 50.0,
"learning_rate": 2.9268675455116134e-05,
"loss": 3.1076,
"step": 1420
},
{
"epoch": 2.1734499809813617,
"grad_norm": 28.375,
"learning_rate": 2.9111738857501568e-05,
"loss": 3.2414,
"step": 1430
},
{
"epoch": 2.1886648915937617,
"grad_norm": 55.25,
"learning_rate": 2.895480225988701e-05,
"loss": 3.1755,
"step": 1440
},
{
"epoch": 2.203879802206162,
"grad_norm": 20.0,
"learning_rate": 2.8797865662272444e-05,
"loss": 3.2081,
"step": 1450
},
{
"epoch": 2.219094712818562,
"grad_norm": 31.5,
"learning_rate": 2.864092906465788e-05,
"loss": 3.1076,
"step": 1460
},
{
"epoch": 2.2343096234309625,
"grad_norm": 24.25,
"learning_rate": 2.8483992467043313e-05,
"loss": 3.1507,
"step": 1470
},
{
"epoch": 2.2495245340433625,
"grad_norm": 58.25,
"learning_rate": 2.8327055869428755e-05,
"loss": 3.2108,
"step": 1480
},
{
"epoch": 2.2647394446557625,
"grad_norm": 65.0,
"learning_rate": 2.817011927181419e-05,
"loss": 3.2844,
"step": 1490
},
{
"epoch": 2.279954355268163,
"grad_norm": 42.25,
"learning_rate": 2.8013182674199624e-05,
"loss": 3.2657,
"step": 1500
},
{
"epoch": 2.295169265880563,
"grad_norm": 17.375,
"learning_rate": 2.7856246076585062e-05,
"loss": 3.267,
"step": 1510
},
{
"epoch": 2.3103841764929633,
"grad_norm": 26.75,
"learning_rate": 2.7699309478970496e-05,
"loss": 3.2966,
"step": 1520
},
{
"epoch": 2.3255990871053633,
"grad_norm": 30.875,
"learning_rate": 2.754237288135593e-05,
"loss": 3.217,
"step": 1530
},
{
"epoch": 2.3408139977177633,
"grad_norm": 24.75,
"learning_rate": 2.7385436283741372e-05,
"loss": 3.1645,
"step": 1540
},
{
"epoch": 2.3560289083301633,
"grad_norm": 35.0,
"learning_rate": 2.7228499686126807e-05,
"loss": 3.1515,
"step": 1550
},
{
"epoch": 2.3712438189425638,
"grad_norm": 51.75,
"learning_rate": 2.707156308851224e-05,
"loss": 3.0929,
"step": 1560
},
{
"epoch": 2.3864587295549637,
"grad_norm": 30.875,
"learning_rate": 2.6914626490897676e-05,
"loss": 3.1356,
"step": 1570
},
{
"epoch": 2.401673640167364,
"grad_norm": 35.5,
"learning_rate": 2.6757689893283118e-05,
"loss": 3.1858,
"step": 1580
},
{
"epoch": 2.416888550779764,
"grad_norm": 33.0,
"learning_rate": 2.6600753295668552e-05,
"loss": 3.1423,
"step": 1590
},
{
"epoch": 2.432103461392164,
"grad_norm": 44.0,
"learning_rate": 2.6443816698053987e-05,
"loss": 3.0862,
"step": 1600
},
{
"epoch": 2.4473183720045646,
"grad_norm": 23.125,
"learning_rate": 2.6286880100439425e-05,
"loss": 3.0965,
"step": 1610
},
{
"epoch": 2.4625332826169646,
"grad_norm": 29.5,
"learning_rate": 2.612994350282486e-05,
"loss": 3.11,
"step": 1620
},
{
"epoch": 2.477748193229365,
"grad_norm": 37.5,
"learning_rate": 2.5973006905210294e-05,
"loss": 3.0674,
"step": 1630
},
{
"epoch": 2.492963103841765,
"grad_norm": 44.5,
"learning_rate": 2.581607030759573e-05,
"loss": 3.1255,
"step": 1640
},
{
"epoch": 2.508178014454165,
"grad_norm": 44.5,
"learning_rate": 2.565913370998117e-05,
"loss": 3.0898,
"step": 1650
},
{
"epoch": 2.523392925066565,
"grad_norm": 30.25,
"learning_rate": 2.5502197112366604e-05,
"loss": 3.0759,
"step": 1660
},
{
"epoch": 2.5386078356789654,
"grad_norm": 41.0,
"learning_rate": 2.534526051475204e-05,
"loss": 3.2519,
"step": 1670
},
{
"epoch": 2.5538227462913654,
"grad_norm": 22.25,
"learning_rate": 2.518832391713748e-05,
"loss": 3.1336,
"step": 1680
},
{
"epoch": 2.569037656903766,
"grad_norm": 47.0,
"learning_rate": 2.5031387319522915e-05,
"loss": 3.1522,
"step": 1690
},
{
"epoch": 2.584252567516166,
"grad_norm": 41.25,
"learning_rate": 2.487445072190835e-05,
"loss": 3.1463,
"step": 1700
},
{
"epoch": 2.599467478128566,
"grad_norm": 33.75,
"learning_rate": 2.4717514124293788e-05,
"loss": 3.1115,
"step": 1710
},
{
"epoch": 2.6146823887409663,
"grad_norm": 27.5,
"learning_rate": 2.4560577526679222e-05,
"loss": 3.1908,
"step": 1720
},
{
"epoch": 2.6298972993533662,
"grad_norm": 59.5,
"learning_rate": 2.4403640929064657e-05,
"loss": 3.035,
"step": 1730
},
{
"epoch": 2.6451122099657667,
"grad_norm": 34.75,
"learning_rate": 2.4246704331450095e-05,
"loss": 3.0231,
"step": 1740
},
{
"epoch": 2.6603271205781667,
"grad_norm": 34.75,
"learning_rate": 2.408976773383553e-05,
"loss": 3.0962,
"step": 1750
},
{
"epoch": 2.6755420311905667,
"grad_norm": 20.625,
"learning_rate": 2.3932831136220967e-05,
"loss": 3.1877,
"step": 1760
},
{
"epoch": 2.6907569418029667,
"grad_norm": 38.25,
"learning_rate": 2.3775894538606405e-05,
"loss": 2.9851,
"step": 1770
},
{
"epoch": 2.705971852415367,
"grad_norm": 24.125,
"learning_rate": 2.361895794099184e-05,
"loss": 3.1095,
"step": 1780
},
{
"epoch": 2.721186763027767,
"grad_norm": 30.5,
"learning_rate": 2.3462021343377278e-05,
"loss": 3.0544,
"step": 1790
},
{
"epoch": 2.7364016736401675,
"grad_norm": 32.0,
"learning_rate": 2.3305084745762712e-05,
"loss": 3.0336,
"step": 1800
},
{
"epoch": 2.7516165842525675,
"grad_norm": 48.5,
"learning_rate": 2.314814814814815e-05,
"loss": 3.1227,
"step": 1810
},
{
"epoch": 2.7668314948649675,
"grad_norm": 37.25,
"learning_rate": 2.299121155053359e-05,
"loss": 3.0989,
"step": 1820
},
{
"epoch": 2.782046405477368,
"grad_norm": 20.25,
"learning_rate": 2.2834274952919023e-05,
"loss": 3.158,
"step": 1830
},
{
"epoch": 2.797261316089768,
"grad_norm": 35.0,
"learning_rate": 2.2677338355304458e-05,
"loss": 3.1081,
"step": 1840
},
{
"epoch": 2.8124762267021683,
"grad_norm": 49.0,
"learning_rate": 2.2520401757689892e-05,
"loss": 3.1436,
"step": 1850
},
{
"epoch": 2.8276911373145683,
"grad_norm": 18.625,
"learning_rate": 2.236346516007533e-05,
"loss": 3.0876,
"step": 1860
},
{
"epoch": 2.8429060479269683,
"grad_norm": 26.375,
"learning_rate": 2.2206528562460768e-05,
"loss": 3.0573,
"step": 1870
},
{
"epoch": 2.8581209585393683,
"grad_norm": 44.0,
"learning_rate": 2.2049591964846203e-05,
"loss": 3.0853,
"step": 1880
},
{
"epoch": 2.8733358691517688,
"grad_norm": 33.0,
"learning_rate": 2.189265536723164e-05,
"loss": 3.2242,
"step": 1890
},
{
"epoch": 2.8885507797641687,
"grad_norm": 89.0,
"learning_rate": 2.1735718769617075e-05,
"loss": 3.1315,
"step": 1900
},
{
"epoch": 2.903765690376569,
"grad_norm": 29.875,
"learning_rate": 2.1578782172002513e-05,
"loss": 3.1638,
"step": 1910
},
{
"epoch": 2.918980600988969,
"grad_norm": 27.125,
"learning_rate": 2.1421845574387948e-05,
"loss": 3.0408,
"step": 1920
},
{
"epoch": 2.934195511601369,
"grad_norm": 42.5,
"learning_rate": 2.1264908976773386e-05,
"loss": 3.0296,
"step": 1930
},
{
"epoch": 2.9494104222137696,
"grad_norm": 37.5,
"learning_rate": 2.110797237915882e-05,
"loss": 3.1607,
"step": 1940
},
{
"epoch": 2.9646253328261696,
"grad_norm": 31.5,
"learning_rate": 2.0951035781544255e-05,
"loss": 3.0746,
"step": 1950
},
{
"epoch": 2.97984024343857,
"grad_norm": 40.0,
"learning_rate": 2.0794099183929693e-05,
"loss": 3.0749,
"step": 1960
},
{
"epoch": 2.99505515405097,
"grad_norm": 21.375,
"learning_rate": 2.0637162586315128e-05,
"loss": 3.1096,
"step": 1970
},
{
"epoch": 3.00912894636744,
"grad_norm": 37.25,
"learning_rate": 2.0480225988700566e-05,
"loss": 2.5168,
"step": 1980
},
{
"epoch": 3.0243438569798404,
"grad_norm": 37.75,
"learning_rate": 2.0323289391086004e-05,
"loss": 2.4061,
"step": 1990
},
{
"epoch": 3.0395587675922404,
"grad_norm": 28.125,
"learning_rate": 2.0166352793471438e-05,
"loss": 2.4859,
"step": 2000
},
{
"epoch": 3.0547736782046404,
"grad_norm": 29.375,
"learning_rate": 2.0009416195856876e-05,
"loss": 2.3777,
"step": 2010
},
{
"epoch": 3.069988588817041,
"grad_norm": 88.5,
"learning_rate": 1.985247959824231e-05,
"loss": 2.4563,
"step": 2020
},
{
"epoch": 3.085203499429441,
"grad_norm": 33.0,
"learning_rate": 1.969554300062775e-05,
"loss": 2.393,
"step": 2030
},
{
"epoch": 3.100418410041841,
"grad_norm": 25.5,
"learning_rate": 1.9538606403013183e-05,
"loss": 2.4262,
"step": 2040
},
{
"epoch": 3.1156333206542413,
"grad_norm": 52.25,
"learning_rate": 1.9381669805398618e-05,
"loss": 2.394,
"step": 2050
},
{
"epoch": 3.1308482312666412,
"grad_norm": 28.25,
"learning_rate": 1.9224733207784056e-05,
"loss": 2.4209,
"step": 2060
},
{
"epoch": 3.1460631418790417,
"grad_norm": 24.125,
"learning_rate": 1.906779661016949e-05,
"loss": 2.4335,
"step": 2070
},
{
"epoch": 3.1612780524914417,
"grad_norm": 35.5,
"learning_rate": 1.891086001255493e-05,
"loss": 2.4935,
"step": 2080
},
{
"epoch": 3.1764929631038417,
"grad_norm": 42.5,
"learning_rate": 1.8753923414940363e-05,
"loss": 2.4604,
"step": 2090
},
{
"epoch": 3.191707873716242,
"grad_norm": 29.25,
"learning_rate": 1.85969868173258e-05,
"loss": 2.4856,
"step": 2100
},
{
"epoch": 3.206922784328642,
"grad_norm": 34.75,
"learning_rate": 1.844005021971124e-05,
"loss": 2.4311,
"step": 2110
},
{
"epoch": 3.222137694941042,
"grad_norm": 56.0,
"learning_rate": 1.8283113622096674e-05,
"loss": 2.4334,
"step": 2120
},
{
"epoch": 3.2373526055534425,
"grad_norm": 38.25,
"learning_rate": 1.812617702448211e-05,
"loss": 2.5363,
"step": 2130
},
{
"epoch": 3.2525675161658425,
"grad_norm": 36.25,
"learning_rate": 1.7969240426867546e-05,
"loss": 2.4689,
"step": 2140
},
{
"epoch": 3.2677824267782425,
"grad_norm": 27.375,
"learning_rate": 1.7812303829252984e-05,
"loss": 2.522,
"step": 2150
},
{
"epoch": 3.282997337390643,
"grad_norm": 37.5,
"learning_rate": 1.765536723163842e-05,
"loss": 2.561,
"step": 2160
},
{
"epoch": 3.298212248003043,
"grad_norm": 56.5,
"learning_rate": 1.7498430634023853e-05,
"loss": 2.5073,
"step": 2170
},
{
"epoch": 3.3134271586154433,
"grad_norm": 31.875,
"learning_rate": 1.734149403640929e-05,
"loss": 2.6292,
"step": 2180
},
{
"epoch": 3.3286420692278433,
"grad_norm": 66.5,
"learning_rate": 1.7184557438794726e-05,
"loss": 2.5683,
"step": 2190
},
{
"epoch": 3.3438569798402433,
"grad_norm": 163.0,
"learning_rate": 1.7027620841180164e-05,
"loss": 2.4828,
"step": 2200
},
{
"epoch": 3.3590718904526438,
"grad_norm": 28.625,
"learning_rate": 1.6870684243565602e-05,
"loss": 2.4985,
"step": 2210
},
{
"epoch": 3.3742868010650438,
"grad_norm": 26.5,
"learning_rate": 1.6713747645951036e-05,
"loss": 2.5606,
"step": 2220
},
{
"epoch": 3.3895017116774437,
"grad_norm": 29.0,
"learning_rate": 1.6556811048336474e-05,
"loss": 2.5204,
"step": 2230
},
{
"epoch": 3.404716622289844,
"grad_norm": 97.5,
"learning_rate": 1.639987445072191e-05,
"loss": 2.5655,
"step": 2240
},
{
"epoch": 3.419931532902244,
"grad_norm": 110.0,
"learning_rate": 1.6242937853107347e-05,
"loss": 2.5357,
"step": 2250
},
{
"epoch": 3.435146443514644,
"grad_norm": 41.75,
"learning_rate": 1.608600125549278e-05,
"loss": 2.5904,
"step": 2260
},
{
"epoch": 3.4503613541270446,
"grad_norm": 79.5,
"learning_rate": 1.5929064657878216e-05,
"loss": 2.5566,
"step": 2270
},
{
"epoch": 3.4655762647394446,
"grad_norm": 51.25,
"learning_rate": 1.5772128060263654e-05,
"loss": 2.7638,
"step": 2280
},
{
"epoch": 3.480791175351845,
"grad_norm": 45.5,
"learning_rate": 1.561519146264909e-05,
"loss": 2.6242,
"step": 2290
},
{
"epoch": 3.496006085964245,
"grad_norm": 35.0,
"learning_rate": 1.5458254865034527e-05,
"loss": 2.5996,
"step": 2300
},
{
"epoch": 3.511220996576645,
"grad_norm": 33.75,
"learning_rate": 1.530131826741996e-05,
"loss": 2.6636,
"step": 2310
},
{
"epoch": 3.5264359071890454,
"grad_norm": 36.75,
"learning_rate": 1.51443816698054e-05,
"loss": 2.5936,
"step": 2320
},
{
"epoch": 3.5416508178014454,
"grad_norm": 73.5,
"learning_rate": 1.4987445072190837e-05,
"loss": 2.6029,
"step": 2330
},
{
"epoch": 3.5568657284138454,
"grad_norm": 33.25,
"learning_rate": 1.4830508474576272e-05,
"loss": 2.6107,
"step": 2340
},
{
"epoch": 3.572080639026246,
"grad_norm": 31.0,
"learning_rate": 1.4673571876961708e-05,
"loss": 2.6401,
"step": 2350
},
{
"epoch": 3.587295549638646,
"grad_norm": 46.75,
"learning_rate": 1.4516635279347143e-05,
"loss": 2.6327,
"step": 2360
},
{
"epoch": 3.602510460251046,
"grad_norm": 2928.0,
"learning_rate": 1.435969868173258e-05,
"loss": 2.668,
"step": 2370
},
{
"epoch": 3.6177253708634463,
"grad_norm": 31.375,
"learning_rate": 1.4202762084118019e-05,
"loss": 2.6043,
"step": 2380
},
{
"epoch": 3.6329402814758462,
"grad_norm": 27.75,
"learning_rate": 1.4045825486503453e-05,
"loss": 2.6445,
"step": 2390
},
{
"epoch": 3.6481551920882467,
"grad_norm": 44.0,
"learning_rate": 1.388888888888889e-05,
"loss": 2.5699,
"step": 2400
},
{
"epoch": 3.6633701027006467,
"grad_norm": 29.875,
"learning_rate": 1.3731952291274324e-05,
"loss": 2.5164,
"step": 2410
},
{
"epoch": 3.6785850133130467,
"grad_norm": 30.75,
"learning_rate": 1.3575015693659762e-05,
"loss": 2.6119,
"step": 2420
},
{
"epoch": 3.693799923925447,
"grad_norm": 74.5,
"learning_rate": 1.34180790960452e-05,
"loss": 2.6144,
"step": 2430
},
{
"epoch": 3.709014834537847,
"grad_norm": 38.5,
"learning_rate": 1.3261142498430635e-05,
"loss": 2.6119,
"step": 2440
},
{
"epoch": 3.7242297451502475,
"grad_norm": 70.5,
"learning_rate": 1.3104205900816071e-05,
"loss": 2.5528,
"step": 2450
},
{
"epoch": 3.7394446557626475,
"grad_norm": 31.5,
"learning_rate": 1.2947269303201506e-05,
"loss": 2.6334,
"step": 2460
},
{
"epoch": 3.7546595663750475,
"grad_norm": 41.5,
"learning_rate": 1.2790332705586944e-05,
"loss": 2.6298,
"step": 2470
},
{
"epoch": 3.7698744769874475,
"grad_norm": 46.5,
"learning_rate": 1.2633396107972378e-05,
"loss": 2.563,
"step": 2480
},
{
"epoch": 3.785089387599848,
"grad_norm": 29.125,
"learning_rate": 1.2476459510357816e-05,
"loss": 2.5884,
"step": 2490
},
{
"epoch": 3.800304298212248,
"grad_norm": 35.25,
"learning_rate": 1.2319522912743252e-05,
"loss": 2.6457,
"step": 2500
},
{
"epoch": 3.8155192088246483,
"grad_norm": 22.375,
"learning_rate": 1.2162586315128689e-05,
"loss": 2.5394,
"step": 2510
},
{
"epoch": 3.8307341194370483,
"grad_norm": 48.75,
"learning_rate": 1.2005649717514125e-05,
"loss": 2.5998,
"step": 2520
},
{
"epoch": 3.8459490300494483,
"grad_norm": 30.375,
"learning_rate": 1.1848713119899561e-05,
"loss": 2.6238,
"step": 2530
},
{
"epoch": 3.8611639406618488,
"grad_norm": 38.0,
"learning_rate": 1.1691776522284998e-05,
"loss": 2.6113,
"step": 2540
},
{
"epoch": 3.8763788512742487,
"grad_norm": 39.0,
"learning_rate": 1.1534839924670434e-05,
"loss": 2.7116,
"step": 2550
},
{
"epoch": 3.891593761886649,
"grad_norm": 44.75,
"learning_rate": 1.137790332705587e-05,
"loss": 2.6315,
"step": 2560
},
{
"epoch": 3.906808672499049,
"grad_norm": 27.0,
"learning_rate": 1.1220966729441306e-05,
"loss": 2.6728,
"step": 2570
},
{
"epoch": 3.922023583111449,
"grad_norm": 37.25,
"learning_rate": 1.1064030131826743e-05,
"loss": 2.7066,
"step": 2580
},
{
"epoch": 3.937238493723849,
"grad_norm": 75.5,
"learning_rate": 1.0907093534212179e-05,
"loss": 2.5966,
"step": 2590
},
{
"epoch": 3.9524534043362496,
"grad_norm": 37.5,
"learning_rate": 1.0750156936597615e-05,
"loss": 2.6743,
"step": 2600
},
{
"epoch": 3.9676683149486496,
"grad_norm": 22.5,
"learning_rate": 1.0593220338983052e-05,
"loss": 2.5749,
"step": 2610
},
{
"epoch": 3.98288322556105,
"grad_norm": 27.125,
"learning_rate": 1.0436283741368488e-05,
"loss": 2.4513,
"step": 2620
},
{
"epoch": 3.99809813617345,
"grad_norm": 32.25,
"learning_rate": 1.0279347143753924e-05,
"loss": 2.5954,
"step": 2630
},
{
"epoch": 4.01217192848992,
"grad_norm": 27.75,
"learning_rate": 1.012241054613936e-05,
"loss": 2.1217,
"step": 2640
},
{
"epoch": 4.02738683910232,
"grad_norm": 37.0,
"learning_rate": 9.965473948524797e-06,
"loss": 2.4047,
"step": 2650
},
{
"epoch": 4.042601749714721,
"grad_norm": 190.0,
"learning_rate": 9.808537350910233e-06,
"loss": 2.4063,
"step": 2660
},
{
"epoch": 4.057816660327121,
"grad_norm": 32.0,
"learning_rate": 9.65160075329567e-06,
"loss": 2.2504,
"step": 2670
},
{
"epoch": 4.073031570939521,
"grad_norm": 30.375,
"learning_rate": 9.494664155681106e-06,
"loss": 2.4439,
"step": 2680
},
{
"epoch": 4.088246481551921,
"grad_norm": 44.75,
"learning_rate": 9.337727558066542e-06,
"loss": 2.3362,
"step": 2690
},
{
"epoch": 4.103461392164321,
"grad_norm": 26.375,
"learning_rate": 9.180790960451978e-06,
"loss": 2.263,
"step": 2700
},
{
"epoch": 4.118676302776721,
"grad_norm": 38.5,
"learning_rate": 9.023854362837414e-06,
"loss": 2.3142,
"step": 2710
},
{
"epoch": 4.133891213389122,
"grad_norm": 31.125,
"learning_rate": 8.86691776522285e-06,
"loss": 2.4242,
"step": 2720
},
{
"epoch": 4.149106124001522,
"grad_norm": 30.625,
"learning_rate": 8.709981167608287e-06,
"loss": 2.3795,
"step": 2730
},
{
"epoch": 4.164321034613922,
"grad_norm": 240.0,
"learning_rate": 8.553044569993723e-06,
"loss": 2.4871,
"step": 2740
},
{
"epoch": 4.179535945226322,
"grad_norm": 326.0,
"learning_rate": 8.39610797237916e-06,
"loss": 2.3286,
"step": 2750
},
{
"epoch": 4.194750855838722,
"grad_norm": 61.25,
"learning_rate": 8.239171374764596e-06,
"loss": 2.3469,
"step": 2760
},
{
"epoch": 4.2099657664511225,
"grad_norm": 30.0,
"learning_rate": 8.082234777150032e-06,
"loss": 2.4634,
"step": 2770
},
{
"epoch": 4.2251806770635225,
"grad_norm": 43.25,
"learning_rate": 7.925298179535467e-06,
"loss": 2.5155,
"step": 2780
},
{
"epoch": 4.2403955876759225,
"grad_norm": 63.75,
"learning_rate": 7.768361581920905e-06,
"loss": 2.5562,
"step": 2790
},
{
"epoch": 4.2556104982883225,
"grad_norm": 180.0,
"learning_rate": 7.611424984306341e-06,
"loss": 2.856,
"step": 2800
},
{
"epoch": 4.2708254089007225,
"grad_norm": 57.25,
"learning_rate": 7.454488386691777e-06,
"loss": 2.8994,
"step": 2810
},
{
"epoch": 4.2860403195131225,
"grad_norm": 38.0,
"learning_rate": 7.297551789077213e-06,
"loss": 2.7051,
"step": 2820
},
{
"epoch": 4.301255230125523,
"grad_norm": 206.0,
"learning_rate": 7.140615191462649e-06,
"loss": 2.7805,
"step": 2830
},
{
"epoch": 4.316470140737923,
"grad_norm": 74.0,
"learning_rate": 6.983678593848085e-06,
"loss": 2.722,
"step": 2840
},
{
"epoch": 4.331685051350323,
"grad_norm": 58.0,
"learning_rate": 6.826741996233522e-06,
"loss": 2.5927,
"step": 2850
},
{
"epoch": 4.346899961962723,
"grad_norm": 49.75,
"learning_rate": 6.669805398618959e-06,
"loss": 2.5875,
"step": 2860
},
{
"epoch": 4.362114872575123,
"grad_norm": 312.0,
"learning_rate": 6.512868801004394e-06,
"loss": 2.5255,
"step": 2870
},
{
"epoch": 4.377329783187523,
"grad_norm": 33.5,
"learning_rate": 6.3559322033898304e-06,
"loss": 2.551,
"step": 2880
},
{
"epoch": 4.392544693799924,
"grad_norm": 51.5,
"learning_rate": 6.1989956057752676e-06,
"loss": 2.5554,
"step": 2890
},
{
"epoch": 4.407759604412324,
"grad_norm": 43.75,
"learning_rate": 6.042059008160703e-06,
"loss": 2.601,
"step": 2900
},
{
"epoch": 4.422974515024724,
"grad_norm": 28.5,
"learning_rate": 5.885122410546139e-06,
"loss": 2.4956,
"step": 2910
},
{
"epoch": 4.438189425637124,
"grad_norm": 30.5,
"learning_rate": 5.728185812931576e-06,
"loss": 2.5699,
"step": 2920
},
{
"epoch": 4.453404336249524,
"grad_norm": 68.5,
"learning_rate": 5.571249215317012e-06,
"loss": 2.5997,
"step": 2930
},
{
"epoch": 4.468619246861925,
"grad_norm": 42.5,
"learning_rate": 5.414312617702449e-06,
"loss": 2.5853,
"step": 2940
},
{
"epoch": 4.483834157474325,
"grad_norm": 124.0,
"learning_rate": 5.2573760200878844e-06,
"loss": 2.6104,
"step": 2950
},
{
"epoch": 4.499049068086725,
"grad_norm": 860.0,
"learning_rate": 5.100439422473321e-06,
"loss": 2.5993,
"step": 2960
},
{
"epoch": 4.514263978699125,
"grad_norm": 42.0,
"learning_rate": 4.943502824858758e-06,
"loss": 2.6001,
"step": 2970
},
{
"epoch": 4.529478889311525,
"grad_norm": 56.25,
"learning_rate": 4.786566227244193e-06,
"loss": 2.4628,
"step": 2980
},
{
"epoch": 4.544693799923925,
"grad_norm": 45.0,
"learning_rate": 4.6296296296296296e-06,
"loss": 2.5943,
"step": 2990
},
{
"epoch": 4.559908710536326,
"grad_norm": 64.0,
"learning_rate": 4.472693032015067e-06,
"loss": 2.6277,
"step": 3000
},
{
"epoch": 4.575123621148726,
"grad_norm": 157.0,
"learning_rate": 4.315756434400502e-06,
"loss": 2.5966,
"step": 3010
},
{
"epoch": 4.590338531761126,
"grad_norm": 208.0,
"learning_rate": 4.1588198367859384e-06,
"loss": 2.5342,
"step": 3020
},
{
"epoch": 4.605553442373526,
"grad_norm": 44.75,
"learning_rate": 4.001883239171375e-06,
"loss": 2.5117,
"step": 3030
},
{
"epoch": 4.620768352985927,
"grad_norm": 159.0,
"learning_rate": 3.844946641556811e-06,
"loss": 2.4587,
"step": 3040
},
{
"epoch": 4.635983263598327,
"grad_norm": 49.75,
"learning_rate": 3.6880100439422477e-06,
"loss": 2.5882,
"step": 3050
},
{
"epoch": 4.651198174210727,
"grad_norm": 31.125,
"learning_rate": 3.531073446327684e-06,
"loss": 2.4809,
"step": 3060
},
{
"epoch": 4.666413084823127,
"grad_norm": 26.625,
"learning_rate": 3.37413684871312e-06,
"loss": 2.5864,
"step": 3070
},
{
"epoch": 4.681627995435527,
"grad_norm": 45.0,
"learning_rate": 3.2172002510985566e-06,
"loss": 2.4663,
"step": 3080
},
{
"epoch": 4.696842906047927,
"grad_norm": 81.0,
"learning_rate": 3.0602636534839924e-06,
"loss": 2.4853,
"step": 3090
},
{
"epoch": 4.712057816660327,
"grad_norm": 976.0,
"learning_rate": 2.903327055869429e-06,
"loss": 2.5038,
"step": 3100
},
{
"epoch": 4.7272727272727275,
"grad_norm": 50.25,
"learning_rate": 2.746390458254865e-06,
"loss": 2.5078,
"step": 3110
},
{
"epoch": 4.7424876378851275,
"grad_norm": 23.375,
"learning_rate": 2.5894538606403013e-06,
"loss": 2.434,
"step": 3120
},
{
"epoch": 4.7577025484975275,
"grad_norm": 69.5,
"learning_rate": 2.4325172630257376e-06,
"loss": 2.5189,
"step": 3130
},
{
"epoch": 4.7729174591099275,
"grad_norm": 22.5,
"learning_rate": 2.2755806654111743e-06,
"loss": 2.5092,
"step": 3140
},
{
"epoch": 4.788132369722328,
"grad_norm": 4192.0,
"learning_rate": 2.11864406779661e-06,
"loss": 2.5068,
"step": 3150
},
{
"epoch": 4.803347280334728,
"grad_norm": 36.0,
"learning_rate": 1.9617074701820464e-06,
"loss": 2.5104,
"step": 3160
},
{
"epoch": 4.818562190947128,
"grad_norm": 55.5,
"learning_rate": 1.804770872567483e-06,
"loss": 2.4943,
"step": 3170
},
{
"epoch": 4.833777101559528,
"grad_norm": 23.0,
"learning_rate": 1.647834274952919e-06,
"loss": 2.4493,
"step": 3180
},
{
"epoch": 4.848992012171928,
"grad_norm": 71.0,
"learning_rate": 1.4908976773383553e-06,
"loss": 2.5597,
"step": 3190
},
{
"epoch": 4.864206922784328,
"grad_norm": 83.0,
"learning_rate": 1.3339610797237918e-06,
"loss": 2.5889,
"step": 3200
},
{
"epoch": 4.879421833396728,
"grad_norm": 49.25,
"learning_rate": 1.1770244821092279e-06,
"loss": 2.5008,
"step": 3210
},
{
"epoch": 4.894636744009129,
"grad_norm": 34.75,
"learning_rate": 1.0200878844946644e-06,
"loss": 2.5614,
"step": 3220
},
{
"epoch": 4.909851654621529,
"grad_norm": 34.25,
"learning_rate": 8.631512868801004e-07,
"loss": 2.5362,
"step": 3230
},
{
"epoch": 4.925066565233929,
"grad_norm": 114.5,
"learning_rate": 7.062146892655367e-07,
"loss": 2.5529,
"step": 3240
},
{
"epoch": 4.940281475846329,
"grad_norm": 444.0,
"learning_rate": 5.49278091650973e-07,
"loss": 2.4373,
"step": 3250
},
{
"epoch": 4.95549638645873,
"grad_norm": 208.0,
"learning_rate": 3.9234149403640934e-07,
"loss": 2.5953,
"step": 3260
},
{
"epoch": 4.97071129707113,
"grad_norm": 101.0,
"learning_rate": 2.3540489642184557e-07,
"loss": 2.5719,
"step": 3270
},
{
"epoch": 4.98592620768353,
"grad_norm": 55.5,
"learning_rate": 7.846829880728186e-08,
"loss": 2.5252,
"step": 3280
}
],
"logging_steps": 10,
"max_steps": 3285,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.590478085395579e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}