|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.99353366298973, |
|
"eval_steps": 500, |
|
"global_step": 3285, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.015214910612400151, |
|
"grad_norm": 62.0, |
|
"learning_rate": 5.050505050505051e-06, |
|
"loss": 6.7184, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.030429821224800303, |
|
"grad_norm": 26.625, |
|
"learning_rate": 1.0101010101010101e-05, |
|
"loss": 5.0379, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.045644731837200456, |
|
"grad_norm": 39.25, |
|
"learning_rate": 1.5151515151515153e-05, |
|
"loss": 4.5296, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.060859642449600605, |
|
"grad_norm": 13.75, |
|
"learning_rate": 2.0202020202020203e-05, |
|
"loss": 4.5222, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07607455306200075, |
|
"grad_norm": 14.5, |
|
"learning_rate": 2.5252525252525256e-05, |
|
"loss": 4.4963, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09128946367440091, |
|
"grad_norm": 15.125, |
|
"learning_rate": 3.0303030303030306e-05, |
|
"loss": 4.7534, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.10650437428680107, |
|
"grad_norm": 92.0, |
|
"learning_rate": 3.535353535353535e-05, |
|
"loss": 4.6209, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12171928489920121, |
|
"grad_norm": 40.5, |
|
"learning_rate": 4.0404040404040405e-05, |
|
"loss": 4.698, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.13693419551160138, |
|
"grad_norm": 22.375, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 4.6103, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1521491061240015, |
|
"grad_norm": 45.0, |
|
"learning_rate": 4.9984306340238544e-05, |
|
"loss": 4.8199, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.16736401673640167, |
|
"grad_norm": 12.125, |
|
"learning_rate": 4.982736974262398e-05, |
|
"loss": 4.7424, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.18257892734880182, |
|
"grad_norm": 18.875, |
|
"learning_rate": 4.967043314500942e-05, |
|
"loss": 4.8093, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.19779383796120198, |
|
"grad_norm": 14.1875, |
|
"learning_rate": 4.9513496547394854e-05, |
|
"loss": 4.9051, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.21300874857360214, |
|
"grad_norm": 9.375, |
|
"learning_rate": 4.935655994978029e-05, |
|
"loss": 4.8232, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2282236591860023, |
|
"grad_norm": 13.75, |
|
"learning_rate": 4.919962335216573e-05, |
|
"loss": 4.7461, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.24343856979840242, |
|
"grad_norm": 9.625, |
|
"learning_rate": 4.9042686754551165e-05, |
|
"loss": 4.7576, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2586534804108026, |
|
"grad_norm": 9.375, |
|
"learning_rate": 4.88857501569366e-05, |
|
"loss": 4.7951, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.27386839102320276, |
|
"grad_norm": 9.125, |
|
"learning_rate": 4.8728813559322034e-05, |
|
"loss": 4.7547, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2890833016356029, |
|
"grad_norm": 9.125, |
|
"learning_rate": 4.8571876961707475e-05, |
|
"loss": 4.7108, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.304298212248003, |
|
"grad_norm": 11.125, |
|
"learning_rate": 4.841494036409291e-05, |
|
"loss": 4.6421, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3195131228604032, |
|
"grad_norm": 82.5, |
|
"learning_rate": 4.8258003766478345e-05, |
|
"loss": 4.6211, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.33472803347280333, |
|
"grad_norm": 96.5, |
|
"learning_rate": 4.8101067168863786e-05, |
|
"loss": 5.1034, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3499429440852035, |
|
"grad_norm": 62.25, |
|
"learning_rate": 4.794413057124922e-05, |
|
"loss": 6.1655, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.36515785469760365, |
|
"grad_norm": 116.0, |
|
"learning_rate": 4.7787193973634655e-05, |
|
"loss": 5.6462, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3803727653100038, |
|
"grad_norm": 147.0, |
|
"learning_rate": 4.763025737602009e-05, |
|
"loss": 5.1807, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.39558767592240396, |
|
"grad_norm": 19.5, |
|
"learning_rate": 4.747332077840553e-05, |
|
"loss": 4.7096, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4108025865348041, |
|
"grad_norm": 52.5, |
|
"learning_rate": 4.7316384180790966e-05, |
|
"loss": 4.5789, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.42601749714720427, |
|
"grad_norm": 310.0, |
|
"learning_rate": 4.71594475831764e-05, |
|
"loss": 5.1039, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.44123240775960443, |
|
"grad_norm": 153.0, |
|
"learning_rate": 4.7002510985561835e-05, |
|
"loss": 5.1985, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4564473183720046, |
|
"grad_norm": 39.5, |
|
"learning_rate": 4.684557438794727e-05, |
|
"loss": 5.1797, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.47166222898440474, |
|
"grad_norm": 100.0, |
|
"learning_rate": 4.6688637790332704e-05, |
|
"loss": 5.1788, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.48687713959680484, |
|
"grad_norm": 41.5, |
|
"learning_rate": 4.6531701192718145e-05, |
|
"loss": 5.046, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.502092050209205, |
|
"grad_norm": 964.0, |
|
"learning_rate": 4.637476459510358e-05, |
|
"loss": 5.0876, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5173069608216052, |
|
"grad_norm": 192.0, |
|
"learning_rate": 4.6217827997489015e-05, |
|
"loss": 5.0314, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5325218714340053, |
|
"grad_norm": 41.75, |
|
"learning_rate": 4.606089139987445e-05, |
|
"loss": 5.0153, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5477367820464055, |
|
"grad_norm": 51.0, |
|
"learning_rate": 4.590395480225989e-05, |
|
"loss": 4.7087, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5629516926588056, |
|
"grad_norm": 101.0, |
|
"learning_rate": 4.5747018204645325e-05, |
|
"loss": 4.8249, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5781666032712058, |
|
"grad_norm": 32.25, |
|
"learning_rate": 4.559008160703076e-05, |
|
"loss": 4.9544, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5933815138836059, |
|
"grad_norm": 18.5, |
|
"learning_rate": 4.54331450094162e-05, |
|
"loss": 4.7081, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.608596424496006, |
|
"grad_norm": 49.25, |
|
"learning_rate": 4.5276208411801636e-05, |
|
"loss": 4.9085, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6238113351084063, |
|
"grad_norm": 446.0, |
|
"learning_rate": 4.511927181418707e-05, |
|
"loss": 4.8538, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6390262457208064, |
|
"grad_norm": 59.25, |
|
"learning_rate": 4.4962335216572505e-05, |
|
"loss": 4.6111, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6542411563332066, |
|
"grad_norm": 60.0, |
|
"learning_rate": 4.4805398618957946e-05, |
|
"loss": 4.6564, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6694560669456067, |
|
"grad_norm": 37.5, |
|
"learning_rate": 4.464846202134338e-05, |
|
"loss": 4.6277, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6846709775580069, |
|
"grad_norm": 8.75, |
|
"learning_rate": 4.4491525423728816e-05, |
|
"loss": 4.4838, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.699885888170407, |
|
"grad_norm": 20.375, |
|
"learning_rate": 4.433458882611426e-05, |
|
"loss": 4.6507, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7151007987828072, |
|
"grad_norm": 22.0, |
|
"learning_rate": 4.417765222849969e-05, |
|
"loss": 4.7077, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7303157093952073, |
|
"grad_norm": 12.25, |
|
"learning_rate": 4.4020715630885126e-05, |
|
"loss": 4.5839, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7455306200076075, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 4.386377903327056e-05, |
|
"loss": 4.5522, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7607455306200076, |
|
"grad_norm": 13.3125, |
|
"learning_rate": 4.3706842435655995e-05, |
|
"loss": 4.686, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7759604412324077, |
|
"grad_norm": 14.5625, |
|
"learning_rate": 4.354990583804143e-05, |
|
"loss": 4.444, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.7911753518448079, |
|
"grad_norm": 580.0, |
|
"learning_rate": 4.3392969240426864e-05, |
|
"loss": 5.0163, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.806390262457208, |
|
"grad_norm": 54.25, |
|
"learning_rate": 4.3236032642812306e-05, |
|
"loss": 5.1205, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.8216051730696082, |
|
"grad_norm": 43.5, |
|
"learning_rate": 4.307909604519774e-05, |
|
"loss": 4.7013, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8368200836820083, |
|
"grad_norm": 22.125, |
|
"learning_rate": 4.2922159447583175e-05, |
|
"loss": 4.5653, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8520349942944085, |
|
"grad_norm": 19.0, |
|
"learning_rate": 4.2765222849968616e-05, |
|
"loss": 4.465, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8672499049068086, |
|
"grad_norm": 72.0, |
|
"learning_rate": 4.260828625235405e-05, |
|
"loss": 4.6378, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.8824648155192089, |
|
"grad_norm": 14.4375, |
|
"learning_rate": 4.2451349654739486e-05, |
|
"loss": 4.6684, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.897679726131609, |
|
"grad_norm": 75.5, |
|
"learning_rate": 4.229441305712492e-05, |
|
"loss": 4.5414, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.9128946367440092, |
|
"grad_norm": 808.0, |
|
"learning_rate": 4.213747645951036e-05, |
|
"loss": 4.476, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9281095473564093, |
|
"grad_norm": 18.625, |
|
"learning_rate": 4.1980539861895796e-05, |
|
"loss": 4.4283, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9433244579688095, |
|
"grad_norm": 20.25, |
|
"learning_rate": 4.182360326428123e-05, |
|
"loss": 4.4041, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9585393685812096, |
|
"grad_norm": 430.0, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 4.3725, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9737542791936097, |
|
"grad_norm": 246.0, |
|
"learning_rate": 4.150973006905211e-05, |
|
"loss": 4.591, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9889691898060099, |
|
"grad_norm": 15.25, |
|
"learning_rate": 4.135279347143754e-05, |
|
"loss": 4.684, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.00304298212248, |
|
"grad_norm": 19.0, |
|
"learning_rate": 4.119585687382298e-05, |
|
"loss": 3.8939, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.0182578927348802, |
|
"grad_norm": 16.5, |
|
"learning_rate": 4.103892027620842e-05, |
|
"loss": 3.9108, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.0334728033472804, |
|
"grad_norm": 18.625, |
|
"learning_rate": 4.088198367859385e-05, |
|
"loss": 4.0937, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.0486877139596804, |
|
"grad_norm": 109.0, |
|
"learning_rate": 4.0725047080979286e-05, |
|
"loss": 3.8888, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.0639026245720806, |
|
"grad_norm": 14.6875, |
|
"learning_rate": 4.056811048336472e-05, |
|
"loss": 4.0143, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0791175351844808, |
|
"grad_norm": 22.25, |
|
"learning_rate": 4.0411173885750156e-05, |
|
"loss": 3.8516, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.0943324457968808, |
|
"grad_norm": 23.0, |
|
"learning_rate": 4.025423728813559e-05, |
|
"loss": 3.95, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.109547356409281, |
|
"grad_norm": 13.375, |
|
"learning_rate": 4.009730069052103e-05, |
|
"loss": 3.913, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.1247622670216813, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 3.9940364092906466e-05, |
|
"loss": 3.9559, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.1399771776340815, |
|
"grad_norm": 11.125, |
|
"learning_rate": 3.97834274952919e-05, |
|
"loss": 3.9732, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.1551920882464817, |
|
"grad_norm": 20.375, |
|
"learning_rate": 3.962649089767734e-05, |
|
"loss": 4.0201, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.1704069988588817, |
|
"grad_norm": 23.5, |
|
"learning_rate": 3.946955430006278e-05, |
|
"loss": 3.9096, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.1856219094712819, |
|
"grad_norm": 17.875, |
|
"learning_rate": 3.931261770244821e-05, |
|
"loss": 3.9626, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.200836820083682, |
|
"grad_norm": 18.375, |
|
"learning_rate": 3.9155681104833646e-05, |
|
"loss": 3.9349, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.216051730696082, |
|
"grad_norm": 36.0, |
|
"learning_rate": 3.899874450721909e-05, |
|
"loss": 3.8503, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.2312666413084823, |
|
"grad_norm": 18.625, |
|
"learning_rate": 3.884180790960452e-05, |
|
"loss": 3.9523, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.2464815519208825, |
|
"grad_norm": 13.6875, |
|
"learning_rate": 3.8684871311989956e-05, |
|
"loss": 3.7665, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.2616964625332825, |
|
"grad_norm": 14.375, |
|
"learning_rate": 3.85279347143754e-05, |
|
"loss": 3.8204, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.2769113731456827, |
|
"grad_norm": 14.625, |
|
"learning_rate": 3.837099811676083e-05, |
|
"loss": 3.8995, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.292126283758083, |
|
"grad_norm": 20.375, |
|
"learning_rate": 3.821406151914627e-05, |
|
"loss": 3.9207, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.3073411943704831, |
|
"grad_norm": 12.4375, |
|
"learning_rate": 3.80571249215317e-05, |
|
"loss": 3.9366, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.3225561049828833, |
|
"grad_norm": 22.25, |
|
"learning_rate": 3.790018832391714e-05, |
|
"loss": 3.7521, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.3377710155952833, |
|
"grad_norm": 20.625, |
|
"learning_rate": 3.774325172630258e-05, |
|
"loss": 3.8315, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.3529859262076835, |
|
"grad_norm": 21.875, |
|
"learning_rate": 3.758631512868801e-05, |
|
"loss": 3.8439, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.3682008368200838, |
|
"grad_norm": 23.75, |
|
"learning_rate": 3.7429378531073453e-05, |
|
"loss": 3.773, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.3834157474324837, |
|
"grad_norm": 134.0, |
|
"learning_rate": 3.727244193345889e-05, |
|
"loss": 3.8877, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.398630658044884, |
|
"grad_norm": 24.625, |
|
"learning_rate": 3.711550533584432e-05, |
|
"loss": 3.8059, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.4138455686572842, |
|
"grad_norm": 103.0, |
|
"learning_rate": 3.695856873822976e-05, |
|
"loss": 3.8337, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.4290604792696842, |
|
"grad_norm": 20.375, |
|
"learning_rate": 3.680163214061519e-05, |
|
"loss": 3.8493, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.4442753898820844, |
|
"grad_norm": 23.625, |
|
"learning_rate": 3.6644695543000626e-05, |
|
"loss": 3.8309, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.4594903004944846, |
|
"grad_norm": 17.625, |
|
"learning_rate": 3.648775894538606e-05, |
|
"loss": 3.7735, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.4747052111068848, |
|
"grad_norm": 20.0, |
|
"learning_rate": 3.63308223477715e-05, |
|
"loss": 3.7216, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.489920121719285, |
|
"grad_norm": 18.75, |
|
"learning_rate": 3.617388575015694e-05, |
|
"loss": 3.8008, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.505135032331685, |
|
"grad_norm": 25.25, |
|
"learning_rate": 3.601694915254237e-05, |
|
"loss": 3.8333, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.5203499429440852, |
|
"grad_norm": 16.375, |
|
"learning_rate": 3.586001255492781e-05, |
|
"loss": 3.8395, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.5355648535564854, |
|
"grad_norm": 16.5, |
|
"learning_rate": 3.570307595731325e-05, |
|
"loss": 3.8099, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.5507797641688854, |
|
"grad_norm": 29.875, |
|
"learning_rate": 3.554613935969868e-05, |
|
"loss": 3.8804, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.5659946747812856, |
|
"grad_norm": 211.0, |
|
"learning_rate": 3.538920276208412e-05, |
|
"loss": 4.9431, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.5812095853936858, |
|
"grad_norm": 39.0, |
|
"learning_rate": 3.523226616446956e-05, |
|
"loss": 4.2009, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.5964244960060858, |
|
"grad_norm": 34.5, |
|
"learning_rate": 3.507532956685499e-05, |
|
"loss": 3.7996, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.6116394066184863, |
|
"grad_norm": 136.0, |
|
"learning_rate": 3.491839296924043e-05, |
|
"loss": 3.7328, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.6268543172308862, |
|
"grad_norm": 70.5, |
|
"learning_rate": 3.476145637162587e-05, |
|
"loss": 3.8927, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.6420692278432865, |
|
"grad_norm": 185.0, |
|
"learning_rate": 3.46045197740113e-05, |
|
"loss": 4.4253, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.6572841384556867, |
|
"grad_norm": 25.25, |
|
"learning_rate": 3.444758317639674e-05, |
|
"loss": 4.2154, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.6724990490680867, |
|
"grad_norm": 40.0, |
|
"learning_rate": 3.429064657878218e-05, |
|
"loss": 3.9789, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.6877139596804869, |
|
"grad_norm": 41.5, |
|
"learning_rate": 3.4133709981167614e-05, |
|
"loss": 4.7068, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.702928870292887, |
|
"grad_norm": 21.625, |
|
"learning_rate": 3.397677338355305e-05, |
|
"loss": 4.134, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.718143780905287, |
|
"grad_norm": 60.25, |
|
"learning_rate": 3.381983678593848e-05, |
|
"loss": 4.0554, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.7333586915176873, |
|
"grad_norm": 36.25, |
|
"learning_rate": 3.366290018832392e-05, |
|
"loss": 4.0661, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.7485736021300875, |
|
"grad_norm": 135.0, |
|
"learning_rate": 3.350596359070935e-05, |
|
"loss": 4.1099, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.7637885127424875, |
|
"grad_norm": 17.375, |
|
"learning_rate": 3.334902699309479e-05, |
|
"loss": 4.1226, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.779003423354888, |
|
"grad_norm": 40.25, |
|
"learning_rate": 3.319209039548023e-05, |
|
"loss": 4.1996, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.794218333967288, |
|
"grad_norm": 54.25, |
|
"learning_rate": 3.303515379786566e-05, |
|
"loss": 4.2895, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.8094332445796881, |
|
"grad_norm": 82.5, |
|
"learning_rate": 3.28782172002511e-05, |
|
"loss": 4.0929, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.8246481551920883, |
|
"grad_norm": 49.75, |
|
"learning_rate": 3.272128060263653e-05, |
|
"loss": 4.1792, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.8398630658044883, |
|
"grad_norm": 34.0, |
|
"learning_rate": 3.256434400502197e-05, |
|
"loss": 4.0729, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.8550779764168885, |
|
"grad_norm": 33.5, |
|
"learning_rate": 3.240740740740741e-05, |
|
"loss": 3.9144, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.8702928870292888, |
|
"grad_norm": 24.125, |
|
"learning_rate": 3.225047080979284e-05, |
|
"loss": 3.9079, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.8855077976416887, |
|
"grad_norm": 25.25, |
|
"learning_rate": 3.2093534212178284e-05, |
|
"loss": 3.78, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.900722708254089, |
|
"grad_norm": 37.0, |
|
"learning_rate": 3.193659761456372e-05, |
|
"loss": 4.0513, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.9159376188664892, |
|
"grad_norm": 31.5, |
|
"learning_rate": 3.177966101694915e-05, |
|
"loss": 4.0164, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.9311525294788892, |
|
"grad_norm": 18.375, |
|
"learning_rate": 3.1622724419334594e-05, |
|
"loss": 4.0181, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.9463674400912896, |
|
"grad_norm": 38.5, |
|
"learning_rate": 3.146578782172003e-05, |
|
"loss": 4.104, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.9615823507036896, |
|
"grad_norm": 132.0, |
|
"learning_rate": 3.1308851224105464e-05, |
|
"loss": 4.1902, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.9767972613160898, |
|
"grad_norm": 46.75, |
|
"learning_rate": 3.11519146264909e-05, |
|
"loss": 4.318, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.99201217192849, |
|
"grad_norm": 28.125, |
|
"learning_rate": 3.099497802887634e-05, |
|
"loss": 4.0396, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.00608596424496, |
|
"grad_norm": 30.0, |
|
"learning_rate": 3.0838041431261774e-05, |
|
"loss": 3.4143, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.0213008748573604, |
|
"grad_norm": 29.875, |
|
"learning_rate": 3.068110483364721e-05, |
|
"loss": 3.3867, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.0365157854697604, |
|
"grad_norm": 34.25, |
|
"learning_rate": 3.052416823603264e-05, |
|
"loss": 3.3467, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.0517306960821604, |
|
"grad_norm": 64.0, |
|
"learning_rate": 3.036723163841808e-05, |
|
"loss": 3.403, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.066945606694561, |
|
"grad_norm": 19.0, |
|
"learning_rate": 3.0210295040803516e-05, |
|
"loss": 3.3264, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.082160517306961, |
|
"grad_norm": 21.5, |
|
"learning_rate": 3.005335844318895e-05, |
|
"loss": 3.3493, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.097375427919361, |
|
"grad_norm": 32.75, |
|
"learning_rate": 2.9896421845574392e-05, |
|
"loss": 3.1572, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.1125903385317613, |
|
"grad_norm": 50.25, |
|
"learning_rate": 2.9739485247959826e-05, |
|
"loss": 3.2606, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.1278052491441612, |
|
"grad_norm": 29.875, |
|
"learning_rate": 2.958254865034526e-05, |
|
"loss": 3.2852, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.1430201597565612, |
|
"grad_norm": 29.125, |
|
"learning_rate": 2.94256120527307e-05, |
|
"loss": 3.1659, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.1582350703689617, |
|
"grad_norm": 50.0, |
|
"learning_rate": 2.9268675455116134e-05, |
|
"loss": 3.1076, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.1734499809813617, |
|
"grad_norm": 28.375, |
|
"learning_rate": 2.9111738857501568e-05, |
|
"loss": 3.2414, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.1886648915937617, |
|
"grad_norm": 55.25, |
|
"learning_rate": 2.895480225988701e-05, |
|
"loss": 3.1755, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.203879802206162, |
|
"grad_norm": 20.0, |
|
"learning_rate": 2.8797865662272444e-05, |
|
"loss": 3.2081, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.219094712818562, |
|
"grad_norm": 31.5, |
|
"learning_rate": 2.864092906465788e-05, |
|
"loss": 3.1076, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.2343096234309625, |
|
"grad_norm": 24.25, |
|
"learning_rate": 2.8483992467043313e-05, |
|
"loss": 3.1507, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.2495245340433625, |
|
"grad_norm": 58.25, |
|
"learning_rate": 2.8327055869428755e-05, |
|
"loss": 3.2108, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.2647394446557625, |
|
"grad_norm": 65.0, |
|
"learning_rate": 2.817011927181419e-05, |
|
"loss": 3.2844, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.279954355268163, |
|
"grad_norm": 42.25, |
|
"learning_rate": 2.8013182674199624e-05, |
|
"loss": 3.2657, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.295169265880563, |
|
"grad_norm": 17.375, |
|
"learning_rate": 2.7856246076585062e-05, |
|
"loss": 3.267, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.3103841764929633, |
|
"grad_norm": 26.75, |
|
"learning_rate": 2.7699309478970496e-05, |
|
"loss": 3.2966, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.3255990871053633, |
|
"grad_norm": 30.875, |
|
"learning_rate": 2.754237288135593e-05, |
|
"loss": 3.217, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.3408139977177633, |
|
"grad_norm": 24.75, |
|
"learning_rate": 2.7385436283741372e-05, |
|
"loss": 3.1645, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.3560289083301633, |
|
"grad_norm": 35.0, |
|
"learning_rate": 2.7228499686126807e-05, |
|
"loss": 3.1515, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.3712438189425638, |
|
"grad_norm": 51.75, |
|
"learning_rate": 2.707156308851224e-05, |
|
"loss": 3.0929, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.3864587295549637, |
|
"grad_norm": 30.875, |
|
"learning_rate": 2.6914626490897676e-05, |
|
"loss": 3.1356, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.401673640167364, |
|
"grad_norm": 35.5, |
|
"learning_rate": 2.6757689893283118e-05, |
|
"loss": 3.1858, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.416888550779764, |
|
"grad_norm": 33.0, |
|
"learning_rate": 2.6600753295668552e-05, |
|
"loss": 3.1423, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.432103461392164, |
|
"grad_norm": 44.0, |
|
"learning_rate": 2.6443816698053987e-05, |
|
"loss": 3.0862, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.4473183720045646, |
|
"grad_norm": 23.125, |
|
"learning_rate": 2.6286880100439425e-05, |
|
"loss": 3.0965, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.4625332826169646, |
|
"grad_norm": 29.5, |
|
"learning_rate": 2.612994350282486e-05, |
|
"loss": 3.11, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.477748193229365, |
|
"grad_norm": 37.5, |
|
"learning_rate": 2.5973006905210294e-05, |
|
"loss": 3.0674, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.492963103841765, |
|
"grad_norm": 44.5, |
|
"learning_rate": 2.581607030759573e-05, |
|
"loss": 3.1255, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.508178014454165, |
|
"grad_norm": 44.5, |
|
"learning_rate": 2.565913370998117e-05, |
|
"loss": 3.0898, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.523392925066565, |
|
"grad_norm": 30.25, |
|
"learning_rate": 2.5502197112366604e-05, |
|
"loss": 3.0759, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.5386078356789654, |
|
"grad_norm": 41.0, |
|
"learning_rate": 2.534526051475204e-05, |
|
"loss": 3.2519, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.5538227462913654, |
|
"grad_norm": 22.25, |
|
"learning_rate": 2.518832391713748e-05, |
|
"loss": 3.1336, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.569037656903766, |
|
"grad_norm": 47.0, |
|
"learning_rate": 2.5031387319522915e-05, |
|
"loss": 3.1522, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.584252567516166, |
|
"grad_norm": 41.25, |
|
"learning_rate": 2.487445072190835e-05, |
|
"loss": 3.1463, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.599467478128566, |
|
"grad_norm": 33.75, |
|
"learning_rate": 2.4717514124293788e-05, |
|
"loss": 3.1115, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.6146823887409663, |
|
"grad_norm": 27.5, |
|
"learning_rate": 2.4560577526679222e-05, |
|
"loss": 3.1908, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.6298972993533662, |
|
"grad_norm": 59.5, |
|
"learning_rate": 2.4403640929064657e-05, |
|
"loss": 3.035, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.6451122099657667, |
|
"grad_norm": 34.75, |
|
"learning_rate": 2.4246704331450095e-05, |
|
"loss": 3.0231, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.6603271205781667, |
|
"grad_norm": 34.75, |
|
"learning_rate": 2.408976773383553e-05, |
|
"loss": 3.0962, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.6755420311905667, |
|
"grad_norm": 20.625, |
|
"learning_rate": 2.3932831136220967e-05, |
|
"loss": 3.1877, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.6907569418029667, |
|
"grad_norm": 38.25, |
|
"learning_rate": 2.3775894538606405e-05, |
|
"loss": 2.9851, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.705971852415367, |
|
"grad_norm": 24.125, |
|
"learning_rate": 2.361895794099184e-05, |
|
"loss": 3.1095, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.721186763027767, |
|
"grad_norm": 30.5, |
|
"learning_rate": 2.3462021343377278e-05, |
|
"loss": 3.0544, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.7364016736401675, |
|
"grad_norm": 32.0, |
|
"learning_rate": 2.3305084745762712e-05, |
|
"loss": 3.0336, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.7516165842525675, |
|
"grad_norm": 48.5, |
|
"learning_rate": 2.314814814814815e-05, |
|
"loss": 3.1227, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.7668314948649675, |
|
"grad_norm": 37.25, |
|
"learning_rate": 2.299121155053359e-05, |
|
"loss": 3.0989, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.782046405477368, |
|
"grad_norm": 20.25, |
|
"learning_rate": 2.2834274952919023e-05, |
|
"loss": 3.158, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.797261316089768, |
|
"grad_norm": 35.0, |
|
"learning_rate": 2.2677338355304458e-05, |
|
"loss": 3.1081, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.8124762267021683, |
|
"grad_norm": 49.0, |
|
"learning_rate": 2.2520401757689892e-05, |
|
"loss": 3.1436, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.8276911373145683, |
|
"grad_norm": 18.625, |
|
"learning_rate": 2.236346516007533e-05, |
|
"loss": 3.0876, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.8429060479269683, |
|
"grad_norm": 26.375, |
|
"learning_rate": 2.2206528562460768e-05, |
|
"loss": 3.0573, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.8581209585393683, |
|
"grad_norm": 44.0, |
|
"learning_rate": 2.2049591964846203e-05, |
|
"loss": 3.0853, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.8733358691517688, |
|
"grad_norm": 33.0, |
|
"learning_rate": 2.189265536723164e-05, |
|
"loss": 3.2242, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.8885507797641687, |
|
"grad_norm": 89.0, |
|
"learning_rate": 2.1735718769617075e-05, |
|
"loss": 3.1315, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.903765690376569, |
|
"grad_norm": 29.875, |
|
"learning_rate": 2.1578782172002513e-05, |
|
"loss": 3.1638, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.918980600988969, |
|
"grad_norm": 27.125, |
|
"learning_rate": 2.1421845574387948e-05, |
|
"loss": 3.0408, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.934195511601369, |
|
"grad_norm": 42.5, |
|
"learning_rate": 2.1264908976773386e-05, |
|
"loss": 3.0296, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.9494104222137696, |
|
"grad_norm": 37.5, |
|
"learning_rate": 2.110797237915882e-05, |
|
"loss": 3.1607, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.9646253328261696, |
|
"grad_norm": 31.5, |
|
"learning_rate": 2.0951035781544255e-05, |
|
"loss": 3.0746, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.97984024343857, |
|
"grad_norm": 40.0, |
|
"learning_rate": 2.0794099183929693e-05, |
|
"loss": 3.0749, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.99505515405097, |
|
"grad_norm": 21.375, |
|
"learning_rate": 2.0637162586315128e-05, |
|
"loss": 3.1096, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.00912894636744, |
|
"grad_norm": 37.25, |
|
"learning_rate": 2.0480225988700566e-05, |
|
"loss": 2.5168, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.0243438569798404, |
|
"grad_norm": 37.75, |
|
"learning_rate": 2.0323289391086004e-05, |
|
"loss": 2.4061, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.0395587675922404, |
|
"grad_norm": 28.125, |
|
"learning_rate": 2.0166352793471438e-05, |
|
"loss": 2.4859, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.0547736782046404, |
|
"grad_norm": 29.375, |
|
"learning_rate": 2.0009416195856876e-05, |
|
"loss": 2.3777, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 3.069988588817041, |
|
"grad_norm": 88.5, |
|
"learning_rate": 1.985247959824231e-05, |
|
"loss": 2.4563, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.085203499429441, |
|
"grad_norm": 33.0, |
|
"learning_rate": 1.969554300062775e-05, |
|
"loss": 2.393, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 3.100418410041841, |
|
"grad_norm": 25.5, |
|
"learning_rate": 1.9538606403013183e-05, |
|
"loss": 2.4262, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.1156333206542413, |
|
"grad_norm": 52.25, |
|
"learning_rate": 1.9381669805398618e-05, |
|
"loss": 2.394, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.1308482312666412, |
|
"grad_norm": 28.25, |
|
"learning_rate": 1.9224733207784056e-05, |
|
"loss": 2.4209, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.1460631418790417, |
|
"grad_norm": 24.125, |
|
"learning_rate": 1.906779661016949e-05, |
|
"loss": 2.4335, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.1612780524914417, |
|
"grad_norm": 35.5, |
|
"learning_rate": 1.891086001255493e-05, |
|
"loss": 2.4935, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.1764929631038417, |
|
"grad_norm": 42.5, |
|
"learning_rate": 1.8753923414940363e-05, |
|
"loss": 2.4604, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 3.191707873716242, |
|
"grad_norm": 29.25, |
|
"learning_rate": 1.85969868173258e-05, |
|
"loss": 2.4856, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.206922784328642, |
|
"grad_norm": 34.75, |
|
"learning_rate": 1.844005021971124e-05, |
|
"loss": 2.4311, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 3.222137694941042, |
|
"grad_norm": 56.0, |
|
"learning_rate": 1.8283113622096674e-05, |
|
"loss": 2.4334, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.2373526055534425, |
|
"grad_norm": 38.25, |
|
"learning_rate": 1.812617702448211e-05, |
|
"loss": 2.5363, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 3.2525675161658425, |
|
"grad_norm": 36.25, |
|
"learning_rate": 1.7969240426867546e-05, |
|
"loss": 2.4689, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.2677824267782425, |
|
"grad_norm": 27.375, |
|
"learning_rate": 1.7812303829252984e-05, |
|
"loss": 2.522, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.282997337390643, |
|
"grad_norm": 37.5, |
|
"learning_rate": 1.765536723163842e-05, |
|
"loss": 2.561, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 3.298212248003043, |
|
"grad_norm": 56.5, |
|
"learning_rate": 1.7498430634023853e-05, |
|
"loss": 2.5073, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 3.3134271586154433, |
|
"grad_norm": 31.875, |
|
"learning_rate": 1.734149403640929e-05, |
|
"loss": 2.6292, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 3.3286420692278433, |
|
"grad_norm": 66.5, |
|
"learning_rate": 1.7184557438794726e-05, |
|
"loss": 2.5683, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 3.3438569798402433, |
|
"grad_norm": 163.0, |
|
"learning_rate": 1.7027620841180164e-05, |
|
"loss": 2.4828, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.3590718904526438, |
|
"grad_norm": 28.625, |
|
"learning_rate": 1.6870684243565602e-05, |
|
"loss": 2.4985, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 3.3742868010650438, |
|
"grad_norm": 26.5, |
|
"learning_rate": 1.6713747645951036e-05, |
|
"loss": 2.5606, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 3.3895017116774437, |
|
"grad_norm": 29.0, |
|
"learning_rate": 1.6556811048336474e-05, |
|
"loss": 2.5204, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 3.404716622289844, |
|
"grad_norm": 97.5, |
|
"learning_rate": 1.639987445072191e-05, |
|
"loss": 2.5655, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 3.419931532902244, |
|
"grad_norm": 110.0, |
|
"learning_rate": 1.6242937853107347e-05, |
|
"loss": 2.5357, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.435146443514644, |
|
"grad_norm": 41.75, |
|
"learning_rate": 1.608600125549278e-05, |
|
"loss": 2.5904, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 3.4503613541270446, |
|
"grad_norm": 79.5, |
|
"learning_rate": 1.5929064657878216e-05, |
|
"loss": 2.5566, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 3.4655762647394446, |
|
"grad_norm": 51.25, |
|
"learning_rate": 1.5772128060263654e-05, |
|
"loss": 2.7638, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 3.480791175351845, |
|
"grad_norm": 45.5, |
|
"learning_rate": 1.561519146264909e-05, |
|
"loss": 2.6242, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 3.496006085964245, |
|
"grad_norm": 35.0, |
|
"learning_rate": 1.5458254865034527e-05, |
|
"loss": 2.5996, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.511220996576645, |
|
"grad_norm": 33.75, |
|
"learning_rate": 1.530131826741996e-05, |
|
"loss": 2.6636, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 3.5264359071890454, |
|
"grad_norm": 36.75, |
|
"learning_rate": 1.51443816698054e-05, |
|
"loss": 2.5936, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 3.5416508178014454, |
|
"grad_norm": 73.5, |
|
"learning_rate": 1.4987445072190837e-05, |
|
"loss": 2.6029, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 3.5568657284138454, |
|
"grad_norm": 33.25, |
|
"learning_rate": 1.4830508474576272e-05, |
|
"loss": 2.6107, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 3.572080639026246, |
|
"grad_norm": 31.0, |
|
"learning_rate": 1.4673571876961708e-05, |
|
"loss": 2.6401, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.587295549638646, |
|
"grad_norm": 46.75, |
|
"learning_rate": 1.4516635279347143e-05, |
|
"loss": 2.6327, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 3.602510460251046, |
|
"grad_norm": 2928.0, |
|
"learning_rate": 1.435969868173258e-05, |
|
"loss": 2.668, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 3.6177253708634463, |
|
"grad_norm": 31.375, |
|
"learning_rate": 1.4202762084118019e-05, |
|
"loss": 2.6043, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 3.6329402814758462, |
|
"grad_norm": 27.75, |
|
"learning_rate": 1.4045825486503453e-05, |
|
"loss": 2.6445, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 3.6481551920882467, |
|
"grad_norm": 44.0, |
|
"learning_rate": 1.388888888888889e-05, |
|
"loss": 2.5699, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.6633701027006467, |
|
"grad_norm": 29.875, |
|
"learning_rate": 1.3731952291274324e-05, |
|
"loss": 2.5164, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 3.6785850133130467, |
|
"grad_norm": 30.75, |
|
"learning_rate": 1.3575015693659762e-05, |
|
"loss": 2.6119, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 3.693799923925447, |
|
"grad_norm": 74.5, |
|
"learning_rate": 1.34180790960452e-05, |
|
"loss": 2.6144, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 3.709014834537847, |
|
"grad_norm": 38.5, |
|
"learning_rate": 1.3261142498430635e-05, |
|
"loss": 2.6119, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 3.7242297451502475, |
|
"grad_norm": 70.5, |
|
"learning_rate": 1.3104205900816071e-05, |
|
"loss": 2.5528, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 3.7394446557626475, |
|
"grad_norm": 31.5, |
|
"learning_rate": 1.2947269303201506e-05, |
|
"loss": 2.6334, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 3.7546595663750475, |
|
"grad_norm": 41.5, |
|
"learning_rate": 1.2790332705586944e-05, |
|
"loss": 2.6298, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 3.7698744769874475, |
|
"grad_norm": 46.5, |
|
"learning_rate": 1.2633396107972378e-05, |
|
"loss": 2.563, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 3.785089387599848, |
|
"grad_norm": 29.125, |
|
"learning_rate": 1.2476459510357816e-05, |
|
"loss": 2.5884, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 3.800304298212248, |
|
"grad_norm": 35.25, |
|
"learning_rate": 1.2319522912743252e-05, |
|
"loss": 2.6457, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.8155192088246483, |
|
"grad_norm": 22.375, |
|
"learning_rate": 1.2162586315128689e-05, |
|
"loss": 2.5394, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 3.8307341194370483, |
|
"grad_norm": 48.75, |
|
"learning_rate": 1.2005649717514125e-05, |
|
"loss": 2.5998, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 3.8459490300494483, |
|
"grad_norm": 30.375, |
|
"learning_rate": 1.1848713119899561e-05, |
|
"loss": 2.6238, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 3.8611639406618488, |
|
"grad_norm": 38.0, |
|
"learning_rate": 1.1691776522284998e-05, |
|
"loss": 2.6113, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 3.8763788512742487, |
|
"grad_norm": 39.0, |
|
"learning_rate": 1.1534839924670434e-05, |
|
"loss": 2.7116, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 3.891593761886649, |
|
"grad_norm": 44.75, |
|
"learning_rate": 1.137790332705587e-05, |
|
"loss": 2.6315, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 3.906808672499049, |
|
"grad_norm": 27.0, |
|
"learning_rate": 1.1220966729441306e-05, |
|
"loss": 2.6728, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 3.922023583111449, |
|
"grad_norm": 37.25, |
|
"learning_rate": 1.1064030131826743e-05, |
|
"loss": 2.7066, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 3.937238493723849, |
|
"grad_norm": 75.5, |
|
"learning_rate": 1.0907093534212179e-05, |
|
"loss": 2.5966, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 3.9524534043362496, |
|
"grad_norm": 37.5, |
|
"learning_rate": 1.0750156936597615e-05, |
|
"loss": 2.6743, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.9676683149486496, |
|
"grad_norm": 22.5, |
|
"learning_rate": 1.0593220338983052e-05, |
|
"loss": 2.5749, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 3.98288322556105, |
|
"grad_norm": 27.125, |
|
"learning_rate": 1.0436283741368488e-05, |
|
"loss": 2.4513, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 3.99809813617345, |
|
"grad_norm": 32.25, |
|
"learning_rate": 1.0279347143753924e-05, |
|
"loss": 2.5954, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 4.01217192848992, |
|
"grad_norm": 27.75, |
|
"learning_rate": 1.012241054613936e-05, |
|
"loss": 2.1217, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 4.02738683910232, |
|
"grad_norm": 37.0, |
|
"learning_rate": 9.965473948524797e-06, |
|
"loss": 2.4047, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 4.042601749714721, |
|
"grad_norm": 190.0, |
|
"learning_rate": 9.808537350910233e-06, |
|
"loss": 2.4063, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 4.057816660327121, |
|
"grad_norm": 32.0, |
|
"learning_rate": 9.65160075329567e-06, |
|
"loss": 2.2504, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 4.073031570939521, |
|
"grad_norm": 30.375, |
|
"learning_rate": 9.494664155681106e-06, |
|
"loss": 2.4439, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 4.088246481551921, |
|
"grad_norm": 44.75, |
|
"learning_rate": 9.337727558066542e-06, |
|
"loss": 2.3362, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 4.103461392164321, |
|
"grad_norm": 26.375, |
|
"learning_rate": 9.180790960451978e-06, |
|
"loss": 2.263, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 4.118676302776721, |
|
"grad_norm": 38.5, |
|
"learning_rate": 9.023854362837414e-06, |
|
"loss": 2.3142, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 4.133891213389122, |
|
"grad_norm": 31.125, |
|
"learning_rate": 8.86691776522285e-06, |
|
"loss": 2.4242, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 4.149106124001522, |
|
"grad_norm": 30.625, |
|
"learning_rate": 8.709981167608287e-06, |
|
"loss": 2.3795, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 4.164321034613922, |
|
"grad_norm": 240.0, |
|
"learning_rate": 8.553044569993723e-06, |
|
"loss": 2.4871, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 4.179535945226322, |
|
"grad_norm": 326.0, |
|
"learning_rate": 8.39610797237916e-06, |
|
"loss": 2.3286, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 4.194750855838722, |
|
"grad_norm": 61.25, |
|
"learning_rate": 8.239171374764596e-06, |
|
"loss": 2.3469, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 4.2099657664511225, |
|
"grad_norm": 30.0, |
|
"learning_rate": 8.082234777150032e-06, |
|
"loss": 2.4634, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 4.2251806770635225, |
|
"grad_norm": 43.25, |
|
"learning_rate": 7.925298179535467e-06, |
|
"loss": 2.5155, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 4.2403955876759225, |
|
"grad_norm": 63.75, |
|
"learning_rate": 7.768361581920905e-06, |
|
"loss": 2.5562, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 4.2556104982883225, |
|
"grad_norm": 180.0, |
|
"learning_rate": 7.611424984306341e-06, |
|
"loss": 2.856, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.2708254089007225, |
|
"grad_norm": 57.25, |
|
"learning_rate": 7.454488386691777e-06, |
|
"loss": 2.8994, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 4.2860403195131225, |
|
"grad_norm": 38.0, |
|
"learning_rate": 7.297551789077213e-06, |
|
"loss": 2.7051, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 4.301255230125523, |
|
"grad_norm": 206.0, |
|
"learning_rate": 7.140615191462649e-06, |
|
"loss": 2.7805, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 4.316470140737923, |
|
"grad_norm": 74.0, |
|
"learning_rate": 6.983678593848085e-06, |
|
"loss": 2.722, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 4.331685051350323, |
|
"grad_norm": 58.0, |
|
"learning_rate": 6.826741996233522e-06, |
|
"loss": 2.5927, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 4.346899961962723, |
|
"grad_norm": 49.75, |
|
"learning_rate": 6.669805398618959e-06, |
|
"loss": 2.5875, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 4.362114872575123, |
|
"grad_norm": 312.0, |
|
"learning_rate": 6.512868801004394e-06, |
|
"loss": 2.5255, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 4.377329783187523, |
|
"grad_norm": 33.5, |
|
"learning_rate": 6.3559322033898304e-06, |
|
"loss": 2.551, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 4.392544693799924, |
|
"grad_norm": 51.5, |
|
"learning_rate": 6.1989956057752676e-06, |
|
"loss": 2.5554, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 4.407759604412324, |
|
"grad_norm": 43.75, |
|
"learning_rate": 6.042059008160703e-06, |
|
"loss": 2.601, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 4.422974515024724, |
|
"grad_norm": 28.5, |
|
"learning_rate": 5.885122410546139e-06, |
|
"loss": 2.4956, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 4.438189425637124, |
|
"grad_norm": 30.5, |
|
"learning_rate": 5.728185812931576e-06, |
|
"loss": 2.5699, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 4.453404336249524, |
|
"grad_norm": 68.5, |
|
"learning_rate": 5.571249215317012e-06, |
|
"loss": 2.5997, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 4.468619246861925, |
|
"grad_norm": 42.5, |
|
"learning_rate": 5.414312617702449e-06, |
|
"loss": 2.5853, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 4.483834157474325, |
|
"grad_norm": 124.0, |
|
"learning_rate": 5.2573760200878844e-06, |
|
"loss": 2.6104, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 4.499049068086725, |
|
"grad_norm": 860.0, |
|
"learning_rate": 5.100439422473321e-06, |
|
"loss": 2.5993, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 4.514263978699125, |
|
"grad_norm": 42.0, |
|
"learning_rate": 4.943502824858758e-06, |
|
"loss": 2.6001, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 4.529478889311525, |
|
"grad_norm": 56.25, |
|
"learning_rate": 4.786566227244193e-06, |
|
"loss": 2.4628, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 4.544693799923925, |
|
"grad_norm": 45.0, |
|
"learning_rate": 4.6296296296296296e-06, |
|
"loss": 2.5943, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 4.559908710536326, |
|
"grad_norm": 64.0, |
|
"learning_rate": 4.472693032015067e-06, |
|
"loss": 2.6277, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.575123621148726, |
|
"grad_norm": 157.0, |
|
"learning_rate": 4.315756434400502e-06, |
|
"loss": 2.5966, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 4.590338531761126, |
|
"grad_norm": 208.0, |
|
"learning_rate": 4.1588198367859384e-06, |
|
"loss": 2.5342, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 4.605553442373526, |
|
"grad_norm": 44.75, |
|
"learning_rate": 4.001883239171375e-06, |
|
"loss": 2.5117, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 4.620768352985927, |
|
"grad_norm": 159.0, |
|
"learning_rate": 3.844946641556811e-06, |
|
"loss": 2.4587, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 4.635983263598327, |
|
"grad_norm": 49.75, |
|
"learning_rate": 3.6880100439422477e-06, |
|
"loss": 2.5882, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 4.651198174210727, |
|
"grad_norm": 31.125, |
|
"learning_rate": 3.531073446327684e-06, |
|
"loss": 2.4809, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 4.666413084823127, |
|
"grad_norm": 26.625, |
|
"learning_rate": 3.37413684871312e-06, |
|
"loss": 2.5864, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 4.681627995435527, |
|
"grad_norm": 45.0, |
|
"learning_rate": 3.2172002510985566e-06, |
|
"loss": 2.4663, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 4.696842906047927, |
|
"grad_norm": 81.0, |
|
"learning_rate": 3.0602636534839924e-06, |
|
"loss": 2.4853, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 4.712057816660327, |
|
"grad_norm": 976.0, |
|
"learning_rate": 2.903327055869429e-06, |
|
"loss": 2.5038, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 4.7272727272727275, |
|
"grad_norm": 50.25, |
|
"learning_rate": 2.746390458254865e-06, |
|
"loss": 2.5078, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 4.7424876378851275, |
|
"grad_norm": 23.375, |
|
"learning_rate": 2.5894538606403013e-06, |
|
"loss": 2.434, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 4.7577025484975275, |
|
"grad_norm": 69.5, |
|
"learning_rate": 2.4325172630257376e-06, |
|
"loss": 2.5189, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 4.7729174591099275, |
|
"grad_norm": 22.5, |
|
"learning_rate": 2.2755806654111743e-06, |
|
"loss": 2.5092, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 4.788132369722328, |
|
"grad_norm": 4192.0, |
|
"learning_rate": 2.11864406779661e-06, |
|
"loss": 2.5068, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 4.803347280334728, |
|
"grad_norm": 36.0, |
|
"learning_rate": 1.9617074701820464e-06, |
|
"loss": 2.5104, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 4.818562190947128, |
|
"grad_norm": 55.5, |
|
"learning_rate": 1.804770872567483e-06, |
|
"loss": 2.4943, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 4.833777101559528, |
|
"grad_norm": 23.0, |
|
"learning_rate": 1.647834274952919e-06, |
|
"loss": 2.4493, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 4.848992012171928, |
|
"grad_norm": 71.0, |
|
"learning_rate": 1.4908976773383553e-06, |
|
"loss": 2.5597, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 4.864206922784328, |
|
"grad_norm": 83.0, |
|
"learning_rate": 1.3339610797237918e-06, |
|
"loss": 2.5889, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 4.879421833396728, |
|
"grad_norm": 49.25, |
|
"learning_rate": 1.1770244821092279e-06, |
|
"loss": 2.5008, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 4.894636744009129, |
|
"grad_norm": 34.75, |
|
"learning_rate": 1.0200878844946644e-06, |
|
"loss": 2.5614, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 4.909851654621529, |
|
"grad_norm": 34.25, |
|
"learning_rate": 8.631512868801004e-07, |
|
"loss": 2.5362, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 4.925066565233929, |
|
"grad_norm": 114.5, |
|
"learning_rate": 7.062146892655367e-07, |
|
"loss": 2.5529, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 4.940281475846329, |
|
"grad_norm": 444.0, |
|
"learning_rate": 5.49278091650973e-07, |
|
"loss": 2.4373, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 4.95549638645873, |
|
"grad_norm": 208.0, |
|
"learning_rate": 3.9234149403640934e-07, |
|
"loss": 2.5953, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 4.97071129707113, |
|
"grad_norm": 101.0, |
|
"learning_rate": 2.3540489642184557e-07, |
|
"loss": 2.5719, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 4.98592620768353, |
|
"grad_norm": 55.5, |
|
"learning_rate": 7.846829880728186e-08, |
|
"loss": 2.5252, |
|
"step": 3280 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3285, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.590478085395579e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|