|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.6051364365971108, |
|
"eval_steps": 500, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 7.3019086777975994, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6939, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.150989333266983, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7167, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.640558809610037, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.5683, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 7.517999397731128, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5472, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.9687061679463425, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.4439, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.643479206523606, |
|
"learning_rate": 3e-05, |
|
"loss": 0.2486, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.2754773308695095, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.2217, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.7144730049127388, |
|
"learning_rate": 4e-05, |
|
"loss": 0.169, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.4702829704135114, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.1994, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.3985127340985621, |
|
"learning_rate": 5e-05, |
|
"loss": 0.1612, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.375992184386137, |
|
"learning_rate": 4.982758620689655e-05, |
|
"loss": 0.1576, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.9528635753013313, |
|
"learning_rate": 4.9655172413793107e-05, |
|
"loss": 0.1393, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.075169010198401, |
|
"learning_rate": 4.9482758620689655e-05, |
|
"loss": 0.1969, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.0953991165751207, |
|
"learning_rate": 4.931034482758621e-05, |
|
"loss": 0.1294, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.942660591044849, |
|
"learning_rate": 4.913793103448276e-05, |
|
"loss": 0.1306, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.1508904015728345, |
|
"learning_rate": 4.896551724137931e-05, |
|
"loss": 0.1526, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.9862795165358471, |
|
"learning_rate": 4.8793103448275864e-05, |
|
"loss": 0.1186, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.633061833817991, |
|
"learning_rate": 4.862068965517241e-05, |
|
"loss": 0.1457, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.8017052368446178, |
|
"learning_rate": 4.844827586206897e-05, |
|
"loss": 0.1234, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.1560694100709803, |
|
"learning_rate": 4.827586206896552e-05, |
|
"loss": 0.1346, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.5737689267430703, |
|
"learning_rate": 4.810344827586207e-05, |
|
"loss": 0.116, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.957864677854788, |
|
"learning_rate": 4.793103448275863e-05, |
|
"loss": 0.1692, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.215039223521855, |
|
"learning_rate": 4.7758620689655176e-05, |
|
"loss": 0.1245, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.370517239734168, |
|
"learning_rate": 4.7586206896551725e-05, |
|
"loss": 0.1476, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.7341334563022532, |
|
"learning_rate": 4.741379310344828e-05, |
|
"loss": 0.1236, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.5994298113068974, |
|
"learning_rate": 4.724137931034483e-05, |
|
"loss": 0.1161, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.5317433190951963, |
|
"learning_rate": 4.7068965517241385e-05, |
|
"loss": 0.1035, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.191977732539556, |
|
"learning_rate": 4.689655172413793e-05, |
|
"loss": 0.1427, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.6038667570691656, |
|
"learning_rate": 4.672413793103448e-05, |
|
"loss": 0.1225, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.577572731831179, |
|
"learning_rate": 4.655172413793104e-05, |
|
"loss": 0.1399, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.6199241001441385, |
|
"learning_rate": 4.6379310344827586e-05, |
|
"loss": 0.1242, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.236577821186196, |
|
"learning_rate": 4.6206896551724135e-05, |
|
"loss": 0.1656, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.7294690605254757, |
|
"learning_rate": 4.603448275862069e-05, |
|
"loss": 0.1382, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.196527516378511, |
|
"learning_rate": 4.586206896551724e-05, |
|
"loss": 0.1257, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.1057444340221463, |
|
"learning_rate": 4.5689655172413794e-05, |
|
"loss": 0.1238, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.5409556870328274, |
|
"learning_rate": 4.551724137931035e-05, |
|
"loss": 0.1383, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.5204083616874053, |
|
"learning_rate": 4.53448275862069e-05, |
|
"loss": 0.1068, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.3557725298931746, |
|
"learning_rate": 4.5172413793103454e-05, |
|
"loss": 0.1071, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.2601538460418644, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.125, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.9031725385762286, |
|
"learning_rate": 4.482758620689655e-05, |
|
"loss": 0.0991, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.3946050262183123, |
|
"learning_rate": 4.465517241379311e-05, |
|
"loss": 0.1156, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.097644875106397, |
|
"learning_rate": 4.4482758620689656e-05, |
|
"loss": 0.1366, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.37846299019108, |
|
"learning_rate": 4.431034482758621e-05, |
|
"loss": 0.126, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.8340152889320331, |
|
"learning_rate": 4.413793103448276e-05, |
|
"loss": 0.1066, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.8304505611337867, |
|
"learning_rate": 4.396551724137931e-05, |
|
"loss": 0.0868, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.550196490898523, |
|
"learning_rate": 4.3793103448275864e-05, |
|
"loss": 0.1286, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.176112247796248, |
|
"learning_rate": 4.362068965517241e-05, |
|
"loss": 0.1206, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.6589263894091213, |
|
"learning_rate": 4.344827586206897e-05, |
|
"loss": 0.1008, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.8349611508902046, |
|
"learning_rate": 4.327586206896552e-05, |
|
"loss": 0.1198, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 2.1218964920724126, |
|
"learning_rate": 4.3103448275862066e-05, |
|
"loss": 0.1166, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"eval_loss": 0.6078919172286987, |
|
"eval_runtime": 116.8471, |
|
"eval_samples_per_second": 11.288, |
|
"eval_steps_per_second": 2.824, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 2.5775141311007856, |
|
"learning_rate": 4.293103448275863e-05, |
|
"loss": 0.1124, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.6019517017800202, |
|
"learning_rate": 4.275862068965518e-05, |
|
"loss": 0.1068, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 2.6901962755310205, |
|
"learning_rate": 4.2586206896551725e-05, |
|
"loss": 0.1286, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 3.9517995356721767, |
|
"learning_rate": 4.241379310344828e-05, |
|
"loss": 0.1149, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 2.0428896228074076, |
|
"learning_rate": 4.224137931034483e-05, |
|
"loss": 0.141, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 2.263258592133553, |
|
"learning_rate": 4.2068965517241385e-05, |
|
"loss": 0.0949, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.4823165953974604, |
|
"learning_rate": 4.1896551724137934e-05, |
|
"loss": 0.1365, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.4441377020989878, |
|
"learning_rate": 4.172413793103448e-05, |
|
"loss": 0.1015, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.4779059254436886, |
|
"learning_rate": 4.155172413793104e-05, |
|
"loss": 0.0988, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.7777823671018818, |
|
"learning_rate": 4.1379310344827587e-05, |
|
"loss": 0.1124, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.737579831138191, |
|
"learning_rate": 4.120689655172414e-05, |
|
"loss": 0.086, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 2.708453961232997, |
|
"learning_rate": 4.103448275862069e-05, |
|
"loss": 0.0933, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.8871805824236731, |
|
"learning_rate": 4.086206896551724e-05, |
|
"loss": 0.1407, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.7300112722427339, |
|
"learning_rate": 4.0689655172413795e-05, |
|
"loss": 0.1224, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.4631236252240614, |
|
"learning_rate": 4.0517241379310344e-05, |
|
"loss": 0.1014, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.2602431597419264, |
|
"learning_rate": 4.03448275862069e-05, |
|
"loss": 0.1583, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.2077937041919453, |
|
"learning_rate": 4.0172413793103455e-05, |
|
"loss": 0.1209, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.4386184566429954, |
|
"learning_rate": 4e-05, |
|
"loss": 0.1016, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.6160358835758584, |
|
"learning_rate": 3.982758620689656e-05, |
|
"loss": 0.1062, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.9278794640498955, |
|
"learning_rate": 3.965517241379311e-05, |
|
"loss": 0.1037, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.2872571900237024, |
|
"learning_rate": 3.9482758620689656e-05, |
|
"loss": 0.096, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.243554309347296, |
|
"learning_rate": 3.931034482758621e-05, |
|
"loss": 0.1084, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.5090589714253309, |
|
"learning_rate": 3.913793103448276e-05, |
|
"loss": 0.0877, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.1419550623025168, |
|
"learning_rate": 3.896551724137931e-05, |
|
"loss": 0.0994, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.7807417973632438, |
|
"learning_rate": 3.8793103448275865e-05, |
|
"loss": 0.1029, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.3342960263057682, |
|
"learning_rate": 3.862068965517241e-05, |
|
"loss": 0.1072, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 2.3865282340158136, |
|
"learning_rate": 3.844827586206897e-05, |
|
"loss": 0.1193, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.5428742248459941, |
|
"learning_rate": 3.827586206896552e-05, |
|
"loss": 0.1156, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.7660532115509044, |
|
"learning_rate": 3.8103448275862066e-05, |
|
"loss": 0.122, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.8149742752994733, |
|
"learning_rate": 3.793103448275862e-05, |
|
"loss": 0.1346, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.7456408876472995, |
|
"learning_rate": 3.775862068965517e-05, |
|
"loss": 0.1223, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.10163248244056, |
|
"learning_rate": 3.7586206896551726e-05, |
|
"loss": 0.1031, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.6441057737088702, |
|
"learning_rate": 3.741379310344828e-05, |
|
"loss": 0.1059, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.3999279790163484, |
|
"learning_rate": 3.724137931034483e-05, |
|
"loss": 0.1125, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.2081477934156903, |
|
"learning_rate": 3.7068965517241385e-05, |
|
"loss": 0.1266, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 2.366783771480017, |
|
"learning_rate": 3.6896551724137934e-05, |
|
"loss": 0.1127, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.3077873674136173, |
|
"learning_rate": 3.672413793103448e-05, |
|
"loss": 0.1095, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.8197812508114701, |
|
"learning_rate": 3.655172413793104e-05, |
|
"loss": 0.0932, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.0806192057981219, |
|
"learning_rate": 3.637931034482759e-05, |
|
"loss": 0.1166, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.520666439337001, |
|
"learning_rate": 3.620689655172414e-05, |
|
"loss": 0.0883, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.690002270629302, |
|
"learning_rate": 3.603448275862069e-05, |
|
"loss": 0.1199, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.4319374130118003, |
|
"learning_rate": 3.586206896551724e-05, |
|
"loss": 0.0991, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.0626084369653164, |
|
"learning_rate": 3.5689655172413795e-05, |
|
"loss": 0.0923, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 2.0848060597460902, |
|
"learning_rate": 3.5517241379310344e-05, |
|
"loss": 0.0979, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.4997189461483256, |
|
"learning_rate": 3.53448275862069e-05, |
|
"loss": 0.0949, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.7887817042743388, |
|
"learning_rate": 3.517241379310345e-05, |
|
"loss": 0.1135, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 3.242965692388458, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.1315, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.5034762176322083, |
|
"learning_rate": 3.482758620689655e-05, |
|
"loss": 0.1177, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.6679474444200848, |
|
"learning_rate": 3.465517241379311e-05, |
|
"loss": 0.1182, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.814574507251776, |
|
"learning_rate": 3.4482758620689657e-05, |
|
"loss": 0.0912, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"eval_loss": 0.6101276874542236, |
|
"eval_runtime": 113.9995, |
|
"eval_samples_per_second": 11.57, |
|
"eval_steps_per_second": 2.895, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.1321319681580535, |
|
"learning_rate": 3.431034482758621e-05, |
|
"loss": 0.0983, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.8915101367452352, |
|
"learning_rate": 3.413793103448276e-05, |
|
"loss": 0.1113, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.4160985095468477, |
|
"learning_rate": 3.3965517241379316e-05, |
|
"loss": 0.1076, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.8562788974878586, |
|
"learning_rate": 3.3793103448275865e-05, |
|
"loss": 0.1011, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.3793192563691294, |
|
"learning_rate": 3.3620689655172414e-05, |
|
"loss": 0.0978, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.4606563129628805, |
|
"learning_rate": 3.344827586206897e-05, |
|
"loss": 0.1069, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.5680856211032999, |
|
"learning_rate": 3.327586206896552e-05, |
|
"loss": 0.0988, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.5829345931951275, |
|
"learning_rate": 3.310344827586207e-05, |
|
"loss": 0.1256, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.6200852939319585, |
|
"learning_rate": 3.293103448275862e-05, |
|
"loss": 0.097, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 2.259656836213122, |
|
"learning_rate": 3.275862068965517e-05, |
|
"loss": 0.1137, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 2.2483622341560645, |
|
"learning_rate": 3.2586206896551726e-05, |
|
"loss": 0.0999, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.168198861956421, |
|
"learning_rate": 3.2413793103448275e-05, |
|
"loss": 0.1018, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.4699824799031482, |
|
"learning_rate": 3.2241379310344824e-05, |
|
"loss": 0.1132, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.2571654549549751, |
|
"learning_rate": 3.206896551724138e-05, |
|
"loss": 0.1054, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.5559534032307631, |
|
"learning_rate": 3.1896551724137935e-05, |
|
"loss": 0.0789, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.636369759504475, |
|
"learning_rate": 3.172413793103448e-05, |
|
"loss": 0.0902, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.6137142935446496, |
|
"learning_rate": 3.155172413793104e-05, |
|
"loss": 0.1199, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.7448003760796802, |
|
"learning_rate": 3.137931034482759e-05, |
|
"loss": 0.1295, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.3261005358227276, |
|
"learning_rate": 3.120689655172414e-05, |
|
"loss": 0.1117, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.7353127177901462, |
|
"learning_rate": 3.103448275862069e-05, |
|
"loss": 0.0951, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 2.8569975913367074, |
|
"learning_rate": 3.086206896551724e-05, |
|
"loss": 0.112, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.3481947218871082, |
|
"learning_rate": 3.0689655172413796e-05, |
|
"loss": 0.0876, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.015933141613929, |
|
"learning_rate": 3.0517241379310348e-05, |
|
"loss": 0.0993, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.0588164394448019, |
|
"learning_rate": 3.0344827586206897e-05, |
|
"loss": 0.1034, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.3594986645993228, |
|
"learning_rate": 3.017241379310345e-05, |
|
"loss": 0.1004, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.333098402625009, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1259, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.1324206075196583, |
|
"learning_rate": 2.9827586206896553e-05, |
|
"loss": 0.1155, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.2270687927795876, |
|
"learning_rate": 2.96551724137931e-05, |
|
"loss": 0.092, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.3085362507403875, |
|
"learning_rate": 2.9482758620689654e-05, |
|
"loss": 0.1064, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.7135250277750762, |
|
"learning_rate": 2.9310344827586206e-05, |
|
"loss": 0.1132, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.6121189707451158, |
|
"learning_rate": 2.913793103448276e-05, |
|
"loss": 0.1006, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.3958680925504208, |
|
"learning_rate": 2.8965517241379313e-05, |
|
"loss": 0.0956, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.546226150121884, |
|
"learning_rate": 2.8793103448275865e-05, |
|
"loss": 0.1142, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.1837739995965415, |
|
"learning_rate": 2.8620689655172417e-05, |
|
"loss": 0.1127, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.9402402206909504, |
|
"learning_rate": 2.844827586206897e-05, |
|
"loss": 0.0922, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.7914401953164802, |
|
"learning_rate": 2.8275862068965518e-05, |
|
"loss": 0.1038, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.505804090650568, |
|
"learning_rate": 2.810344827586207e-05, |
|
"loss": 0.1034, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.9907350713586716, |
|
"learning_rate": 2.7931034482758622e-05, |
|
"loss": 0.103, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.6948381773166858, |
|
"learning_rate": 2.7758620689655175e-05, |
|
"loss": 0.1091, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.3995985437024723, |
|
"learning_rate": 2.7586206896551727e-05, |
|
"loss": 0.0852, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.9347024029069393, |
|
"learning_rate": 2.7413793103448275e-05, |
|
"loss": 0.1393, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.608776792445342, |
|
"learning_rate": 2.7241379310344827e-05, |
|
"loss": 0.0951, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.6005483580619249, |
|
"learning_rate": 2.706896551724138e-05, |
|
"loss": 0.1037, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.374208686020403, |
|
"learning_rate": 2.689655172413793e-05, |
|
"loss": 0.0926, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.7554923995400171, |
|
"learning_rate": 2.672413793103448e-05, |
|
"loss": 0.1164, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.2965114220197742, |
|
"learning_rate": 2.6551724137931032e-05, |
|
"loss": 0.1102, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.8857303249108055, |
|
"learning_rate": 2.637931034482759e-05, |
|
"loss": 0.0868, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.406207551120988, |
|
"learning_rate": 2.620689655172414e-05, |
|
"loss": 0.1179, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.275728362064451, |
|
"learning_rate": 2.6034482758620692e-05, |
|
"loss": 0.1128, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.7122434387720797, |
|
"learning_rate": 2.5862068965517244e-05, |
|
"loss": 0.1045, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.6075221300125122, |
|
"eval_runtime": 114.4297, |
|
"eval_samples_per_second": 11.527, |
|
"eval_steps_per_second": 2.884, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.3880467648229133, |
|
"learning_rate": 2.5689655172413796e-05, |
|
"loss": 0.1031, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.3475935500456657, |
|
"learning_rate": 2.551724137931035e-05, |
|
"loss": 0.1064, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.9746447047097486, |
|
"learning_rate": 2.5344827586206897e-05, |
|
"loss": 0.0995, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.2244623226185645, |
|
"learning_rate": 2.517241379310345e-05, |
|
"loss": 0.1044, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.2605012043265216, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.1017, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.624612730097256, |
|
"learning_rate": 2.4827586206896553e-05, |
|
"loss": 0.0973, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.3648662151461801, |
|
"learning_rate": 2.4655172413793105e-05, |
|
"loss": 0.0836, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.4642386177378814, |
|
"learning_rate": 2.4482758620689654e-05, |
|
"loss": 0.1058, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.3057087388796036, |
|
"learning_rate": 2.4310344827586206e-05, |
|
"loss": 0.1092, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.092348689417081, |
|
"learning_rate": 2.413793103448276e-05, |
|
"loss": 0.0958, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.3731115281087607, |
|
"learning_rate": 2.3965517241379314e-05, |
|
"loss": 0.0883, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.3983722722394134, |
|
"learning_rate": 2.3793103448275862e-05, |
|
"loss": 0.0944, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.4163442262002937, |
|
"learning_rate": 2.3620689655172415e-05, |
|
"loss": 0.1062, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.5006079944233688, |
|
"learning_rate": 2.3448275862068967e-05, |
|
"loss": 0.1129, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.54283270803711, |
|
"learning_rate": 2.327586206896552e-05, |
|
"loss": 0.0861, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.269678144223195, |
|
"learning_rate": 2.3103448275862067e-05, |
|
"loss": 0.0978, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.8105959776831768, |
|
"learning_rate": 2.293103448275862e-05, |
|
"loss": 0.1141, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.9135579713863027, |
|
"learning_rate": 2.2758620689655175e-05, |
|
"loss": 0.0918, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.6604450253018581, |
|
"learning_rate": 2.2586206896551727e-05, |
|
"loss": 0.096, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.115987565053461, |
|
"learning_rate": 2.2413793103448276e-05, |
|
"loss": 0.1166, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.4279927513160544, |
|
"learning_rate": 2.2241379310344828e-05, |
|
"loss": 0.1052, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.185880441960968, |
|
"learning_rate": 2.206896551724138e-05, |
|
"loss": 0.1053, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.6969029997828415, |
|
"learning_rate": 2.1896551724137932e-05, |
|
"loss": 0.106, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.0330998697970286, |
|
"learning_rate": 2.1724137931034484e-05, |
|
"loss": 0.1073, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.19027851417408, |
|
"learning_rate": 2.1551724137931033e-05, |
|
"loss": 0.091, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.2470713090698218, |
|
"learning_rate": 2.137931034482759e-05, |
|
"loss": 0.0898, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.235740059996042, |
|
"learning_rate": 2.120689655172414e-05, |
|
"loss": 0.1327, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.5741742016710085, |
|
"learning_rate": 2.1034482758620692e-05, |
|
"loss": 0.1126, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.9343547126371113, |
|
"learning_rate": 2.086206896551724e-05, |
|
"loss": 0.0819, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.2764271447338937, |
|
"learning_rate": 2.0689655172413793e-05, |
|
"loss": 0.1204, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.981384842073209, |
|
"learning_rate": 2.0517241379310345e-05, |
|
"loss": 0.1182, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.044063198588911, |
|
"learning_rate": 2.0344827586206897e-05, |
|
"loss": 0.1005, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.370183172473789, |
|
"learning_rate": 2.017241379310345e-05, |
|
"loss": 0.1174, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.9052733799672823, |
|
"learning_rate": 2e-05, |
|
"loss": 0.1125, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.628277530114902, |
|
"learning_rate": 1.9827586206896554e-05, |
|
"loss": 0.1015, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.2522124245986863, |
|
"learning_rate": 1.9655172413793106e-05, |
|
"loss": 0.0924, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.6911426489002421, |
|
"learning_rate": 1.9482758620689655e-05, |
|
"loss": 0.0965, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.821890613342771, |
|
"learning_rate": 1.9310344827586207e-05, |
|
"loss": 0.081, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.7643588179781782, |
|
"learning_rate": 1.913793103448276e-05, |
|
"loss": 0.0761, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 1.1095002263403428, |
|
"learning_rate": 1.896551724137931e-05, |
|
"loss": 0.0871, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 1.4639820667455608, |
|
"learning_rate": 1.8793103448275863e-05, |
|
"loss": 0.0805, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 1.6214269161589794, |
|
"learning_rate": 1.8620689655172415e-05, |
|
"loss": 0.0902, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 1.5979085316952373, |
|
"learning_rate": 1.8448275862068967e-05, |
|
"loss": 0.0967, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 1.2001043976090235, |
|
"learning_rate": 1.827586206896552e-05, |
|
"loss": 0.069, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.100190633629739, |
|
"learning_rate": 1.810344827586207e-05, |
|
"loss": 0.1024, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 1.7393396532511867, |
|
"learning_rate": 1.793103448275862e-05, |
|
"loss": 0.0728, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 1.873599965711283, |
|
"learning_rate": 1.7758620689655172e-05, |
|
"loss": 0.0735, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 1.4460752726376342, |
|
"learning_rate": 1.7586206896551724e-05, |
|
"loss": 0.1, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.8772715867399261, |
|
"learning_rate": 1.7413793103448276e-05, |
|
"loss": 0.0794, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 1.398173054729605, |
|
"learning_rate": 1.7241379310344828e-05, |
|
"loss": 0.078, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"eval_loss": 0.6523420810699463, |
|
"eval_runtime": 115.4048, |
|
"eval_samples_per_second": 11.429, |
|
"eval_steps_per_second": 2.859, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 1.8544231422617872, |
|
"learning_rate": 1.706896551724138e-05, |
|
"loss": 0.1015, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 1.0455645860511078, |
|
"learning_rate": 1.6896551724137932e-05, |
|
"loss": 0.0957, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.2782017103278265, |
|
"learning_rate": 1.6724137931034485e-05, |
|
"loss": 0.1054, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 1.7712576675859857, |
|
"learning_rate": 1.6551724137931037e-05, |
|
"loss": 0.1054, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 1.0902741267616887, |
|
"learning_rate": 1.6379310344827585e-05, |
|
"loss": 0.0807, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 1.3438284680174908, |
|
"learning_rate": 1.6206896551724137e-05, |
|
"loss": 0.0686, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 1.0087838105436577, |
|
"learning_rate": 1.603448275862069e-05, |
|
"loss": 0.0906, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.9291493020789636, |
|
"learning_rate": 1.586206896551724e-05, |
|
"loss": 0.0777, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.3059876158609884, |
|
"learning_rate": 1.5689655172413794e-05, |
|
"loss": 0.1005, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.7541194076462521, |
|
"learning_rate": 1.5517241379310346e-05, |
|
"loss": 0.0711, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.976743599620672, |
|
"learning_rate": 1.5344827586206898e-05, |
|
"loss": 0.0734, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 1.5352620896805385, |
|
"learning_rate": 1.5172413793103448e-05, |
|
"loss": 0.0966, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 1.165805963424826, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.0804, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 1.5398844769347386, |
|
"learning_rate": 1.482758620689655e-05, |
|
"loss": 0.076, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.9475538051861888, |
|
"learning_rate": 1.4655172413793103e-05, |
|
"loss": 0.0771, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 1.3122536092219965, |
|
"learning_rate": 1.4482758620689657e-05, |
|
"loss": 0.0652, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.8569841144318953, |
|
"learning_rate": 1.4310344827586209e-05, |
|
"loss": 0.0666, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 1.0744088797395874, |
|
"learning_rate": 1.4137931034482759e-05, |
|
"loss": 0.0563, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.7306709750154786, |
|
"learning_rate": 1.3965517241379311e-05, |
|
"loss": 0.0724, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 1.1617485610561908, |
|
"learning_rate": 1.3793103448275863e-05, |
|
"loss": 0.0592, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 1.6124055507643191, |
|
"learning_rate": 1.3620689655172414e-05, |
|
"loss": 0.0755, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 1.0636933402698119, |
|
"learning_rate": 1.3448275862068966e-05, |
|
"loss": 0.0807, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 2.5428572768451514, |
|
"learning_rate": 1.3275862068965516e-05, |
|
"loss": 0.0894, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.6519996193761897, |
|
"learning_rate": 1.310344827586207e-05, |
|
"loss": 0.0796, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.7011145463555521, |
|
"learning_rate": 1.2931034482758622e-05, |
|
"loss": 0.0671, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 1.441376178837708, |
|
"learning_rate": 1.2758620689655174e-05, |
|
"loss": 0.0799, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.9345990828016713, |
|
"learning_rate": 1.2586206896551725e-05, |
|
"loss": 0.0684, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 1.3573336012486517, |
|
"learning_rate": 1.2413793103448277e-05, |
|
"loss": 0.0776, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 2.1805616160045, |
|
"learning_rate": 1.2241379310344827e-05, |
|
"loss": 0.0819, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.8476103473594272, |
|
"learning_rate": 1.206896551724138e-05, |
|
"loss": 0.0761, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 2.132874306113927, |
|
"learning_rate": 1.1896551724137931e-05, |
|
"loss": 0.1027, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 1.1904353387368432, |
|
"learning_rate": 1.1724137931034483e-05, |
|
"loss": 0.0623, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.6470612203805093, |
|
"learning_rate": 1.1551724137931034e-05, |
|
"loss": 0.0872, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.8675118975708371, |
|
"learning_rate": 1.1379310344827587e-05, |
|
"loss": 0.0728, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 1.2991706254841981, |
|
"learning_rate": 1.1206896551724138e-05, |
|
"loss": 0.079, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 1.9364824626463861, |
|
"learning_rate": 1.103448275862069e-05, |
|
"loss": 0.0783, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 1.6597471057325688, |
|
"learning_rate": 1.0862068965517242e-05, |
|
"loss": 0.0902, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 1.454588875045873, |
|
"learning_rate": 1.0689655172413794e-05, |
|
"loss": 0.0774, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.1670103110646668, |
|
"learning_rate": 1.0517241379310346e-05, |
|
"loss": 0.0659, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.4612075298157572, |
|
"learning_rate": 1.0344827586206897e-05, |
|
"loss": 0.0775, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 3.932323942941103, |
|
"learning_rate": 1.0172413793103449e-05, |
|
"loss": 0.0671, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.9979331315685359, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0695, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 1.9070655793934348, |
|
"learning_rate": 9.827586206896553e-06, |
|
"loss": 0.0692, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 1.5364730689079074, |
|
"learning_rate": 9.655172413793103e-06, |
|
"loss": 0.0738, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 1.1024432074823722, |
|
"learning_rate": 9.482758620689655e-06, |
|
"loss": 0.0855, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 1.8514817180884215, |
|
"learning_rate": 9.310344827586207e-06, |
|
"loss": 0.076, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 1.1275915444574418, |
|
"learning_rate": 9.13793103448276e-06, |
|
"loss": 0.0688, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.1236224333360454, |
|
"learning_rate": 8.96551724137931e-06, |
|
"loss": 0.0985, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 1.5716827436886402, |
|
"learning_rate": 8.793103448275862e-06, |
|
"loss": 0.0827, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.9799310640151211, |
|
"learning_rate": 8.620689655172414e-06, |
|
"loss": 0.0807, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"eval_loss": 0.6613200306892395, |
|
"eval_runtime": 115.1632, |
|
"eval_samples_per_second": 11.453, |
|
"eval_steps_per_second": 2.865, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 1.429365154274089, |
|
"learning_rate": 8.448275862068966e-06, |
|
"loss": 0.0803, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 2.034554738355009, |
|
"learning_rate": 8.275862068965518e-06, |
|
"loss": 0.0771, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 1.3116032945356366, |
|
"learning_rate": 8.103448275862069e-06, |
|
"loss": 0.102, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.538270992356044, |
|
"learning_rate": 7.93103448275862e-06, |
|
"loss": 0.0781, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 1.359815780671966, |
|
"learning_rate": 7.758620689655173e-06, |
|
"loss": 0.082, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 1.3555113208274487, |
|
"learning_rate": 7.586206896551724e-06, |
|
"loss": 0.0662, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.9126598313656492, |
|
"learning_rate": 7.413793103448275e-06, |
|
"loss": 0.0809, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 1.0362500054468016, |
|
"learning_rate": 7.241379310344828e-06, |
|
"loss": 0.0763, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 1.4486705272957705, |
|
"learning_rate": 7.0689655172413796e-06, |
|
"loss": 0.0772, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 1.942597284762883, |
|
"learning_rate": 6.896551724137932e-06, |
|
"loss": 0.0627, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 1.3527152216583775, |
|
"learning_rate": 6.724137931034483e-06, |
|
"loss": 0.0742, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.7183775146684962, |
|
"learning_rate": 6.551724137931035e-06, |
|
"loss": 0.0478, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 1.081708916490155, |
|
"learning_rate": 6.379310344827587e-06, |
|
"loss": 0.0858, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 2.083585945496064, |
|
"learning_rate": 6.206896551724138e-06, |
|
"loss": 0.094, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 1.3628676873699075, |
|
"learning_rate": 6.03448275862069e-06, |
|
"loss": 0.0996, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 1.1584513648567698, |
|
"learning_rate": 5.862068965517242e-06, |
|
"loss": 0.0678, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 1.239472995186361, |
|
"learning_rate": 5.689655172413794e-06, |
|
"loss": 0.0745, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 1.0885278290149767, |
|
"learning_rate": 5.517241379310345e-06, |
|
"loss": 0.0755, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 1.4830428953890444, |
|
"learning_rate": 5.344827586206897e-06, |
|
"loss": 0.0799, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 1.6819930232389664, |
|
"learning_rate": 5.172413793103448e-06, |
|
"loss": 0.0849, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 1.5372127437839038, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0758, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 1.367779239832988, |
|
"learning_rate": 4.827586206896552e-06, |
|
"loss": 0.073, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 1.4887137481978274, |
|
"learning_rate": 4.655172413793104e-06, |
|
"loss": 0.061, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 2.1404166354958325, |
|
"learning_rate": 4.482758620689655e-06, |
|
"loss": 0.0957, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 1.8642358306515179, |
|
"learning_rate": 4.310344827586207e-06, |
|
"loss": 0.0782, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 1.3698287268201832, |
|
"learning_rate": 4.137931034482759e-06, |
|
"loss": 0.0647, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 1.4319268282045232, |
|
"learning_rate": 3.96551724137931e-06, |
|
"loss": 0.0787, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 1.383030067235673, |
|
"learning_rate": 3.793103448275862e-06, |
|
"loss": 0.0681, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 2.161260735181774, |
|
"learning_rate": 3.620689655172414e-06, |
|
"loss": 0.1013, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.6657048923514055, |
|
"learning_rate": 3.448275862068966e-06, |
|
"loss": 0.062, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.9193769932816829, |
|
"learning_rate": 3.2758620689655175e-06, |
|
"loss": 0.0696, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 1.2415022440375736, |
|
"learning_rate": 3.103448275862069e-06, |
|
"loss": 0.103, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 1.8151340036133554, |
|
"learning_rate": 2.931034482758621e-06, |
|
"loss": 0.0821, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.928286704694044, |
|
"learning_rate": 2.7586206896551725e-06, |
|
"loss": 0.057, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.3442506927303919, |
|
"learning_rate": 2.586206896551724e-06, |
|
"loss": 0.0621, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.1790836959559523, |
|
"learning_rate": 2.413793103448276e-06, |
|
"loss": 0.074, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.6594054854326988, |
|
"learning_rate": 2.2413793103448275e-06, |
|
"loss": 0.0814, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 1.3726881004102018, |
|
"learning_rate": 2.0689655172413796e-06, |
|
"loss": 0.0712, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 1.679214299803237, |
|
"learning_rate": 1.896551724137931e-06, |
|
"loss": 0.0627, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 1.0321949433654767, |
|
"learning_rate": 1.724137931034483e-06, |
|
"loss": 0.1165, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.6973352883003652, |
|
"learning_rate": 1.5517241379310346e-06, |
|
"loss": 0.0779, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 1.2300329446905003, |
|
"learning_rate": 1.3793103448275862e-06, |
|
"loss": 0.0871, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 1.7134185607297843, |
|
"learning_rate": 1.206896551724138e-06, |
|
"loss": 0.0746, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 1.0698704299983892, |
|
"learning_rate": 1.0344827586206898e-06, |
|
"loss": 0.0801, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.2453660227340957, |
|
"learning_rate": 8.620689655172415e-07, |
|
"loss": 0.0867, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 1.1712923935242836, |
|
"learning_rate": 6.896551724137931e-07, |
|
"loss": 0.1029, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 1.1626149131019259, |
|
"learning_rate": 5.172413793103449e-07, |
|
"loss": 0.0875, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 1.5972056170209479, |
|
"learning_rate": 3.4482758620689656e-07, |
|
"loss": 0.0786, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.4660799065028676, |
|
"learning_rate": 1.7241379310344828e-07, |
|
"loss": 0.0916, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 1.1438897568043958, |
|
"learning_rate": 0.0, |
|
"loss": 0.0669, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"eval_loss": 0.6566838622093201, |
|
"eval_runtime": 114.3635, |
|
"eval_samples_per_second": 11.533, |
|
"eval_steps_per_second": 2.886, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"total_flos": 86675375849472.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|