{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1875,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016,
      "grad_norm": 1.5132097727696663,
      "learning_rate": 9.999298177883903e-05,
      "loss": 1.6713,
      "step": 10
    },
    {
      "epoch": 0.032,
      "grad_norm": 1.0288849642854194,
      "learning_rate": 9.997192908557323e-05,
      "loss": 0.9562,
      "step": 20
    },
    {
      "epoch": 0.048,
      "grad_norm": 2.628046163977058,
      "learning_rate": 9.993684783030088e-05,
      "loss": 0.7998,
      "step": 30
    },
    {
      "epoch": 0.064,
      "grad_norm": 1.0977335920429718,
      "learning_rate": 9.988774786134234e-05,
      "loss": 0.7019,
      "step": 40
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.143511217142594,
      "learning_rate": 9.982464296247522e-05,
      "loss": 0.6965,
      "step": 50
    },
    {
      "epoch": 0.096,
      "grad_norm": 1.1886880844299708,
      "learning_rate": 9.974755084906502e-05,
      "loss": 0.6417,
      "step": 60
    },
    {
      "epoch": 0.112,
      "grad_norm": 1.3088809099781331,
      "learning_rate": 9.965649316309178e-05,
      "loss": 0.6388,
      "step": 70
    },
    {
      "epoch": 0.128,
      "grad_norm": 2.0299623866899035,
      "learning_rate": 9.955149546707465e-05,
      "loss": 0.6665,
      "step": 80
    },
    {
      "epoch": 0.144,
      "grad_norm": 1.5251129511711958,
      "learning_rate": 9.94325872368957e-05,
      "loss": 0.6014,
      "step": 90
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.307242452061001,
      "learning_rate": 9.929980185352526e-05,
      "loss": 0.5971,
      "step": 100
    },
    {
      "epoch": 0.176,
      "grad_norm": 1.0674735946460283,
      "learning_rate": 9.915317659365077e-05,
      "loss": 0.6206,
      "step": 110
    },
    {
      "epoch": 0.192,
      "grad_norm": 1.3671834842056045,
      "learning_rate": 9.899275261921234e-05,
      "loss": 0.5597,
      "step": 120
    },
    {
      "epoch": 0.208,
      "grad_norm": 1.3041158193336297,
      "learning_rate": 9.881857496584726e-05,
      "loss": 0.5774,
      "step": 130
    },
    {
      "epoch": 0.224,
      "grad_norm": 1.2739151782924372,
      "learning_rate": 9.863069253024719e-05,
      "loss": 0.5656,
      "step": 140
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.1009566652519438,
      "learning_rate": 9.842915805643155e-05,
      "loss": 0.5724,
      "step": 150
    },
    {
      "epoch": 0.256,
      "grad_norm": 1.2004741768824123,
      "learning_rate": 9.821402812094073e-05,
      "loss": 0.5796,
      "step": 160
    },
    {
      "epoch": 0.272,
      "grad_norm": 1.5111413381395908,
      "learning_rate": 9.798536311695334e-05,
      "loss": 0.5556,
      "step": 170
    },
    {
      "epoch": 0.288,
      "grad_norm": 1.0331976579035735,
      "learning_rate": 9.774322723733216e-05,
      "loss": 0.5451,
      "step": 180
    },
    {
      "epoch": 0.304,
      "grad_norm": 1.1745428366022237,
      "learning_rate": 9.748768845660334e-05,
      "loss": 0.5605,
      "step": 190
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.067749681555933,
      "learning_rate": 9.721881851187406e-05,
      "loss": 0.5644,
      "step": 200
    },
    {
      "epoch": 0.336,
      "grad_norm": 1.1773627454789788,
      "learning_rate": 9.693669288269372e-05,
      "loss": 0.5658,
      "step": 210
    },
    {
      "epoch": 0.352,
      "grad_norm": 1.4049555802409854,
      "learning_rate": 9.664139076986473e-05,
      "loss": 0.5483,
      "step": 220
    },
    {
      "epoch": 0.368,
      "grad_norm": 1.436137944296166,
      "learning_rate": 9.63329950732086e-05,
      "loss": 0.5376,
      "step": 230
    },
    {
      "epoch": 0.384,
      "grad_norm": 1.2686595633739197,
      "learning_rate": 9.601159236829352e-05,
      "loss": 0.5402,
      "step": 240
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.4062419829256017,
      "learning_rate": 9.567727288213005e-05,
      "loss": 0.5552,
      "step": 250
    },
    {
      "epoch": 0.416,
      "grad_norm": 0.9610102580305002,
      "learning_rate": 9.533013046784189e-05,
      "loss": 0.5118,
      "step": 260
    },
    {
      "epoch": 0.432,
      "grad_norm": 1.2008075396073077,
      "learning_rate": 9.497026257831855e-05,
      "loss": 0.5479,
      "step": 270
    },
    {
      "epoch": 0.448,
      "grad_norm": 1.116478910009909,
      "learning_rate": 9.459777023885755e-05,
      "loss": 0.4773,
      "step": 280
    },
    {
      "epoch": 0.464,
      "grad_norm": 1.2287813088079407,
      "learning_rate": 9.421275801880362e-05,
      "loss": 0.5149,
      "step": 290
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.3153026196754682,
      "learning_rate": 9.381533400219318e-05,
      "loss": 0.5209,
      "step": 300
    },
    {
      "epoch": 0.496,
      "grad_norm": 1.6158648577050707,
      "learning_rate": 9.340560975741197e-05,
      "loss": 0.5065,
      "step": 310
    },
    {
      "epoch": 0.512,
      "grad_norm": 1.1601862022387133,
      "learning_rate": 9.298370030587456e-05,
      "loss": 0.515,
      "step": 320
    },
    {
      "epoch": 0.528,
      "grad_norm": 1.2591523341167126,
      "learning_rate": 9.254972408973461e-05,
      "loss": 0.5375,
      "step": 330
    },
    {
      "epoch": 0.544,
      "grad_norm": 1.0361296810068354,
      "learning_rate": 9.210380293863462e-05,
      "loss": 0.5085,
      "step": 340
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.0654093627590127,
      "learning_rate": 9.164606203550497e-05,
      "loss": 0.5137,
      "step": 350
    },
    {
      "epoch": 0.576,
      "grad_norm": 1.2697508248750302,
      "learning_rate": 9.117662988142138e-05,
      "loss": 0.4906,
      "step": 360
    },
    {
      "epoch": 0.592,
      "grad_norm": 1.2602467399423862,
      "learning_rate": 9.069563825953092e-05,
      "loss": 0.4966,
      "step": 370
    },
    {
      "epoch": 0.608,
      "grad_norm": 2.0044411177134918,
      "learning_rate": 9.020322219805674e-05,
      "loss": 0.5063,
      "step": 380
    },
    {
      "epoch": 0.624,
      "grad_norm": 1.148187026295081,
      "learning_rate": 8.969951993239177e-05,
      "loss": 0.4864,
      "step": 390
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.0529856886801567,
      "learning_rate": 8.9184672866292e-05,
      "loss": 0.4632,
      "step": 400
    },
    {
      "epoch": 0.656,
      "grad_norm": 1.1830984654167522,
      "learning_rate": 8.865882553218037e-05,
      "loss": 0.5097,
      "step": 410
    },
    {
      "epoch": 0.672,
      "grad_norm": 1.1495869848318996,
      "learning_rate": 8.81221255505724e-05,
      "loss": 0.4795,
      "step": 420
    },
    {
      "epoch": 0.688,
      "grad_norm": 1.1235900402260404,
      "learning_rate": 8.757472358863481e-05,
      "loss": 0.5006,
      "step": 430
    },
    {
      "epoch": 0.704,
      "grad_norm": 1.2883600054948787,
      "learning_rate": 8.701677331788891e-05,
      "loss": 0.4822,
      "step": 440
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.0725324329031838,
      "learning_rate": 8.644843137107059e-05,
      "loss": 0.4856,
      "step": 450
    },
    {
      "epoch": 0.736,
      "grad_norm": 2.9167440508075955,
      "learning_rate": 8.586985729815894e-05,
      "loss": 0.4648,
      "step": 460
    },
    {
      "epoch": 0.752,
      "grad_norm": 0.9522185207229502,
      "learning_rate": 8.528121352158604e-05,
      "loss": 0.488,
      "step": 470
    },
    {
      "epoch": 0.768,
      "grad_norm": 1.5907707346034905,
      "learning_rate": 8.468266529064025e-05,
      "loss": 0.5039,
      "step": 480
    },
    {
      "epoch": 0.784,
      "grad_norm": 1.1370580239916313,
      "learning_rate": 8.4074380635076e-05,
      "loss": 0.4737,
      "step": 490
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.9782419363730104,
      "learning_rate": 8.345653031794292e-05,
      "loss": 0.5296,
      "step": 500
    },
    {
      "epoch": 0.816,
      "grad_norm": 1.1919631254125505,
      "learning_rate": 8.282928778764783e-05,
      "loss": 0.4688,
      "step": 510
    },
    {
      "epoch": 0.832,
      "grad_norm": 1.2994447479210165,
      "learning_rate": 8.21928291292627e-05,
      "loss": 0.4675,
      "step": 520
    },
    {
      "epoch": 0.848,
      "grad_norm": 0.9689627531529147,
      "learning_rate": 8.154733301509248e-05,
      "loss": 0.4408,
      "step": 530
    },
    {
      "epoch": 0.864,
      "grad_norm": 1.1649720718750445,
      "learning_rate": 8.089298065451672e-05,
      "loss": 0.4608,
      "step": 540
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.3438713492048824,
      "learning_rate": 8.022995574311876e-05,
      "loss": 0.4476,
      "step": 550
    },
    {
      "epoch": 0.896,
      "grad_norm": 1.0737706296113363,
      "learning_rate": 7.95584444111171e-05,
      "loss": 0.5027,
      "step": 560
    },
    {
      "epoch": 0.912,
      "grad_norm": 1.3469637742497171,
      "learning_rate": 7.887863517111338e-05,
      "loss": 0.4929,
      "step": 570
    },
    {
      "epoch": 0.928,
      "grad_norm": 1.1251117260635928,
      "learning_rate": 7.819071886517134e-05,
      "loss": 0.4739,
      "step": 580
    },
    {
      "epoch": 0.944,
      "grad_norm": 1.0579726403654415,
      "learning_rate": 7.7494888611242e-05,
      "loss": 0.4529,
      "step": 590
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.0498899044694738,
      "learning_rate": 7.679133974894983e-05,
      "loss": 0.4619,
      "step": 600
    },
    {
      "epoch": 0.976,
      "grad_norm": 1.2353618604203835,
      "learning_rate": 7.60802697847554e-05,
      "loss": 0.462,
      "step": 610
    },
    {
      "epoch": 0.992,
      "grad_norm": 1.4618575476184537,
      "learning_rate": 7.536187833650947e-05,
      "loss": 0.4478,
      "step": 620
    },
    {
      "epoch": 1.008,
      "grad_norm": 0.9313196650505406,
      "learning_rate": 7.463636707741458e-05,
      "loss": 0.4304,
      "step": 630
    },
    {
      "epoch": 1.024,
      "grad_norm": 1.1239041555920188,
      "learning_rate": 7.390393967940962e-05,
      "loss": 0.4138,
      "step": 640
    },
    {
      "epoch": 1.04,
      "grad_norm": 1.8736342455754906,
      "learning_rate": 7.316480175599309e-05,
      "loss": 0.3909,
      "step": 650
    },
    {
      "epoch": 1.056,
      "grad_norm": 1.3075877434881118,
      "learning_rate": 7.241916080450163e-05,
      "loss": 0.3873,
      "step": 660
    },
    {
      "epoch": 1.072,
      "grad_norm": 1.1366815390360774,
      "learning_rate": 7.166722614785937e-05,
      "loss": 0.3744,
      "step": 670
    },
    {
      "epoch": 1.088,
      "grad_norm": 1.1482160099998533,
      "learning_rate": 7.090920887581506e-05,
      "loss": 0.3641,
      "step": 680
    },
    {
      "epoch": 1.104,
      "grad_norm": 1.4903122013208903,
      "learning_rate": 7.014532178568314e-05,
      "loss": 0.3708,
      "step": 690
    },
    {
      "epoch": 1.12,
      "grad_norm": 1.2283466862099195,
      "learning_rate": 6.937577932260515e-05,
      "loss": 0.4147,
      "step": 700
    },
    {
      "epoch": 1.1360000000000001,
      "grad_norm": 1.044360506141548,
      "learning_rate": 6.860079751934908e-05,
      "loss": 0.3698,
      "step": 710
    },
    {
      "epoch": 1.152,
      "grad_norm": 1.9659986049038847,
      "learning_rate": 6.782059393566253e-05,
      "loss": 0.3768,
      "step": 720
    },
    {
      "epoch": 1.168,
      "grad_norm": 1.1559700548705432,
      "learning_rate": 6.70353875971976e-05,
      "loss": 0.3713,
      "step": 730
    },
    {
      "epoch": 1.184,
      "grad_norm": 1.3638599004402776,
      "learning_rate": 6.624539893402382e-05,
      "loss": 0.376,
      "step": 740
    },
    {
      "epoch": 1.2,
      "grad_norm": 1.2823520816693201,
      "learning_rate": 6.545084971874738e-05,
      "loss": 0.3762,
      "step": 750
    },
    {
      "epoch": 1.216,
      "grad_norm": 1.1889832443284072,
      "learning_rate": 6.465196300425287e-05,
      "loss": 0.3891,
      "step": 760
    },
    {
      "epoch": 1.232,
      "grad_norm": 1.128194765929284,
      "learning_rate": 6.384896306108612e-05,
      "loss": 0.3772,
      "step": 770
    },
    {
      "epoch": 1.248,
      "grad_norm": 1.4491197750693638,
      "learning_rate": 6.304207531449486e-05,
      "loss": 0.3678,
      "step": 780
    },
    {
      "epoch": 1.264,
      "grad_norm": 1.5686675701147237,
      "learning_rate": 6.223152628114537e-05,
      "loss": 0.3648,
      "step": 790
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.0902715474500886,
      "learning_rate": 6.141754350553279e-05,
      "loss": 0.3909,
      "step": 800
    },
    {
      "epoch": 1.296,
      "grad_norm": 1.6337999430835446,
      "learning_rate": 6.0600355496102745e-05,
      "loss": 0.383,
      "step": 810
    },
    {
      "epoch": 1.312,
      "grad_norm": 1.321659745725057,
      "learning_rate": 5.9780191661102415e-05,
      "loss": 0.3802,
      "step": 820
    },
    {
      "epoch": 1.328,
      "grad_norm": 1.6867441595334431,
      "learning_rate": 5.8957282244179124e-05,
      "loss": 0.423,
      "step": 830
    },
    {
      "epoch": 1.3439999999999999,
      "grad_norm": 1.2441664958096805,
      "learning_rate": 5.813185825974419e-05,
      "loss": 0.3641,
      "step": 840
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 1.6422801866133996,
      "learning_rate": 5.730415142812059e-05,
      "loss": 0.3519,
      "step": 850
    },
    {
      "epoch": 1.376,
      "grad_norm": 1.7154734289625684,
      "learning_rate": 5.6474394110492344e-05,
      "loss": 0.3732,
      "step": 860
    },
    {
      "epoch": 1.392,
      "grad_norm": 1.3529634625288804,
      "learning_rate": 5.564281924367408e-05,
      "loss": 0.3954,
      "step": 870
    },
    {
      "epoch": 1.408,
      "grad_norm": 1.4693831158149995,
      "learning_rate": 5.480966027471889e-05,
      "loss": 0.3669,
      "step": 880
    },
    {
      "epoch": 1.424,
      "grad_norm": 1.5051265025599196,
      "learning_rate": 5.3975151095382995e-05,
      "loss": 0.3994,
      "step": 890
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.2893604881333178,
      "learning_rate": 5.313952597646568e-05,
      "loss": 0.3904,
      "step": 900
    },
    {
      "epoch": 1.456,
      "grad_norm": 1.5665270237662434,
      "learning_rate": 5.230301950204262e-05,
      "loss": 0.3821,
      "step": 910
    },
    {
      "epoch": 1.472,
      "grad_norm": 1.1847281856196625,
      "learning_rate": 5.1465866503611426e-05,
      "loss": 0.3713,
      "step": 920
    },
    {
      "epoch": 1.488,
      "grad_norm": 1.6027343131881702,
      "learning_rate": 5.062830199416764e-05,
      "loss": 0.3785,
      "step": 930
    },
    {
      "epoch": 1.504,
      "grad_norm": 1.1745245908384976,
      "learning_rate": 4.979056110222981e-05,
      "loss": 0.3851,
      "step": 940
    },
    {
      "epoch": 1.52,
      "grad_norm": 1.2252730329400867,
      "learning_rate": 4.895287900583216e-05,
      "loss": 0.3991,
      "step": 950
    },
    {
      "epoch": 1.536,
      "grad_norm": 1.9189789923421308,
      "learning_rate": 4.811549086650327e-05,
      "loss": 0.3726,
      "step": 960
    },
    {
      "epoch": 1.552,
      "grad_norm": 1.9319344659060065,
      "learning_rate": 4.7278631763249554e-05,
      "loss": 0.3703,
      "step": 970
    },
    {
      "epoch": 1.568,
      "grad_norm": 1.3155775136973578,
      "learning_rate": 4.6442536626561675e-05,
      "loss": 0.3462,
      "step": 980
    },
    {
      "epoch": 1.584,
      "grad_norm": 1.3059644425743502,
      "learning_rate": 4.560744017246284e-05,
      "loss": 0.3486,
      "step": 990
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.1918137712202796,
      "learning_rate": 4.477357683661734e-05,
      "loss": 0.3384,
      "step": 1000
    },
    {
      "epoch": 1.616,
      "grad_norm": 1.3051153832513638,
      "learning_rate": 4.394118070851749e-05,
      "loss": 0.3976,
      "step": 1010
    },
    {
      "epoch": 1.6320000000000001,
      "grad_norm": 2.1135344091319266,
      "learning_rate": 4.31104854657681e-05,
      "loss": 0.3607,
      "step": 1020
    },
    {
      "epoch": 1.6480000000000001,
      "grad_norm": 1.3948892979651148,
      "learning_rate": 4.228172430848644e-05,
      "loss": 0.371,
      "step": 1030
    },
    {
      "epoch": 1.6640000000000001,
      "grad_norm": 1.3934093132103698,
      "learning_rate": 4.1455129893836174e-05,
      "loss": 0.3627,
      "step": 1040
    },
    {
      "epoch": 1.6800000000000002,
      "grad_norm": 2.099040035068272,
      "learning_rate": 4.063093427071376e-05,
      "loss": 0.3725,
      "step": 1050
    },
    {
      "epoch": 1.696,
      "grad_norm": 1.1171081747364493,
      "learning_rate": 3.9809368814605766e-05,
      "loss": 0.3466,
      "step": 1060
    },
    {
      "epoch": 1.712,
      "grad_norm": 1.2557303206572503,
      "learning_rate": 3.899066416263493e-05,
      "loss": 0.3629,
      "step": 1070
    },
    {
      "epoch": 1.728,
      "grad_norm": 1.2612524984380895,
      "learning_rate": 3.817505014881378e-05,
      "loss": 0.3941,
      "step": 1080
    },
    {
      "epoch": 1.744,
      "grad_norm": 1.4536732919602855,
      "learning_rate": 3.736275573952354e-05,
      "loss": 0.3362,
      "step": 1090
    },
    {
      "epoch": 1.76,
      "grad_norm": 1.5347325608557592,
      "learning_rate": 3.655400896923672e-05,
      "loss": 0.3197,
      "step": 1100
    },
    {
      "epoch": 1.776,
      "grad_norm": 1.5306810075691544,
      "learning_rate": 3.5749036876501194e-05,
      "loss": 0.3447,
      "step": 1110
    },
    {
      "epoch": 1.792,
      "grad_norm": 1.2300188758500825,
      "learning_rate": 3.494806544020398e-05,
      "loss": 0.3535,
      "step": 1120
    },
    {
      "epoch": 1.808,
      "grad_norm": 1.4275688891160825,
      "learning_rate": 3.4151319516132416e-05,
      "loss": 0.3447,
      "step": 1130
    },
    {
      "epoch": 1.8239999999999998,
      "grad_norm": 1.397634927032295,
      "learning_rate": 3.335902277385067e-05,
      "loss": 0.338,
      "step": 1140
    },
    {
      "epoch": 1.8399999999999999,
      "grad_norm": 1.8570000554042922,
      "learning_rate": 3.257139763390925e-05,
      "loss": 0.3915,
      "step": 1150
    },
    {
      "epoch": 1.8559999999999999,
      "grad_norm": 1.0694824076607796,
      "learning_rate": 3.178866520540509e-05,
      "loss": 0.332,
      "step": 1160
    },
    {
      "epoch": 1.8719999999999999,
      "grad_norm": 1.9227266591369816,
      "learning_rate": 3.101104522390995e-05,
      "loss": 0.3734,
      "step": 1170
    },
    {
      "epoch": 1.888,
      "grad_norm": 1.4206748536497713,
      "learning_rate": 3.023875598978419e-05,
      "loss": 0.3409,
      "step": 1180
    },
    {
      "epoch": 1.904,
      "grad_norm": 1.4359353896901201,
      "learning_rate": 2.9472014306893603e-05,
      "loss": 0.3333,
      "step": 1190
    },
    {
      "epoch": 1.92,
      "grad_norm": 1.2517700720828175,
      "learning_rate": 2.8711035421746367e-05,
      "loss": 0.369,
      "step": 1200
    },
    {
      "epoch": 1.936,
      "grad_norm": 1.3801398040671706,
      "learning_rate": 2.795603296306708e-05,
      "loss": 0.3449,
      "step": 1210
    },
    {
      "epoch": 1.952,
      "grad_norm": 1.8786886188992349,
      "learning_rate": 2.7207218881825014e-05,
      "loss": 0.3662,
      "step": 1220
    },
    {
      "epoch": 1.968,
      "grad_norm": 1.2874259666120602,
      "learning_rate": 2.6464803391733374e-05,
      "loss": 0.3773,
      "step": 1230
    },
    {
      "epoch": 1.984,
      "grad_norm": 1.1788438157215508,
      "learning_rate": 2.5728994910236304e-05,
      "loss": 0.3245,
      "step": 1240
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.5631045640027712,
      "learning_rate": 2.500000000000001e-05,
      "loss": 0.3412,
      "step": 1250
    },
    {
      "epoch": 2.016,
      "grad_norm": 1.4435057320180134,
      "learning_rate": 2.4278023310924673e-05,
      "loss": 0.3027,
      "step": 1260
    },
    {
      "epoch": 2.032,
      "grad_norm": 1.690073924846414,
      "learning_rate": 2.3563267522693415e-05,
      "loss": 0.2745,
      "step": 1270
    },
    {
      "epoch": 2.048,
      "grad_norm": 1.571802508559373,
      "learning_rate": 2.2855933287874138e-05,
      "loss": 0.2686,
      "step": 1280
    },
    {
      "epoch": 2.064,
      "grad_norm": 1.2829706151133036,
      "learning_rate": 2.215621917559062e-05,
      "loss": 0.2628,
      "step": 1290
    },
    {
      "epoch": 2.08,
      "grad_norm": 1.5440500775973827,
      "learning_rate": 2.1464321615778422e-05,
      "loss": 0.274,
      "step": 1300
    },
    {
      "epoch": 2.096,
      "grad_norm": 2.0539984960007627,
      "learning_rate": 2.07804348440414e-05,
      "loss": 0.2711,
      "step": 1310
    },
    {
      "epoch": 2.112,
      "grad_norm": 1.6138729128750948,
      "learning_rate": 2.0104750847124075e-05,
      "loss": 0.2662,
      "step": 1320
    },
    {
      "epoch": 2.128,
      "grad_norm": 1.4560623181879653,
      "learning_rate": 1.9437459309015427e-05,
      "loss": 0.2722,
      "step": 1330
    },
    {
      "epoch": 2.144,
      "grad_norm": 1.4659611175991962,
      "learning_rate": 1.8778747557699224e-05,
      "loss": 0.2965,
      "step": 1340
    },
    {
      "epoch": 2.16,
      "grad_norm": 1.375382006257737,
      "learning_rate": 1.8128800512565513e-05,
      "loss": 0.2789,
      "step": 1350
    },
    {
      "epoch": 2.176,
      "grad_norm": 1.599179489252925,
      "learning_rate": 1.7487800632498545e-05,
      "loss": 0.2727,
      "step": 1360
    },
    {
      "epoch": 2.192,
      "grad_norm": 1.517535594054785,
      "learning_rate": 1.685592786465524e-05,
      "loss": 0.2579,
      "step": 1370
    },
    {
      "epoch": 2.208,
      "grad_norm": 2.1592276480492414,
      "learning_rate": 1.6233359593948777e-05,
      "loss": 0.2639,
      "step": 1380
    },
    {
      "epoch": 2.224,
      "grad_norm": 1.7544518130502498,
      "learning_rate": 1.5620270593251635e-05,
      "loss": 0.2909,
      "step": 1390
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.44885168688834,
      "learning_rate": 1.5016832974331724e-05,
      "loss": 0.2675,
      "step": 1400
    },
    {
      "epoch": 2.2560000000000002,
      "grad_norm": 1.6827124662772142,
      "learning_rate": 1.4423216139535734e-05,
      "loss": 0.2501,
      "step": 1410
    },
    {
      "epoch": 2.2720000000000002,
      "grad_norm": 1.8488651881141411,
      "learning_rate": 1.3839586734232906e-05,
      "loss": 0.28,
      "step": 1420
    },
    {
      "epoch": 2.288,
      "grad_norm": 1.8044696205561654,
      "learning_rate": 1.3266108600032929e-05,
      "loss": 0.2946,
      "step": 1430
    },
    {
      "epoch": 2.304,
      "grad_norm": 1.5680732030217428,
      "learning_rate": 1.2702942728790895e-05,
      "loss": 0.2801,
      "step": 1440
    },
    {
      "epoch": 2.32,
      "grad_norm": 1.8180785380951152,
      "learning_rate": 1.2150247217412186e-05,
      "loss": 0.2686,
      "step": 1450
    },
    {
      "epoch": 2.336,
      "grad_norm": 1.4756615703317884,
      "learning_rate": 1.160817722347014e-05,
      "loss": 0.2623,
      "step": 1460
    },
    {
      "epoch": 2.352,
      "grad_norm": 1.4951599689749675,
      "learning_rate": 1.1076884921648834e-05,
      "loss": 0.2865,
      "step": 1470
    },
    {
      "epoch": 2.368,
      "grad_norm": 1.8073936077115729,
      "learning_rate": 1.0556519461023301e-05,
      "loss": 0.2677,
      "step": 1480
    },
    {
      "epoch": 2.384,
      "grad_norm": 1.6787640384880986,
      "learning_rate": 1.0047226923189024e-05,
      "loss": 0.2821,
      "step": 1490
    },
    {
      "epoch": 2.4,
      "grad_norm": 1.7736081937737844,
      "learning_rate": 9.549150281252633e-06,
      "loss": 0.2628,
      "step": 1500
    },
    {
      "epoch": 2.416,
      "grad_norm": 1.8163690827077064,
      "learning_rate": 9.06242935969528e-06,
      "loss": 0.2648,
      "step": 1510
    },
    {
      "epoch": 2.432,
      "grad_norm": 1.914925459991879,
      "learning_rate": 8.587200795119793e-06,
      "loss": 0.2718,
      "step": 1520
    },
    {
      "epoch": 2.448,
      "grad_norm": 1.557572030989793,
      "learning_rate": 8.123597997892918e-06,
      "loss": 0.2779,
      "step": 1530
    },
    {
      "epoch": 2.464,
      "grad_norm": 2.4189475486471412,
      "learning_rate": 7.671751114693104e-06,
      "loss": 0.2522,
      "step": 1540
    },
    {
      "epoch": 2.48,
      "grad_norm": 1.5979978703748492,
      "learning_rate": 7.2317869919746705e-06,
      "loss": 0.2668,
      "step": 1550
    },
    {
      "epoch": 2.496,
      "grad_norm": 1.5867221943137244,
      "learning_rate": 6.803829140358237e-06,
      "loss": 0.2582,
      "step": 1560
    },
    {
      "epoch": 2.512,
      "grad_norm": 2.0088219455242866,
      "learning_rate": 6.3879976999578154e-06,
      "loss": 0.2509,
      "step": 1570
    },
    {
      "epoch": 2.528,
      "grad_norm": 1.7960925687722837,
      "learning_rate": 5.98440940665399e-06,
      "loss": 0.2743,
      "step": 1580
    },
    {
      "epoch": 2.544,
      "grad_norm": 1.5572986827037758,
      "learning_rate": 5.593177559322777e-06,
      "loss": 0.2833,
      "step": 1590
    },
    {
      "epoch": 2.56,
      "grad_norm": 1.370850993490327,
      "learning_rate": 5.214411988029355e-06,
      "loss": 0.2693,
      "step": 1600
    },
    {
      "epoch": 2.576,
      "grad_norm": 1.3820976570272918,
      "learning_rate": 4.848219023195644e-06,
      "loss": 0.2607,
      "step": 1610
    },
    {
      "epoch": 2.592,
      "grad_norm": 1.7956675550522907,
      "learning_rate": 4.494701465750217e-06,
      "loss": 0.2632,
      "step": 1620
    },
    {
      "epoch": 2.608,
      "grad_norm": 1.657852197945445,
      "learning_rate": 4.153958558269189e-06,
      "loss": 0.2573,
      "step": 1630
    },
    {
      "epoch": 2.624,
      "grad_norm": 1.6754996668537725,
      "learning_rate": 3.826085957115888e-06,
      "loss": 0.2411,
      "step": 1640
    },
    {
      "epoch": 2.64,
      "grad_norm": 1.3160903872279537,
      "learning_rate": 3.511175705587433e-06,
      "loss": 0.2601,
      "step": 1650
    },
    {
      "epoch": 2.656,
      "grad_norm": 1.4417765456611342,
      "learning_rate": 3.2093162080754637e-06,
      "loss": 0.2832,
      "step": 1660
    },
    {
      "epoch": 2.672,
      "grad_norm": 1.5418721821515773,
      "learning_rate": 2.9205922052484958e-06,
      "loss": 0.2725,
      "step": 1670
    },
    {
      "epoch": 2.6879999999999997,
      "grad_norm": 1.3747599439593585,
      "learning_rate": 2.6450847502627884e-06,
      "loss": 0.2654,
      "step": 1680
    },
    {
      "epoch": 2.7039999999999997,
      "grad_norm": 1.528133223907777,
      "learning_rate": 2.3828711860083674e-06,
      "loss": 0.2784,
      "step": 1690
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 2.3538184370373534,
      "learning_rate": 2.134025123396638e-06,
      "loss": 0.2758,
      "step": 1700
    },
    {
      "epoch": 2.7359999999999998,
      "grad_norm": 1.6198355766213741,
      "learning_rate": 1.8986164206957035e-06,
      "loss": 0.2533,
      "step": 1710
    },
    {
      "epoch": 2.752,
      "grad_norm": 1.8976100233670337,
      "learning_rate": 1.6767111639191202e-06,
      "loss": 0.268,
      "step": 1720
    },
    {
      "epoch": 2.768,
      "grad_norm": 1.7822120577723535,
      "learning_rate": 1.4683716482736366e-06,
      "loss": 0.28,
      "step": 1730
    },
    {
      "epoch": 2.784,
      "grad_norm": 2.2850018497482183,
      "learning_rate": 1.2736563606711382e-06,
      "loss": 0.2797,
      "step": 1740
    },
    {
      "epoch": 2.8,
      "grad_norm": 1.79121827664844,
      "learning_rate": 1.0926199633097157e-06,
      "loss": 0.2831,
      "step": 1750
    },
    {
      "epoch": 2.816,
      "grad_norm": 1.7925072153699964,
      "learning_rate": 9.253132783283547e-07,
      "loss": 0.2558,
      "step": 1760
    },
    {
      "epoch": 2.832,
      "grad_norm": 1.2894709082534288,
      "learning_rate": 7.717832735397335e-07,
      "loss": 0.2635,
      "step": 1770
    },
    {
      "epoch": 2.848,
      "grad_norm": 1.4531115959909715,
      "learning_rate": 6.3207304924498e-07,
      "loss": 0.2639,
      "step": 1780
    },
    {
      "epoch": 2.864,
      "grad_norm": 1.9570336366099832,
      "learning_rate": 5.062218261342122e-07,
      "loss": 0.2532,
      "step": 1790
    },
    {
      "epoch": 2.88,
      "grad_norm": 1.3421965286430086,
      "learning_rate": 3.9426493427611177e-07,
      "loss": 0.2327,
      "step": 1800
    },
    {
      "epoch": 2.896,
      "grad_norm": 2.3252685172502465,
      "learning_rate": 2.962338031997691e-07,
      "loss": 0.2621,
      "step": 1810
    },
    {
      "epoch": 2.912,
      "grad_norm": 1.7539361758237508,
      "learning_rate": 2.1215595307154667e-07,
      "loss": 0.256,
      "step": 1820
    },
    {
      "epoch": 2.928,
      "grad_norm": 1.417074579458341,
      "learning_rate": 1.420549869693033e-07,
      "loss": 0.2545,
      "step": 1830
    },
    {
      "epoch": 2.944,
      "grad_norm": 1.7016791883476743,
      "learning_rate": 8.595058425640013e-08,
      "loss": 0.2596,
      "step": 1840
    },
    {
      "epoch": 2.96,
      "grad_norm": 1.6335011497462837,
      "learning_rate": 4.385849505708084e-08,
      "loss": 0.2584,
      "step": 1850
    },
    {
      "epoch": 2.976,
      "grad_norm": 1.7816152560274576,
      "learning_rate": 1.5790535835003008e-08,
      "loss": 0.2752,
      "step": 1860
    },
    {
      "epoch": 2.992,
      "grad_norm": 1.579627845758394,
      "learning_rate": 1.7545860759693445e-09,
      "loss": 0.2778,
      "step": 1870
    },
    {
      "epoch": 3.0,
      "step": 1875,
      "total_flos": 299543991877632.0,
      "train_loss": 0.3980048195521037,
      "train_runtime": 15877.4798,
      "train_samples_per_second": 0.945,
      "train_steps_per_second": 0.118
    }
  ],
  "logging_steps": 10,
  "max_steps": 1875,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 299543991877632.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}