|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 1875, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 1.4735389691796648, |
|
"learning_rate": 9.999298177883903e-05, |
|
"loss": 2.0846, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 1.574973327270963, |
|
"learning_rate": 9.997192908557323e-05, |
|
"loss": 1.0283, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 2.0498267249856488, |
|
"learning_rate": 9.993684783030088e-05, |
|
"loss": 0.686, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 1.2702818493413723, |
|
"learning_rate": 9.988774786134234e-05, |
|
"loss": 0.587, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.6979872055713617, |
|
"learning_rate": 9.982464296247522e-05, |
|
"loss": 0.5667, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 1.0417497433733047, |
|
"learning_rate": 9.974755084906502e-05, |
|
"loss": 0.5057, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 0.8791937337247481, |
|
"learning_rate": 9.965649316309178e-05, |
|
"loss": 0.5095, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.6707662533067278, |
|
"learning_rate": 9.955149546707465e-05, |
|
"loss": 0.5589, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 0.8382574790909684, |
|
"learning_rate": 9.94325872368957e-05, |
|
"loss": 0.4037, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.3977581534529162, |
|
"learning_rate": 9.929980185352526e-05, |
|
"loss": 0.476, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 0.7995346930439355, |
|
"learning_rate": 9.915317659365077e-05, |
|
"loss": 0.4732, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 1.0293778342519209, |
|
"learning_rate": 9.899275261921234e-05, |
|
"loss": 0.4808, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 0.7924054746311283, |
|
"learning_rate": 9.881857496584726e-05, |
|
"loss": 0.4359, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.899393132949154, |
|
"learning_rate": 9.863069253024719e-05, |
|
"loss": 0.377, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.9174662358653375, |
|
"learning_rate": 9.842915805643155e-05, |
|
"loss": 0.4143, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 1.080440113976801, |
|
"learning_rate": 9.821402812094073e-05, |
|
"loss": 0.4396, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 1.7023365502074517, |
|
"learning_rate": 9.798536311695334e-05, |
|
"loss": 0.4219, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 1.0019239067011896, |
|
"learning_rate": 9.774322723733216e-05, |
|
"loss": 0.4189, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.8717425800117149, |
|
"learning_rate": 9.748768845660334e-05, |
|
"loss": 0.399, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.0137511699320638, |
|
"learning_rate": 9.721881851187406e-05, |
|
"loss": 0.434, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.9295547515585622, |
|
"learning_rate": 9.693669288269372e-05, |
|
"loss": 0.3684, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 2.034518417888826, |
|
"learning_rate": 9.664139076986473e-05, |
|
"loss": 0.3468, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 0.9229607408434366, |
|
"learning_rate": 9.63329950732086e-05, |
|
"loss": 0.4119, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 1.0004931328900635, |
|
"learning_rate": 9.601159236829352e-05, |
|
"loss": 0.3904, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.9012457023667851, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 0.423, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.9087395867147162, |
|
"learning_rate": 9.533013046784189e-05, |
|
"loss": 0.3761, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.9173174623162017, |
|
"learning_rate": 9.497026257831855e-05, |
|
"loss": 0.3351, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.7699242514964085, |
|
"learning_rate": 9.459777023885755e-05, |
|
"loss": 0.3282, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.9501543328230019, |
|
"learning_rate": 9.421275801880362e-05, |
|
"loss": 0.4125, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.21516103198593, |
|
"learning_rate": 9.381533400219318e-05, |
|
"loss": 0.3728, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 0.9020121880449892, |
|
"learning_rate": 9.340560975741197e-05, |
|
"loss": 0.3094, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.8137757312519368, |
|
"learning_rate": 9.298370030587456e-05, |
|
"loss": 0.3853, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 1.0040829480628546, |
|
"learning_rate": 9.254972408973461e-05, |
|
"loss": 0.3391, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 1.2904098253200862, |
|
"learning_rate": 9.210380293863462e-05, |
|
"loss": 0.397, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.8956729634317536, |
|
"learning_rate": 9.164606203550497e-05, |
|
"loss": 0.3878, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 1.522665457627724, |
|
"learning_rate": 9.117662988142138e-05, |
|
"loss": 0.3415, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 1.0284487945989331, |
|
"learning_rate": 9.069563825953092e-05, |
|
"loss": 0.4046, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.8916994827228245, |
|
"learning_rate": 9.020322219805674e-05, |
|
"loss": 0.3674, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.8857668635539665, |
|
"learning_rate": 8.969951993239177e-05, |
|
"loss": 0.3478, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.8050131549764538, |
|
"learning_rate": 8.9184672866292e-05, |
|
"loss": 0.375, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 0.8436775930299412, |
|
"learning_rate": 8.865882553218037e-05, |
|
"loss": 0.381, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.849143512269743, |
|
"learning_rate": 8.81221255505724e-05, |
|
"loss": 0.3594, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 0.6585410149269071, |
|
"learning_rate": 8.757472358863481e-05, |
|
"loss": 0.3813, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.7554756750287495, |
|
"learning_rate": 8.701677331788891e-05, |
|
"loss": 0.3614, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6356206323184476, |
|
"learning_rate": 8.644843137107059e-05, |
|
"loss": 0.3459, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 0.7202075692536539, |
|
"learning_rate": 8.586985729815894e-05, |
|
"loss": 0.3318, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 0.971675376179715, |
|
"learning_rate": 8.528121352158604e-05, |
|
"loss": 0.3727, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.5834787172725806, |
|
"learning_rate": 8.468266529064025e-05, |
|
"loss": 0.3283, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 0.9138752831493647, |
|
"learning_rate": 8.4074380635076e-05, |
|
"loss": 0.3466, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.6628442550543788, |
|
"learning_rate": 8.345653031794292e-05, |
|
"loss": 0.3554, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 1.1376540025434019, |
|
"learning_rate": 8.282928778764783e-05, |
|
"loss": 0.3495, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.7241154857095444, |
|
"learning_rate": 8.21928291292627e-05, |
|
"loss": 0.3239, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 0.9670287777704916, |
|
"learning_rate": 8.154733301509248e-05, |
|
"loss": 0.3288, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.7396477232733722, |
|
"learning_rate": 8.089298065451672e-05, |
|
"loss": 0.2646, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.8545577142602087, |
|
"learning_rate": 8.022995574311876e-05, |
|
"loss": 0.3072, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.8730541522413842, |
|
"learning_rate": 7.95584444111171e-05, |
|
"loss": 0.3707, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 0.977218932636251, |
|
"learning_rate": 7.887863517111338e-05, |
|
"loss": 0.3262, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.7836322626790029, |
|
"learning_rate": 7.819071886517134e-05, |
|
"loss": 0.3383, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 1.0708289653309697, |
|
"learning_rate": 7.7494888611242e-05, |
|
"loss": 0.311, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.7697129892326693, |
|
"learning_rate": 7.679133974894983e-05, |
|
"loss": 0.3093, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 0.9966052229687813, |
|
"learning_rate": 7.60802697847554e-05, |
|
"loss": 0.3547, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.6690526939456714, |
|
"learning_rate": 7.536187833650947e-05, |
|
"loss": 0.3314, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.008, |
|
"grad_norm": 0.75821824874568, |
|
"learning_rate": 7.463636707741458e-05, |
|
"loss": 0.3445, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 0.9242081895927273, |
|
"learning_rate": 7.390393967940962e-05, |
|
"loss": 0.2962, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.8108496898331262, |
|
"learning_rate": 7.316480175599309e-05, |
|
"loss": 0.283, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"grad_norm": 0.8921672080734943, |
|
"learning_rate": 7.241916080450163e-05, |
|
"loss": 0.2599, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.072, |
|
"grad_norm": 0.7491655839339207, |
|
"learning_rate": 7.166722614785937e-05, |
|
"loss": 0.28, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 1.0334160738514429, |
|
"learning_rate": 7.090920887581506e-05, |
|
"loss": 0.2348, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.104, |
|
"grad_norm": 0.6704114246937775, |
|
"learning_rate": 7.014532178568314e-05, |
|
"loss": 0.2721, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.899064629989069, |
|
"learning_rate": 6.937577932260515e-05, |
|
"loss": 0.3143, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.1360000000000001, |
|
"grad_norm": 1.0158660631109402, |
|
"learning_rate": 6.860079751934908e-05, |
|
"loss": 0.2943, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.152, |
|
"grad_norm": 0.5842709942575061, |
|
"learning_rate": 6.782059393566253e-05, |
|
"loss": 0.2593, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.168, |
|
"grad_norm": 0.9083888380038305, |
|
"learning_rate": 6.70353875971976e-05, |
|
"loss": 0.3074, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.184, |
|
"grad_norm": 0.9956074892974983, |
|
"learning_rate": 6.624539893402382e-05, |
|
"loss": 0.2933, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.0364494081920752, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 0.2836, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.216, |
|
"grad_norm": 0.8661955533396878, |
|
"learning_rate": 6.465196300425287e-05, |
|
"loss": 0.3033, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.232, |
|
"grad_norm": 1.051469403032706, |
|
"learning_rate": 6.384896306108612e-05, |
|
"loss": 0.3165, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.248, |
|
"grad_norm": 1.1430816578003544, |
|
"learning_rate": 6.304207531449486e-05, |
|
"loss": 0.2612, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.264, |
|
"grad_norm": 0.962612175196701, |
|
"learning_rate": 6.223152628114537e-05, |
|
"loss": 0.2885, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.131939828433401, |
|
"learning_rate": 6.141754350553279e-05, |
|
"loss": 0.3141, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.296, |
|
"grad_norm": 0.9286238079054427, |
|
"learning_rate": 6.0600355496102745e-05, |
|
"loss": 0.2858, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.312, |
|
"grad_norm": 1.0517327514857011, |
|
"learning_rate": 5.9780191661102415e-05, |
|
"loss": 0.2567, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.328, |
|
"grad_norm": 1.2021599801342249, |
|
"learning_rate": 5.8957282244179124e-05, |
|
"loss": 0.2991, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.3439999999999999, |
|
"grad_norm": 1.0319160295400196, |
|
"learning_rate": 5.813185825974419e-05, |
|
"loss": 0.2465, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 1.089231154026881, |
|
"learning_rate": 5.730415142812059e-05, |
|
"loss": 0.2682, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.376, |
|
"grad_norm": 1.0066143745415537, |
|
"learning_rate": 5.6474394110492344e-05, |
|
"loss": 0.2987, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.392, |
|
"grad_norm": 0.795049556260502, |
|
"learning_rate": 5.564281924367408e-05, |
|
"loss": 0.2888, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.408, |
|
"grad_norm": 0.9657717918565559, |
|
"learning_rate": 5.480966027471889e-05, |
|
"loss": 0.2946, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.424, |
|
"grad_norm": 1.0388183419341974, |
|
"learning_rate": 5.3975151095382995e-05, |
|
"loss": 0.282, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.9202218639117007, |
|
"learning_rate": 5.313952597646568e-05, |
|
"loss": 0.2832, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.456, |
|
"grad_norm": 0.8584135552405918, |
|
"learning_rate": 5.230301950204262e-05, |
|
"loss": 0.3014, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"grad_norm": 1.2062260258019901, |
|
"learning_rate": 5.1465866503611426e-05, |
|
"loss": 0.2739, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.488, |
|
"grad_norm": 0.8260337878703716, |
|
"learning_rate": 5.062830199416764e-05, |
|
"loss": 0.2696, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.504, |
|
"grad_norm": 0.8342686340750022, |
|
"learning_rate": 4.979056110222981e-05, |
|
"loss": 0.2958, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.06260253365328, |
|
"learning_rate": 4.895287900583216e-05, |
|
"loss": 0.2482, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.536, |
|
"grad_norm": 0.9317052193070622, |
|
"learning_rate": 4.811549086650327e-05, |
|
"loss": 0.2789, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.552, |
|
"grad_norm": 0.8622399411448186, |
|
"learning_rate": 4.7278631763249554e-05, |
|
"loss": 0.2563, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.568, |
|
"grad_norm": 0.8857610071118638, |
|
"learning_rate": 4.6442536626561675e-05, |
|
"loss": 0.259, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.584, |
|
"grad_norm": 0.835587570091306, |
|
"learning_rate": 4.560744017246284e-05, |
|
"loss": 0.2623, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.2092740152455195, |
|
"learning_rate": 4.477357683661734e-05, |
|
"loss": 0.2867, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.616, |
|
"grad_norm": 1.0784101902719292, |
|
"learning_rate": 4.394118070851749e-05, |
|
"loss": 0.2851, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.6320000000000001, |
|
"grad_norm": 0.6431130954370421, |
|
"learning_rate": 4.31104854657681e-05, |
|
"loss": 0.269, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.6480000000000001, |
|
"grad_norm": 0.9482573850706428, |
|
"learning_rate": 4.228172430848644e-05, |
|
"loss": 0.2526, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.6640000000000001, |
|
"grad_norm": 1.0967329813764932, |
|
"learning_rate": 4.1455129893836174e-05, |
|
"loss": 0.2788, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 1.1253690039254327, |
|
"learning_rate": 4.063093427071376e-05, |
|
"loss": 0.2439, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.696, |
|
"grad_norm": 0.8922820572124287, |
|
"learning_rate": 3.9809368814605766e-05, |
|
"loss": 0.2701, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.712, |
|
"grad_norm": 1.136807663569628, |
|
"learning_rate": 3.899066416263493e-05, |
|
"loss": 0.2665, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.728, |
|
"grad_norm": 1.2239439019933025, |
|
"learning_rate": 3.817505014881378e-05, |
|
"loss": 0.3048, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.744, |
|
"grad_norm": 0.9827246945743642, |
|
"learning_rate": 3.736275573952354e-05, |
|
"loss": 0.2298, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.7866612401659844, |
|
"learning_rate": 3.655400896923672e-05, |
|
"loss": 0.2568, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.776, |
|
"grad_norm": 1.022737765446411, |
|
"learning_rate": 3.5749036876501194e-05, |
|
"loss": 0.2685, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.792, |
|
"grad_norm": 1.0652939313746896, |
|
"learning_rate": 3.494806544020398e-05, |
|
"loss": 0.2675, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.808, |
|
"grad_norm": 0.9728776797500178, |
|
"learning_rate": 3.4151319516132416e-05, |
|
"loss": 0.2539, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.8239999999999998, |
|
"grad_norm": 1.2874137173478253, |
|
"learning_rate": 3.335902277385067e-05, |
|
"loss": 0.26, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 1.0831032362899013, |
|
"learning_rate": 3.257139763390925e-05, |
|
"loss": 0.262, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.8559999999999999, |
|
"grad_norm": 1.0111985690545156, |
|
"learning_rate": 3.178866520540509e-05, |
|
"loss": 0.2381, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.8719999999999999, |
|
"grad_norm": 0.9036562356484744, |
|
"learning_rate": 3.101104522390995e-05, |
|
"loss": 0.2682, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.888, |
|
"grad_norm": 1.5014307278329708, |
|
"learning_rate": 3.023875598978419e-05, |
|
"loss": 0.2902, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.904, |
|
"grad_norm": 0.9785446712411401, |
|
"learning_rate": 2.9472014306893603e-05, |
|
"loss": 0.2624, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 1.1319999391176576, |
|
"learning_rate": 2.8711035421746367e-05, |
|
"loss": 0.2686, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.936, |
|
"grad_norm": 1.1056089612154159, |
|
"learning_rate": 2.795603296306708e-05, |
|
"loss": 0.2443, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.952, |
|
"grad_norm": 0.8376972874868366, |
|
"learning_rate": 2.7207218881825014e-05, |
|
"loss": 0.2752, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.968, |
|
"grad_norm": 1.34403239117131, |
|
"learning_rate": 2.6464803391733374e-05, |
|
"loss": 0.3108, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.984, |
|
"grad_norm": 1.0963825979352566, |
|
"learning_rate": 2.5728994910236304e-05, |
|
"loss": 0.2517, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.0786181456785247, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 0.2494, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.016, |
|
"grad_norm": 1.0920867485450798, |
|
"learning_rate": 2.4278023310924673e-05, |
|
"loss": 0.2276, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.032, |
|
"grad_norm": 0.9377652164830973, |
|
"learning_rate": 2.3563267522693415e-05, |
|
"loss": 0.2145, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.048, |
|
"grad_norm": 0.9386939202846237, |
|
"learning_rate": 2.2855933287874138e-05, |
|
"loss": 0.1993, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.064, |
|
"grad_norm": 1.08768521136953, |
|
"learning_rate": 2.215621917559062e-05, |
|
"loss": 0.2402, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 1.1126946097754096, |
|
"learning_rate": 2.1464321615778422e-05, |
|
"loss": 0.198, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.096, |
|
"grad_norm": 1.144472975970789, |
|
"learning_rate": 2.07804348440414e-05, |
|
"loss": 0.1847, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.112, |
|
"grad_norm": 1.0016887934690482, |
|
"learning_rate": 2.0104750847124075e-05, |
|
"loss": 0.2217, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.128, |
|
"grad_norm": 0.8561793503114183, |
|
"learning_rate": 1.9437459309015427e-05, |
|
"loss": 0.219, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.144, |
|
"grad_norm": 1.3046968600575892, |
|
"learning_rate": 1.8778747557699224e-05, |
|
"loss": 0.2219, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 1.2565120133034997, |
|
"learning_rate": 1.8128800512565513e-05, |
|
"loss": 0.2043, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.176, |
|
"grad_norm": 1.0340866475830222, |
|
"learning_rate": 1.7487800632498545e-05, |
|
"loss": 0.1848, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.192, |
|
"grad_norm": 1.2687371164495156, |
|
"learning_rate": 1.685592786465524e-05, |
|
"loss": 0.2063, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.208, |
|
"grad_norm": 4.410575395721385, |
|
"learning_rate": 1.6233359593948777e-05, |
|
"loss": 0.203, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.224, |
|
"grad_norm": 0.896414168502828, |
|
"learning_rate": 1.5620270593251635e-05, |
|
"loss": 0.1998, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 1.1594043127204923, |
|
"learning_rate": 1.5016832974331724e-05, |
|
"loss": 0.2029, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.2560000000000002, |
|
"grad_norm": 0.9694525860243126, |
|
"learning_rate": 1.4423216139535734e-05, |
|
"loss": 0.1829, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.2720000000000002, |
|
"grad_norm": 0.7781970327573788, |
|
"learning_rate": 1.3839586734232906e-05, |
|
"loss": 0.2131, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.288, |
|
"grad_norm": 0.9924970297588063, |
|
"learning_rate": 1.3266108600032929e-05, |
|
"loss": 0.2099, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.304, |
|
"grad_norm": 1.0619144372673828, |
|
"learning_rate": 1.2702942728790895e-05, |
|
"loss": 0.203, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 1.1568706885012912, |
|
"learning_rate": 1.2150247217412186e-05, |
|
"loss": 0.2154, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.336, |
|
"grad_norm": 1.0110226875096215, |
|
"learning_rate": 1.160817722347014e-05, |
|
"loss": 0.1994, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.352, |
|
"grad_norm": 1.2186938894345578, |
|
"learning_rate": 1.1076884921648834e-05, |
|
"loss": 0.2255, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.368, |
|
"grad_norm": 1.0321133111544947, |
|
"learning_rate": 1.0556519461023301e-05, |
|
"loss": 0.2216, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.384, |
|
"grad_norm": 1.154074546283637, |
|
"learning_rate": 1.0047226923189024e-05, |
|
"loss": 0.1979, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.1620581009185202, |
|
"learning_rate": 9.549150281252633e-06, |
|
"loss": 0.2081, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.416, |
|
"grad_norm": 1.0946269046711545, |
|
"learning_rate": 9.06242935969528e-06, |
|
"loss": 0.2061, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.432, |
|
"grad_norm": 1.3740666369662087, |
|
"learning_rate": 8.587200795119793e-06, |
|
"loss": 0.2066, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.448, |
|
"grad_norm": 0.9801350633493024, |
|
"learning_rate": 8.123597997892918e-06, |
|
"loss": 0.2144, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.464, |
|
"grad_norm": 0.9114459647246573, |
|
"learning_rate": 7.671751114693104e-06, |
|
"loss": 0.1854, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 1.020041105806084, |
|
"learning_rate": 7.2317869919746705e-06, |
|
"loss": 0.2095, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.496, |
|
"grad_norm": 0.8590881829290263, |
|
"learning_rate": 6.803829140358237e-06, |
|
"loss": 0.1815, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.512, |
|
"grad_norm": 0.8341531305601159, |
|
"learning_rate": 6.3879976999578154e-06, |
|
"loss": 0.2091, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.528, |
|
"grad_norm": 1.0395797814111882, |
|
"learning_rate": 5.98440940665399e-06, |
|
"loss": 0.2112, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.544, |
|
"grad_norm": 0.950529878731894, |
|
"learning_rate": 5.593177559322777e-06, |
|
"loss": 0.1965, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 1.274310745570913, |
|
"learning_rate": 5.214411988029355e-06, |
|
"loss": 0.209, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.576, |
|
"grad_norm": 1.2921735659034017, |
|
"learning_rate": 4.848219023195644e-06, |
|
"loss": 0.2227, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.592, |
|
"grad_norm": 1.0127209415412022, |
|
"learning_rate": 4.494701465750217e-06, |
|
"loss": 0.1875, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.608, |
|
"grad_norm": 1.068518391858498, |
|
"learning_rate": 4.153958558269189e-06, |
|
"loss": 0.1887, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.624, |
|
"grad_norm": 1.1843724236696493, |
|
"learning_rate": 3.826085957115888e-06, |
|
"loss": 0.1833, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 1.3793247614185251, |
|
"learning_rate": 3.511175705587433e-06, |
|
"loss": 0.2001, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.656, |
|
"grad_norm": 1.1786689211718118, |
|
"learning_rate": 3.2093162080754637e-06, |
|
"loss": 0.2341, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.672, |
|
"grad_norm": 1.1876075782168105, |
|
"learning_rate": 2.9205922052484958e-06, |
|
"loss": 0.2251, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.6879999999999997, |
|
"grad_norm": 1.1105275881217318, |
|
"learning_rate": 2.6450847502627884e-06, |
|
"loss": 0.1915, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.7039999999999997, |
|
"grad_norm": 1.2615653662572164, |
|
"learning_rate": 2.3828711860083674e-06, |
|
"loss": 0.2241, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.7199999999999998, |
|
"grad_norm": 1.2336183045142934, |
|
"learning_rate": 2.134025123396638e-06, |
|
"loss": 0.1899, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.7359999999999998, |
|
"grad_norm": 1.2995831378990832, |
|
"learning_rate": 1.8986164206957035e-06, |
|
"loss": 0.1965, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.752, |
|
"grad_norm": 1.0738462699292388, |
|
"learning_rate": 1.6767111639191202e-06, |
|
"loss": 0.2192, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.768, |
|
"grad_norm": 1.0132612835074417, |
|
"learning_rate": 1.4683716482736366e-06, |
|
"loss": 0.2056, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.784, |
|
"grad_norm": 1.0015783236034024, |
|
"learning_rate": 1.2736563606711382e-06, |
|
"loss": 0.1803, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.2842164055412226, |
|
"learning_rate": 1.0926199633097157e-06, |
|
"loss": 0.24, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.816, |
|
"grad_norm": 1.0294787685775484, |
|
"learning_rate": 9.253132783283547e-07, |
|
"loss": 0.1914, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.832, |
|
"grad_norm": 1.0434750413131526, |
|
"learning_rate": 7.717832735397335e-07, |
|
"loss": 0.1935, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.848, |
|
"grad_norm": 1.0184364682600762, |
|
"learning_rate": 6.3207304924498e-07, |
|
"loss": 0.1771, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.864, |
|
"grad_norm": 1.0821144737056598, |
|
"learning_rate": 5.062218261342122e-07, |
|
"loss": 0.1955, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.1413490215179685, |
|
"learning_rate": 3.9426493427611177e-07, |
|
"loss": 0.1867, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.896, |
|
"grad_norm": 1.1954337890031224, |
|
"learning_rate": 2.962338031997691e-07, |
|
"loss": 0.2164, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.912, |
|
"grad_norm": 1.24853908918597, |
|
"learning_rate": 2.1215595307154667e-07, |
|
"loss": 0.2154, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.928, |
|
"grad_norm": 1.1438437987577612, |
|
"learning_rate": 1.420549869693033e-07, |
|
"loss": 0.2006, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.944, |
|
"grad_norm": 1.106366481038807, |
|
"learning_rate": 8.595058425640013e-08, |
|
"loss": 0.1884, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.0427024327416816, |
|
"learning_rate": 4.385849505708084e-08, |
|
"loss": 0.1914, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.976, |
|
"grad_norm": 1.379711472754783, |
|
"learning_rate": 1.5790535835003008e-08, |
|
"loss": 0.2049, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.992, |
|
"grad_norm": 1.095540989141702, |
|
"learning_rate": 1.7545860759693445e-09, |
|
"loss": 0.2074, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 1875, |
|
"total_flos": 302283383111680.0, |
|
"train_loss": 0.3027080503463745, |
|
"train_runtime": 17327.47, |
|
"train_samples_per_second": 0.866, |
|
"train_steps_per_second": 0.108 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1875, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 302283383111680.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|