{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1875, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 1.4735389691796648, "learning_rate": 9.999298177883903e-05, "loss": 2.0846, "step": 10 }, { "epoch": 0.032, "grad_norm": 1.574973327270963, "learning_rate": 9.997192908557323e-05, "loss": 1.0283, "step": 20 }, { "epoch": 0.048, "grad_norm": 2.0498267249856488, "learning_rate": 9.993684783030088e-05, "loss": 0.686, "step": 30 }, { "epoch": 0.064, "grad_norm": 1.2702818493413723, "learning_rate": 9.988774786134234e-05, "loss": 0.587, "step": 40 }, { "epoch": 0.08, "grad_norm": 0.6979872055713617, "learning_rate": 9.982464296247522e-05, "loss": 0.5667, "step": 50 }, { "epoch": 0.096, "grad_norm": 1.0417497433733047, "learning_rate": 9.974755084906502e-05, "loss": 0.5057, "step": 60 }, { "epoch": 0.112, "grad_norm": 0.8791937337247481, "learning_rate": 9.965649316309178e-05, "loss": 0.5095, "step": 70 }, { "epoch": 0.128, "grad_norm": 0.6707662533067278, "learning_rate": 9.955149546707465e-05, "loss": 0.5589, "step": 80 }, { "epoch": 0.144, "grad_norm": 0.8382574790909684, "learning_rate": 9.94325872368957e-05, "loss": 0.4037, "step": 90 }, { "epoch": 0.16, "grad_norm": 1.3977581534529162, "learning_rate": 9.929980185352526e-05, "loss": 0.476, "step": 100 }, { "epoch": 0.176, "grad_norm": 0.7995346930439355, "learning_rate": 9.915317659365077e-05, "loss": 0.4732, "step": 110 }, { "epoch": 0.192, "grad_norm": 1.0293778342519209, "learning_rate": 9.899275261921234e-05, "loss": 0.4808, "step": 120 }, { "epoch": 0.208, "grad_norm": 0.7924054746311283, "learning_rate": 9.881857496584726e-05, "loss": 0.4359, "step": 130 }, { "epoch": 0.224, "grad_norm": 0.899393132949154, "learning_rate": 9.863069253024719e-05, "loss": 0.377, "step": 140 }, { "epoch": 0.24, "grad_norm": 0.9174662358653375, "learning_rate": 9.842915805643155e-05, "loss": 0.4143, "step": 150 }, { "epoch": 0.256, "grad_norm": 1.080440113976801, "learning_rate": 9.821402812094073e-05, "loss": 0.4396, "step": 160 }, { "epoch": 0.272, "grad_norm": 1.7023365502074517, "learning_rate": 9.798536311695334e-05, "loss": 0.4219, "step": 170 }, { "epoch": 0.288, "grad_norm": 1.0019239067011896, "learning_rate": 9.774322723733216e-05, "loss": 0.4189, "step": 180 }, { "epoch": 0.304, "grad_norm": 0.8717425800117149, "learning_rate": 9.748768845660334e-05, "loss": 0.399, "step": 190 }, { "epoch": 0.32, "grad_norm": 1.0137511699320638, "learning_rate": 9.721881851187406e-05, "loss": 0.434, "step": 200 }, { "epoch": 0.336, "grad_norm": 0.9295547515585622, "learning_rate": 9.693669288269372e-05, "loss": 0.3684, "step": 210 }, { "epoch": 0.352, "grad_norm": 2.034518417888826, "learning_rate": 9.664139076986473e-05, "loss": 0.3468, "step": 220 }, { "epoch": 0.368, "grad_norm": 0.9229607408434366, "learning_rate": 9.63329950732086e-05, "loss": 0.4119, "step": 230 }, { "epoch": 0.384, "grad_norm": 1.0004931328900635, "learning_rate": 9.601159236829352e-05, "loss": 0.3904, "step": 240 }, { "epoch": 0.4, "grad_norm": 0.9012457023667851, "learning_rate": 9.567727288213005e-05, "loss": 0.423, "step": 250 }, { "epoch": 0.416, "grad_norm": 0.9087395867147162, "learning_rate": 9.533013046784189e-05, "loss": 0.3761, "step": 260 }, { "epoch": 0.432, "grad_norm": 0.9173174623162017, "learning_rate": 9.497026257831855e-05, "loss": 0.3351, "step": 270 }, { "epoch": 0.448, "grad_norm": 0.7699242514964085, "learning_rate": 9.459777023885755e-05, "loss": 0.3282, "step": 280 }, { "epoch": 0.464, "grad_norm": 0.9501543328230019, "learning_rate": 9.421275801880362e-05, "loss": 0.4125, "step": 290 }, { "epoch": 0.48, "grad_norm": 1.21516103198593, "learning_rate": 9.381533400219318e-05, "loss": 0.3728, "step": 300 }, { "epoch": 0.496, "grad_norm": 0.9020121880449892, "learning_rate": 9.340560975741197e-05, "loss": 0.3094, "step": 310 }, { "epoch": 0.512, "grad_norm": 0.8137757312519368, "learning_rate": 9.298370030587456e-05, "loss": 0.3853, "step": 320 }, { "epoch": 0.528, "grad_norm": 1.0040829480628546, "learning_rate": 9.254972408973461e-05, "loss": 0.3391, "step": 330 }, { "epoch": 0.544, "grad_norm": 1.2904098253200862, "learning_rate": 9.210380293863462e-05, "loss": 0.397, "step": 340 }, { "epoch": 0.56, "grad_norm": 0.8956729634317536, "learning_rate": 9.164606203550497e-05, "loss": 0.3878, "step": 350 }, { "epoch": 0.576, "grad_norm": 1.522665457627724, "learning_rate": 9.117662988142138e-05, "loss": 0.3415, "step": 360 }, { "epoch": 0.592, "grad_norm": 1.0284487945989331, "learning_rate": 9.069563825953092e-05, "loss": 0.4046, "step": 370 }, { "epoch": 0.608, "grad_norm": 0.8916994827228245, "learning_rate": 9.020322219805674e-05, "loss": 0.3674, "step": 380 }, { "epoch": 0.624, "grad_norm": 0.8857668635539665, "learning_rate": 8.969951993239177e-05, "loss": 0.3478, "step": 390 }, { "epoch": 0.64, "grad_norm": 0.8050131549764538, "learning_rate": 8.9184672866292e-05, "loss": 0.375, "step": 400 }, { "epoch": 0.656, "grad_norm": 0.8436775930299412, "learning_rate": 8.865882553218037e-05, "loss": 0.381, "step": 410 }, { "epoch": 0.672, "grad_norm": 0.849143512269743, "learning_rate": 8.81221255505724e-05, "loss": 0.3594, "step": 420 }, { "epoch": 0.688, "grad_norm": 0.6585410149269071, "learning_rate": 8.757472358863481e-05, "loss": 0.3813, "step": 430 }, { "epoch": 0.704, "grad_norm": 0.7554756750287495, "learning_rate": 8.701677331788891e-05, "loss": 0.3614, "step": 440 }, { "epoch": 0.72, "grad_norm": 0.6356206323184476, "learning_rate": 8.644843137107059e-05, "loss": 0.3459, "step": 450 }, { "epoch": 0.736, "grad_norm": 0.7202075692536539, "learning_rate": 8.586985729815894e-05, "loss": 0.3318, "step": 460 }, { "epoch": 0.752, "grad_norm": 0.971675376179715, "learning_rate": 8.528121352158604e-05, "loss": 0.3727, "step": 470 }, { "epoch": 0.768, "grad_norm": 0.5834787172725806, "learning_rate": 8.468266529064025e-05, "loss": 0.3283, "step": 480 }, { "epoch": 0.784, "grad_norm": 0.9138752831493647, "learning_rate": 8.4074380635076e-05, "loss": 0.3466, "step": 490 }, { "epoch": 0.8, "grad_norm": 0.6628442550543788, "learning_rate": 8.345653031794292e-05, "loss": 0.3554, "step": 500 }, { "epoch": 0.816, "grad_norm": 1.1376540025434019, "learning_rate": 8.282928778764783e-05, "loss": 0.3495, "step": 510 }, { "epoch": 0.832, "grad_norm": 0.7241154857095444, "learning_rate": 8.21928291292627e-05, "loss": 0.3239, "step": 520 }, { "epoch": 0.848, "grad_norm": 0.9670287777704916, "learning_rate": 8.154733301509248e-05, "loss": 0.3288, "step": 530 }, { "epoch": 0.864, "grad_norm": 0.7396477232733722, "learning_rate": 8.089298065451672e-05, "loss": 0.2646, "step": 540 }, { "epoch": 0.88, "grad_norm": 0.8545577142602087, "learning_rate": 8.022995574311876e-05, "loss": 0.3072, "step": 550 }, { "epoch": 0.896, "grad_norm": 0.8730541522413842, "learning_rate": 7.95584444111171e-05, "loss": 0.3707, "step": 560 }, { "epoch": 0.912, "grad_norm": 0.977218932636251, "learning_rate": 7.887863517111338e-05, "loss": 0.3262, "step": 570 }, { "epoch": 0.928, "grad_norm": 0.7836322626790029, "learning_rate": 7.819071886517134e-05, "loss": 0.3383, "step": 580 }, { "epoch": 0.944, "grad_norm": 1.0708289653309697, "learning_rate": 7.7494888611242e-05, "loss": 0.311, "step": 590 }, { "epoch": 0.96, "grad_norm": 0.7697129892326693, "learning_rate": 7.679133974894983e-05, "loss": 0.3093, "step": 600 }, { "epoch": 0.976, "grad_norm": 0.9966052229687813, "learning_rate": 7.60802697847554e-05, "loss": 0.3547, "step": 610 }, { "epoch": 0.992, "grad_norm": 0.6690526939456714, "learning_rate": 7.536187833650947e-05, "loss": 0.3314, "step": 620 }, { "epoch": 1.008, "grad_norm": 0.75821824874568, "learning_rate": 7.463636707741458e-05, "loss": 0.3445, "step": 630 }, { "epoch": 1.024, "grad_norm": 0.9242081895927273, "learning_rate": 7.390393967940962e-05, "loss": 0.2962, "step": 640 }, { "epoch": 1.04, "grad_norm": 0.8108496898331262, "learning_rate": 7.316480175599309e-05, "loss": 0.283, "step": 650 }, { "epoch": 1.056, "grad_norm": 0.8921672080734943, "learning_rate": 7.241916080450163e-05, "loss": 0.2599, "step": 660 }, { "epoch": 1.072, "grad_norm": 0.7491655839339207, "learning_rate": 7.166722614785937e-05, "loss": 0.28, "step": 670 }, { "epoch": 1.088, "grad_norm": 1.0334160738514429, "learning_rate": 7.090920887581506e-05, "loss": 0.2348, "step": 680 }, { "epoch": 1.104, "grad_norm": 0.6704114246937775, "learning_rate": 7.014532178568314e-05, "loss": 0.2721, "step": 690 }, { "epoch": 1.12, "grad_norm": 0.899064629989069, "learning_rate": 6.937577932260515e-05, "loss": 0.3143, "step": 700 }, { "epoch": 1.1360000000000001, "grad_norm": 1.0158660631109402, "learning_rate": 6.860079751934908e-05, "loss": 0.2943, "step": 710 }, { "epoch": 1.152, "grad_norm": 0.5842709942575061, "learning_rate": 6.782059393566253e-05, "loss": 0.2593, "step": 720 }, { "epoch": 1.168, "grad_norm": 0.9083888380038305, "learning_rate": 6.70353875971976e-05, "loss": 0.3074, "step": 730 }, { "epoch": 1.184, "grad_norm": 0.9956074892974983, "learning_rate": 6.624539893402382e-05, "loss": 0.2933, "step": 740 }, { "epoch": 1.2, "grad_norm": 1.0364494081920752, "learning_rate": 6.545084971874738e-05, "loss": 0.2836, "step": 750 }, { "epoch": 1.216, "grad_norm": 0.8661955533396878, "learning_rate": 6.465196300425287e-05, "loss": 0.3033, "step": 760 }, { "epoch": 1.232, "grad_norm": 1.051469403032706, "learning_rate": 6.384896306108612e-05, "loss": 0.3165, "step": 770 }, { "epoch": 1.248, "grad_norm": 1.1430816578003544, "learning_rate": 6.304207531449486e-05, "loss": 0.2612, "step": 780 }, { "epoch": 1.264, "grad_norm": 0.962612175196701, "learning_rate": 6.223152628114537e-05, "loss": 0.2885, "step": 790 }, { "epoch": 1.28, "grad_norm": 1.131939828433401, "learning_rate": 6.141754350553279e-05, "loss": 0.3141, "step": 800 }, { "epoch": 1.296, "grad_norm": 0.9286238079054427, "learning_rate": 6.0600355496102745e-05, "loss": 0.2858, "step": 810 }, { "epoch": 1.312, "grad_norm": 1.0517327514857011, "learning_rate": 5.9780191661102415e-05, "loss": 0.2567, "step": 820 }, { "epoch": 1.328, "grad_norm": 1.2021599801342249, "learning_rate": 5.8957282244179124e-05, "loss": 0.2991, "step": 830 }, { "epoch": 1.3439999999999999, "grad_norm": 1.0319160295400196, "learning_rate": 5.813185825974419e-05, "loss": 0.2465, "step": 840 }, { "epoch": 1.3599999999999999, "grad_norm": 1.089231154026881, "learning_rate": 5.730415142812059e-05, "loss": 0.2682, "step": 850 }, { "epoch": 1.376, "grad_norm": 1.0066143745415537, "learning_rate": 5.6474394110492344e-05, "loss": 0.2987, "step": 860 }, { "epoch": 1.392, "grad_norm": 0.795049556260502, "learning_rate": 5.564281924367408e-05, "loss": 0.2888, "step": 870 }, { "epoch": 1.408, "grad_norm": 0.9657717918565559, "learning_rate": 5.480966027471889e-05, "loss": 0.2946, "step": 880 }, { "epoch": 1.424, "grad_norm": 1.0388183419341974, "learning_rate": 5.3975151095382995e-05, "loss": 0.282, "step": 890 }, { "epoch": 1.44, "grad_norm": 0.9202218639117007, "learning_rate": 5.313952597646568e-05, "loss": 0.2832, "step": 900 }, { "epoch": 1.456, "grad_norm": 0.8584135552405918, "learning_rate": 5.230301950204262e-05, "loss": 0.3014, "step": 910 }, { "epoch": 1.472, "grad_norm": 1.2062260258019901, "learning_rate": 5.1465866503611426e-05, "loss": 0.2739, "step": 920 }, { "epoch": 1.488, "grad_norm": 0.8260337878703716, "learning_rate": 5.062830199416764e-05, "loss": 0.2696, "step": 930 }, { "epoch": 1.504, "grad_norm": 0.8342686340750022, "learning_rate": 4.979056110222981e-05, "loss": 0.2958, "step": 940 }, { "epoch": 1.52, "grad_norm": 1.06260253365328, "learning_rate": 4.895287900583216e-05, "loss": 0.2482, "step": 950 }, { "epoch": 1.536, "grad_norm": 0.9317052193070622, "learning_rate": 4.811549086650327e-05, "loss": 0.2789, "step": 960 }, { "epoch": 1.552, "grad_norm": 0.8622399411448186, "learning_rate": 4.7278631763249554e-05, "loss": 0.2563, "step": 970 }, { "epoch": 1.568, "grad_norm": 0.8857610071118638, "learning_rate": 4.6442536626561675e-05, "loss": 0.259, "step": 980 }, { "epoch": 1.584, "grad_norm": 0.835587570091306, "learning_rate": 4.560744017246284e-05, "loss": 0.2623, "step": 990 }, { "epoch": 1.6, "grad_norm": 1.2092740152455195, "learning_rate": 4.477357683661734e-05, "loss": 0.2867, "step": 1000 }, { "epoch": 1.616, "grad_norm": 1.0784101902719292, "learning_rate": 4.394118070851749e-05, "loss": 0.2851, "step": 1010 }, { "epoch": 1.6320000000000001, "grad_norm": 0.6431130954370421, "learning_rate": 4.31104854657681e-05, "loss": 0.269, "step": 1020 }, { "epoch": 1.6480000000000001, "grad_norm": 0.9482573850706428, "learning_rate": 4.228172430848644e-05, "loss": 0.2526, "step": 1030 }, { "epoch": 1.6640000000000001, "grad_norm": 1.0967329813764932, "learning_rate": 4.1455129893836174e-05, "loss": 0.2788, "step": 1040 }, { "epoch": 1.6800000000000002, "grad_norm": 1.1253690039254327, "learning_rate": 4.063093427071376e-05, "loss": 0.2439, "step": 1050 }, { "epoch": 1.696, "grad_norm": 0.8922820572124287, "learning_rate": 3.9809368814605766e-05, "loss": 0.2701, "step": 1060 }, { "epoch": 1.712, "grad_norm": 1.136807663569628, "learning_rate": 3.899066416263493e-05, "loss": 0.2665, "step": 1070 }, { "epoch": 1.728, "grad_norm": 1.2239439019933025, "learning_rate": 3.817505014881378e-05, "loss": 0.3048, "step": 1080 }, { "epoch": 1.744, "grad_norm": 0.9827246945743642, "learning_rate": 3.736275573952354e-05, "loss": 0.2298, "step": 1090 }, { "epoch": 1.76, "grad_norm": 0.7866612401659844, "learning_rate": 3.655400896923672e-05, "loss": 0.2568, "step": 1100 }, { "epoch": 1.776, "grad_norm": 1.022737765446411, "learning_rate": 3.5749036876501194e-05, "loss": 0.2685, "step": 1110 }, { "epoch": 1.792, "grad_norm": 1.0652939313746896, "learning_rate": 3.494806544020398e-05, "loss": 0.2675, "step": 1120 }, { "epoch": 1.808, "grad_norm": 0.9728776797500178, "learning_rate": 3.4151319516132416e-05, "loss": 0.2539, "step": 1130 }, { "epoch": 1.8239999999999998, "grad_norm": 1.2874137173478253, "learning_rate": 3.335902277385067e-05, "loss": 0.26, "step": 1140 }, { "epoch": 1.8399999999999999, "grad_norm": 1.0831032362899013, "learning_rate": 3.257139763390925e-05, "loss": 0.262, "step": 1150 }, { "epoch": 1.8559999999999999, "grad_norm": 1.0111985690545156, "learning_rate": 3.178866520540509e-05, "loss": 0.2381, "step": 1160 }, { "epoch": 1.8719999999999999, "grad_norm": 0.9036562356484744, "learning_rate": 3.101104522390995e-05, "loss": 0.2682, "step": 1170 }, { "epoch": 1.888, "grad_norm": 1.5014307278329708, "learning_rate": 3.023875598978419e-05, "loss": 0.2902, "step": 1180 }, { "epoch": 1.904, "grad_norm": 0.9785446712411401, "learning_rate": 2.9472014306893603e-05, "loss": 0.2624, "step": 1190 }, { "epoch": 1.92, "grad_norm": 1.1319999391176576, "learning_rate": 2.8711035421746367e-05, "loss": 0.2686, "step": 1200 }, { "epoch": 1.936, "grad_norm": 1.1056089612154159, "learning_rate": 2.795603296306708e-05, "loss": 0.2443, "step": 1210 }, { "epoch": 1.952, "grad_norm": 0.8376972874868366, "learning_rate": 2.7207218881825014e-05, "loss": 0.2752, "step": 1220 }, { "epoch": 1.968, "grad_norm": 1.34403239117131, "learning_rate": 2.6464803391733374e-05, "loss": 0.3108, "step": 1230 }, { "epoch": 1.984, "grad_norm": 1.0963825979352566, "learning_rate": 2.5728994910236304e-05, "loss": 0.2517, "step": 1240 }, { "epoch": 2.0, "grad_norm": 1.0786181456785247, "learning_rate": 2.500000000000001e-05, "loss": 0.2494, "step": 1250 }, { "epoch": 2.016, "grad_norm": 1.0920867485450798, "learning_rate": 2.4278023310924673e-05, "loss": 0.2276, "step": 1260 }, { "epoch": 2.032, "grad_norm": 0.9377652164830973, "learning_rate": 2.3563267522693415e-05, "loss": 0.2145, "step": 1270 }, { "epoch": 2.048, "grad_norm": 0.9386939202846237, "learning_rate": 2.2855933287874138e-05, "loss": 0.1993, "step": 1280 }, { "epoch": 2.064, "grad_norm": 1.08768521136953, "learning_rate": 2.215621917559062e-05, "loss": 0.2402, "step": 1290 }, { "epoch": 2.08, "grad_norm": 1.1126946097754096, "learning_rate": 2.1464321615778422e-05, "loss": 0.198, "step": 1300 }, { "epoch": 2.096, "grad_norm": 1.144472975970789, "learning_rate": 2.07804348440414e-05, "loss": 0.1847, "step": 1310 }, { "epoch": 2.112, "grad_norm": 1.0016887934690482, "learning_rate": 2.0104750847124075e-05, "loss": 0.2217, "step": 1320 }, { "epoch": 2.128, "grad_norm": 0.8561793503114183, "learning_rate": 1.9437459309015427e-05, "loss": 0.219, "step": 1330 }, { "epoch": 2.144, "grad_norm": 1.3046968600575892, "learning_rate": 1.8778747557699224e-05, "loss": 0.2219, "step": 1340 }, { "epoch": 2.16, "grad_norm": 1.2565120133034997, "learning_rate": 1.8128800512565513e-05, "loss": 0.2043, "step": 1350 }, { "epoch": 2.176, "grad_norm": 1.0340866475830222, "learning_rate": 1.7487800632498545e-05, "loss": 0.1848, "step": 1360 }, { "epoch": 2.192, "grad_norm": 1.2687371164495156, "learning_rate": 1.685592786465524e-05, "loss": 0.2063, "step": 1370 }, { "epoch": 2.208, "grad_norm": 4.410575395721385, "learning_rate": 1.6233359593948777e-05, "loss": 0.203, "step": 1380 }, { "epoch": 2.224, "grad_norm": 0.896414168502828, "learning_rate": 1.5620270593251635e-05, "loss": 0.1998, "step": 1390 }, { "epoch": 2.24, "grad_norm": 1.1594043127204923, "learning_rate": 1.5016832974331724e-05, "loss": 0.2029, "step": 1400 }, { "epoch": 2.2560000000000002, "grad_norm": 0.9694525860243126, "learning_rate": 1.4423216139535734e-05, "loss": 0.1829, "step": 1410 }, { "epoch": 2.2720000000000002, "grad_norm": 0.7781970327573788, "learning_rate": 1.3839586734232906e-05, "loss": 0.2131, "step": 1420 }, { "epoch": 2.288, "grad_norm": 0.9924970297588063, "learning_rate": 1.3266108600032929e-05, "loss": 0.2099, "step": 1430 }, { "epoch": 2.304, "grad_norm": 1.0619144372673828, "learning_rate": 1.2702942728790895e-05, "loss": 0.203, "step": 1440 }, { "epoch": 2.32, "grad_norm": 1.1568706885012912, "learning_rate": 1.2150247217412186e-05, "loss": 0.2154, "step": 1450 }, { "epoch": 2.336, "grad_norm": 1.0110226875096215, "learning_rate": 1.160817722347014e-05, "loss": 0.1994, "step": 1460 }, { "epoch": 2.352, "grad_norm": 1.2186938894345578, "learning_rate": 1.1076884921648834e-05, "loss": 0.2255, "step": 1470 }, { "epoch": 2.368, "grad_norm": 1.0321133111544947, "learning_rate": 1.0556519461023301e-05, "loss": 0.2216, "step": 1480 }, { "epoch": 2.384, "grad_norm": 1.154074546283637, "learning_rate": 1.0047226923189024e-05, "loss": 0.1979, "step": 1490 }, { "epoch": 2.4, "grad_norm": 1.1620581009185202, "learning_rate": 9.549150281252633e-06, "loss": 0.2081, "step": 1500 }, { "epoch": 2.416, "grad_norm": 1.0946269046711545, "learning_rate": 9.06242935969528e-06, "loss": 0.2061, "step": 1510 }, { "epoch": 2.432, "grad_norm": 1.3740666369662087, "learning_rate": 8.587200795119793e-06, "loss": 0.2066, "step": 1520 }, { "epoch": 2.448, "grad_norm": 0.9801350633493024, "learning_rate": 8.123597997892918e-06, "loss": 0.2144, "step": 1530 }, { "epoch": 2.464, "grad_norm": 0.9114459647246573, "learning_rate": 7.671751114693104e-06, "loss": 0.1854, "step": 1540 }, { "epoch": 2.48, "grad_norm": 1.020041105806084, "learning_rate": 7.2317869919746705e-06, "loss": 0.2095, "step": 1550 }, { "epoch": 2.496, "grad_norm": 0.8590881829290263, "learning_rate": 6.803829140358237e-06, "loss": 0.1815, "step": 1560 }, { "epoch": 2.512, "grad_norm": 0.8341531305601159, "learning_rate": 6.3879976999578154e-06, "loss": 0.2091, "step": 1570 }, { "epoch": 2.528, "grad_norm": 1.0395797814111882, "learning_rate": 5.98440940665399e-06, "loss": 0.2112, "step": 1580 }, { "epoch": 2.544, "grad_norm": 0.950529878731894, "learning_rate": 5.593177559322777e-06, "loss": 0.1965, "step": 1590 }, { "epoch": 2.56, "grad_norm": 1.274310745570913, "learning_rate": 5.214411988029355e-06, "loss": 0.209, "step": 1600 }, { "epoch": 2.576, "grad_norm": 1.2921735659034017, "learning_rate": 4.848219023195644e-06, "loss": 0.2227, "step": 1610 }, { "epoch": 2.592, "grad_norm": 1.0127209415412022, "learning_rate": 4.494701465750217e-06, "loss": 0.1875, "step": 1620 }, { "epoch": 2.608, "grad_norm": 1.068518391858498, "learning_rate": 4.153958558269189e-06, "loss": 0.1887, "step": 1630 }, { "epoch": 2.624, "grad_norm": 1.1843724236696493, "learning_rate": 3.826085957115888e-06, "loss": 0.1833, "step": 1640 }, { "epoch": 2.64, "grad_norm": 1.3793247614185251, "learning_rate": 3.511175705587433e-06, "loss": 0.2001, "step": 1650 }, { "epoch": 2.656, "grad_norm": 1.1786689211718118, "learning_rate": 3.2093162080754637e-06, "loss": 0.2341, "step": 1660 }, { "epoch": 2.672, "grad_norm": 1.1876075782168105, "learning_rate": 2.9205922052484958e-06, "loss": 0.2251, "step": 1670 }, { "epoch": 2.6879999999999997, "grad_norm": 1.1105275881217318, "learning_rate": 2.6450847502627884e-06, "loss": 0.1915, "step": 1680 }, { "epoch": 2.7039999999999997, "grad_norm": 1.2615653662572164, "learning_rate": 2.3828711860083674e-06, "loss": 0.2241, "step": 1690 }, { "epoch": 2.7199999999999998, "grad_norm": 1.2336183045142934, "learning_rate": 2.134025123396638e-06, "loss": 0.1899, "step": 1700 }, { "epoch": 2.7359999999999998, "grad_norm": 1.2995831378990832, "learning_rate": 1.8986164206957035e-06, "loss": 0.1965, "step": 1710 }, { "epoch": 2.752, "grad_norm": 1.0738462699292388, "learning_rate": 1.6767111639191202e-06, "loss": 0.2192, "step": 1720 }, { "epoch": 2.768, "grad_norm": 1.0132612835074417, "learning_rate": 1.4683716482736366e-06, "loss": 0.2056, "step": 1730 }, { "epoch": 2.784, "grad_norm": 1.0015783236034024, "learning_rate": 1.2736563606711382e-06, "loss": 0.1803, "step": 1740 }, { "epoch": 2.8, "grad_norm": 1.2842164055412226, "learning_rate": 1.0926199633097157e-06, "loss": 0.24, "step": 1750 }, { "epoch": 2.816, "grad_norm": 1.0294787685775484, "learning_rate": 9.253132783283547e-07, "loss": 0.1914, "step": 1760 }, { "epoch": 2.832, "grad_norm": 1.0434750413131526, "learning_rate": 7.717832735397335e-07, "loss": 0.1935, "step": 1770 }, { "epoch": 2.848, "grad_norm": 1.0184364682600762, "learning_rate": 6.3207304924498e-07, "loss": 0.1771, "step": 1780 }, { "epoch": 2.864, "grad_norm": 1.0821144737056598, "learning_rate": 5.062218261342122e-07, "loss": 0.1955, "step": 1790 }, { "epoch": 2.88, "grad_norm": 1.1413490215179685, "learning_rate": 3.9426493427611177e-07, "loss": 0.1867, "step": 1800 }, { "epoch": 2.896, "grad_norm": 1.1954337890031224, "learning_rate": 2.962338031997691e-07, "loss": 0.2164, "step": 1810 }, { "epoch": 2.912, "grad_norm": 1.24853908918597, "learning_rate": 2.1215595307154667e-07, "loss": 0.2154, "step": 1820 }, { "epoch": 2.928, "grad_norm": 1.1438437987577612, "learning_rate": 1.420549869693033e-07, "loss": 0.2006, "step": 1830 }, { "epoch": 2.944, "grad_norm": 1.106366481038807, "learning_rate": 8.595058425640013e-08, "loss": 0.1884, "step": 1840 }, { "epoch": 2.96, "grad_norm": 1.0427024327416816, "learning_rate": 4.385849505708084e-08, "loss": 0.1914, "step": 1850 }, { "epoch": 2.976, "grad_norm": 1.379711472754783, "learning_rate": 1.5790535835003008e-08, "loss": 0.2049, "step": 1860 }, { "epoch": 2.992, "grad_norm": 1.095540989141702, "learning_rate": 1.7545860759693445e-09, "loss": 0.2074, "step": 1870 }, { "epoch": 3.0, "step": 1875, "total_flos": 302283383111680.0, "train_loss": 0.3027080503463745, "train_runtime": 17327.47, "train_samples_per_second": 0.866, "train_steps_per_second": 0.108 } ], "logging_steps": 10, "max_steps": 1875, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 302283383111680.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }