{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1875, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 1.5132097727696663, "learning_rate": 9.999298177883903e-05, "loss": 1.6713, "step": 10 }, { "epoch": 0.032, "grad_norm": 1.0288849642854194, "learning_rate": 9.997192908557323e-05, "loss": 0.9562, "step": 20 }, { "epoch": 0.048, "grad_norm": 2.628046163977058, "learning_rate": 9.993684783030088e-05, "loss": 0.7998, "step": 30 }, { "epoch": 0.064, "grad_norm": 1.0977335920429718, "learning_rate": 9.988774786134234e-05, "loss": 0.7019, "step": 40 }, { "epoch": 0.08, "grad_norm": 1.143511217142594, "learning_rate": 9.982464296247522e-05, "loss": 0.6965, "step": 50 }, { "epoch": 0.096, "grad_norm": 1.1886880844299708, "learning_rate": 9.974755084906502e-05, "loss": 0.6417, "step": 60 }, { "epoch": 0.112, "grad_norm": 1.3088809099781331, "learning_rate": 9.965649316309178e-05, "loss": 0.6388, "step": 70 }, { "epoch": 0.128, "grad_norm": 2.0299623866899035, "learning_rate": 9.955149546707465e-05, "loss": 0.6665, "step": 80 }, { "epoch": 0.144, "grad_norm": 1.5251129511711958, "learning_rate": 9.94325872368957e-05, "loss": 0.6014, "step": 90 }, { "epoch": 0.16, "grad_norm": 1.307242452061001, "learning_rate": 9.929980185352526e-05, "loss": 0.5971, "step": 100 }, { "epoch": 0.176, "grad_norm": 1.0674735946460283, "learning_rate": 9.915317659365077e-05, "loss": 0.6206, "step": 110 }, { "epoch": 0.192, "grad_norm": 1.3671834842056045, "learning_rate": 9.899275261921234e-05, "loss": 0.5597, "step": 120 }, { "epoch": 0.208, "grad_norm": 1.3041158193336297, "learning_rate": 9.881857496584726e-05, "loss": 0.5774, "step": 130 }, { "epoch": 0.224, "grad_norm": 1.2739151782924372, "learning_rate": 9.863069253024719e-05, "loss": 0.5656, "step": 140 }, { "epoch": 0.24, "grad_norm": 1.1009566652519438, "learning_rate": 9.842915805643155e-05, "loss": 0.5724, "step": 150 }, { "epoch": 0.256, "grad_norm": 1.2004741768824123, "learning_rate": 9.821402812094073e-05, "loss": 0.5796, "step": 160 }, { "epoch": 0.272, "grad_norm": 1.5111413381395908, "learning_rate": 9.798536311695334e-05, "loss": 0.5556, "step": 170 }, { "epoch": 0.288, "grad_norm": 1.0331976579035735, "learning_rate": 9.774322723733216e-05, "loss": 0.5451, "step": 180 }, { "epoch": 0.304, "grad_norm": 1.1745428366022237, "learning_rate": 9.748768845660334e-05, "loss": 0.5605, "step": 190 }, { "epoch": 0.32, "grad_norm": 1.067749681555933, "learning_rate": 9.721881851187406e-05, "loss": 0.5644, "step": 200 }, { "epoch": 0.336, "grad_norm": 1.1773627454789788, "learning_rate": 9.693669288269372e-05, "loss": 0.5658, "step": 210 }, { "epoch": 0.352, "grad_norm": 1.4049555802409854, "learning_rate": 9.664139076986473e-05, "loss": 0.5483, "step": 220 }, { "epoch": 0.368, "grad_norm": 1.436137944296166, "learning_rate": 9.63329950732086e-05, "loss": 0.5376, "step": 230 }, { "epoch": 0.384, "grad_norm": 1.2686595633739197, "learning_rate": 9.601159236829352e-05, "loss": 0.5402, "step": 240 }, { "epoch": 0.4, "grad_norm": 1.4062419829256017, "learning_rate": 9.567727288213005e-05, "loss": 0.5552, "step": 250 }, { "epoch": 0.416, "grad_norm": 0.9610102580305002, "learning_rate": 9.533013046784189e-05, "loss": 0.5118, "step": 260 }, { "epoch": 0.432, "grad_norm": 1.2008075396073077, "learning_rate": 9.497026257831855e-05, "loss": 0.5479, "step": 270 }, { "epoch": 0.448, "grad_norm": 1.116478910009909, 
"learning_rate": 9.459777023885755e-05, "loss": 0.4773, "step": 280 }, { "epoch": 0.464, "grad_norm": 1.2287813088079407, "learning_rate": 9.421275801880362e-05, "loss": 0.5149, "step": 290 }, { "epoch": 0.48, "grad_norm": 1.3153026196754682, "learning_rate": 9.381533400219318e-05, "loss": 0.5209, "step": 300 }, { "epoch": 0.496, "grad_norm": 1.6158648577050707, "learning_rate": 9.340560975741197e-05, "loss": 0.5065, "step": 310 }, { "epoch": 0.512, "grad_norm": 1.1601862022387133, "learning_rate": 9.298370030587456e-05, "loss": 0.515, "step": 320 }, { "epoch": 0.528, "grad_norm": 1.2591523341167126, "learning_rate": 9.254972408973461e-05, "loss": 0.5375, "step": 330 }, { "epoch": 0.544, "grad_norm": 1.0361296810068354, "learning_rate": 9.210380293863462e-05, "loss": 0.5085, "step": 340 }, { "epoch": 0.56, "grad_norm": 1.0654093627590127, "learning_rate": 9.164606203550497e-05, "loss": 0.5137, "step": 350 }, { "epoch": 0.576, "grad_norm": 1.2697508248750302, "learning_rate": 9.117662988142138e-05, "loss": 0.4906, "step": 360 }, { "epoch": 0.592, "grad_norm": 1.2602467399423862, "learning_rate": 9.069563825953092e-05, "loss": 0.4966, "step": 370 }, { "epoch": 0.608, "grad_norm": 2.0044411177134918, "learning_rate": 9.020322219805674e-05, "loss": 0.5063, "step": 380 }, { "epoch": 0.624, "grad_norm": 1.148187026295081, "learning_rate": 8.969951993239177e-05, "loss": 0.4864, "step": 390 }, { "epoch": 0.64, "grad_norm": 1.0529856886801567, "learning_rate": 8.9184672866292e-05, "loss": 0.4632, "step": 400 }, { "epoch": 0.656, "grad_norm": 1.1830984654167522, "learning_rate": 8.865882553218037e-05, "loss": 0.5097, "step": 410 }, { "epoch": 0.672, "grad_norm": 1.1495869848318996, "learning_rate": 8.81221255505724e-05, "loss": 0.4795, "step": 420 }, { "epoch": 0.688, "grad_norm": 1.1235900402260404, "learning_rate": 8.757472358863481e-05, "loss": 0.5006, "step": 430 }, { "epoch": 0.704, "grad_norm": 1.2883600054948787, "learning_rate": 8.701677331788891e-05, "loss": 0.4822, "step": 440 }, { "epoch": 0.72, "grad_norm": 1.0725324329031838, "learning_rate": 8.644843137107059e-05, "loss": 0.4856, "step": 450 }, { "epoch": 0.736, "grad_norm": 2.9167440508075955, "learning_rate": 8.586985729815894e-05, "loss": 0.4648, "step": 460 }, { "epoch": 0.752, "grad_norm": 0.9522185207229502, "learning_rate": 8.528121352158604e-05, "loss": 0.488, "step": 470 }, { "epoch": 0.768, "grad_norm": 1.5907707346034905, "learning_rate": 8.468266529064025e-05, "loss": 0.5039, "step": 480 }, { "epoch": 0.784, "grad_norm": 1.1370580239916313, "learning_rate": 8.4074380635076e-05, "loss": 0.4737, "step": 490 }, { "epoch": 0.8, "grad_norm": 0.9782419363730104, "learning_rate": 8.345653031794292e-05, "loss": 0.5296, "step": 500 }, { "epoch": 0.816, "grad_norm": 1.1919631254125505, "learning_rate": 8.282928778764783e-05, "loss": 0.4688, "step": 510 }, { "epoch": 0.832, "grad_norm": 1.2994447479210165, "learning_rate": 8.21928291292627e-05, "loss": 0.4675, "step": 520 }, { "epoch": 0.848, "grad_norm": 0.9689627531529147, "learning_rate": 8.154733301509248e-05, "loss": 0.4408, "step": 530 }, { "epoch": 0.864, "grad_norm": 1.1649720718750445, "learning_rate": 8.089298065451672e-05, "loss": 0.4608, "step": 540 }, { "epoch": 0.88, "grad_norm": 1.3438713492048824, "learning_rate": 8.022995574311876e-05, "loss": 0.4476, "step": 550 }, { "epoch": 0.896, "grad_norm": 1.0737706296113363, "learning_rate": 7.95584444111171e-05, "loss": 0.5027, "step": 560 }, { "epoch": 0.912, "grad_norm": 1.3469637742497171, "learning_rate": 
7.887863517111338e-05, "loss": 0.4929, "step": 570 }, { "epoch": 0.928, "grad_norm": 1.1251117260635928, "learning_rate": 7.819071886517134e-05, "loss": 0.4739, "step": 580 }, { "epoch": 0.944, "grad_norm": 1.0579726403654415, "learning_rate": 7.7494888611242e-05, "loss": 0.4529, "step": 590 }, { "epoch": 0.96, "grad_norm": 1.0498899044694738, "learning_rate": 7.679133974894983e-05, "loss": 0.4619, "step": 600 }, { "epoch": 0.976, "grad_norm": 1.2353618604203835, "learning_rate": 7.60802697847554e-05, "loss": 0.462, "step": 610 }, { "epoch": 0.992, "grad_norm": 1.4618575476184537, "learning_rate": 7.536187833650947e-05, "loss": 0.4478, "step": 620 }, { "epoch": 1.008, "grad_norm": 0.9313196650505406, "learning_rate": 7.463636707741458e-05, "loss": 0.4304, "step": 630 }, { "epoch": 1.024, "grad_norm": 1.1239041555920188, "learning_rate": 7.390393967940962e-05, "loss": 0.4138, "step": 640 }, { "epoch": 1.04, "grad_norm": 1.8736342455754906, "learning_rate": 7.316480175599309e-05, "loss": 0.3909, "step": 650 }, { "epoch": 1.056, "grad_norm": 1.3075877434881118, "learning_rate": 7.241916080450163e-05, "loss": 0.3873, "step": 660 }, { "epoch": 1.072, "grad_norm": 1.1366815390360774, "learning_rate": 7.166722614785937e-05, "loss": 0.3744, "step": 670 }, { "epoch": 1.088, "grad_norm": 1.1482160099998533, "learning_rate": 7.090920887581506e-05, "loss": 0.3641, "step": 680 }, { "epoch": 1.104, "grad_norm": 1.4903122013208903, "learning_rate": 7.014532178568314e-05, "loss": 0.3708, "step": 690 }, { "epoch": 1.12, "grad_norm": 1.2283466862099195, "learning_rate": 6.937577932260515e-05, "loss": 0.4147, "step": 700 }, { "epoch": 1.1360000000000001, "grad_norm": 1.044360506141548, "learning_rate": 6.860079751934908e-05, "loss": 0.3698, "step": 710 }, { "epoch": 1.152, "grad_norm": 1.9659986049038847, "learning_rate": 6.782059393566253e-05, "loss": 0.3768, "step": 720 }, { "epoch": 1.168, "grad_norm": 1.1559700548705432, "learning_rate": 6.70353875971976e-05, "loss": 0.3713, "step": 730 }, { "epoch": 1.184, "grad_norm": 1.3638599004402776, "learning_rate": 6.624539893402382e-05, "loss": 0.376, "step": 740 }, { "epoch": 1.2, "grad_norm": 1.2823520816693201, "learning_rate": 6.545084971874738e-05, "loss": 0.3762, "step": 750 }, { "epoch": 1.216, "grad_norm": 1.1889832443284072, "learning_rate": 6.465196300425287e-05, "loss": 0.3891, "step": 760 }, { "epoch": 1.232, "grad_norm": 1.128194765929284, "learning_rate": 6.384896306108612e-05, "loss": 0.3772, "step": 770 }, { "epoch": 1.248, "grad_norm": 1.4491197750693638, "learning_rate": 6.304207531449486e-05, "loss": 0.3678, "step": 780 }, { "epoch": 1.264, "grad_norm": 1.5686675701147237, "learning_rate": 6.223152628114537e-05, "loss": 0.3648, "step": 790 }, { "epoch": 1.28, "grad_norm": 1.0902715474500886, "learning_rate": 6.141754350553279e-05, "loss": 0.3909, "step": 800 }, { "epoch": 1.296, "grad_norm": 1.6337999430835446, "learning_rate": 6.0600355496102745e-05, "loss": 0.383, "step": 810 }, { "epoch": 1.312, "grad_norm": 1.321659745725057, "learning_rate": 5.9780191661102415e-05, "loss": 0.3802, "step": 820 }, { "epoch": 1.328, "grad_norm": 1.6867441595334431, "learning_rate": 5.8957282244179124e-05, "loss": 0.423, "step": 830 }, { "epoch": 1.3439999999999999, "grad_norm": 1.2441664958096805, "learning_rate": 5.813185825974419e-05, "loss": 0.3641, "step": 840 }, { "epoch": 1.3599999999999999, "grad_norm": 1.6422801866133996, "learning_rate": 5.730415142812059e-05, "loss": 0.3519, "step": 850 }, { "epoch": 1.376, "grad_norm": 1.7154734289625684, 
"learning_rate": 5.6474394110492344e-05, "loss": 0.3732, "step": 860 }, { "epoch": 1.392, "grad_norm": 1.3529634625288804, "learning_rate": 5.564281924367408e-05, "loss": 0.3954, "step": 870 }, { "epoch": 1.408, "grad_norm": 1.4693831158149995, "learning_rate": 5.480966027471889e-05, "loss": 0.3669, "step": 880 }, { "epoch": 1.424, "grad_norm": 1.5051265025599196, "learning_rate": 5.3975151095382995e-05, "loss": 0.3994, "step": 890 }, { "epoch": 1.44, "grad_norm": 1.2893604881333178, "learning_rate": 5.313952597646568e-05, "loss": 0.3904, "step": 900 }, { "epoch": 1.456, "grad_norm": 1.5665270237662434, "learning_rate": 5.230301950204262e-05, "loss": 0.3821, "step": 910 }, { "epoch": 1.472, "grad_norm": 1.1847281856196625, "learning_rate": 5.1465866503611426e-05, "loss": 0.3713, "step": 920 }, { "epoch": 1.488, "grad_norm": 1.6027343131881702, "learning_rate": 5.062830199416764e-05, "loss": 0.3785, "step": 930 }, { "epoch": 1.504, "grad_norm": 1.1745245908384976, "learning_rate": 4.979056110222981e-05, "loss": 0.3851, "step": 940 }, { "epoch": 1.52, "grad_norm": 1.2252730329400867, "learning_rate": 4.895287900583216e-05, "loss": 0.3991, "step": 950 }, { "epoch": 1.536, "grad_norm": 1.9189789923421308, "learning_rate": 4.811549086650327e-05, "loss": 0.3726, "step": 960 }, { "epoch": 1.552, "grad_norm": 1.9319344659060065, "learning_rate": 4.7278631763249554e-05, "loss": 0.3703, "step": 970 }, { "epoch": 1.568, "grad_norm": 1.3155775136973578, "learning_rate": 4.6442536626561675e-05, "loss": 0.3462, "step": 980 }, { "epoch": 1.584, "grad_norm": 1.3059644425743502, "learning_rate": 4.560744017246284e-05, "loss": 0.3486, "step": 990 }, { "epoch": 1.6, "grad_norm": 1.1918137712202796, "learning_rate": 4.477357683661734e-05, "loss": 0.3384, "step": 1000 }, { "epoch": 1.616, "grad_norm": 1.3051153832513638, "learning_rate": 4.394118070851749e-05, "loss": 0.3976, "step": 1010 }, { "epoch": 1.6320000000000001, "grad_norm": 2.1135344091319266, "learning_rate": 4.31104854657681e-05, "loss": 0.3607, "step": 1020 }, { "epoch": 1.6480000000000001, "grad_norm": 1.3948892979651148, "learning_rate": 4.228172430848644e-05, "loss": 0.371, "step": 1030 }, { "epoch": 1.6640000000000001, "grad_norm": 1.3934093132103698, "learning_rate": 4.1455129893836174e-05, "loss": 0.3627, "step": 1040 }, { "epoch": 1.6800000000000002, "grad_norm": 2.099040035068272, "learning_rate": 4.063093427071376e-05, "loss": 0.3725, "step": 1050 }, { "epoch": 1.696, "grad_norm": 1.1171081747364493, "learning_rate": 3.9809368814605766e-05, "loss": 0.3466, "step": 1060 }, { "epoch": 1.712, "grad_norm": 1.2557303206572503, "learning_rate": 3.899066416263493e-05, "loss": 0.3629, "step": 1070 }, { "epoch": 1.728, "grad_norm": 1.2612524984380895, "learning_rate": 3.817505014881378e-05, "loss": 0.3941, "step": 1080 }, { "epoch": 1.744, "grad_norm": 1.4536732919602855, "learning_rate": 3.736275573952354e-05, "loss": 0.3362, "step": 1090 }, { "epoch": 1.76, "grad_norm": 1.5347325608557592, "learning_rate": 3.655400896923672e-05, "loss": 0.3197, "step": 1100 }, { "epoch": 1.776, "grad_norm": 1.5306810075691544, "learning_rate": 3.5749036876501194e-05, "loss": 0.3447, "step": 1110 }, { "epoch": 1.792, "grad_norm": 1.2300188758500825, "learning_rate": 3.494806544020398e-05, "loss": 0.3535, "step": 1120 }, { "epoch": 1.808, "grad_norm": 1.4275688891160825, "learning_rate": 3.4151319516132416e-05, "loss": 0.3447, "step": 1130 }, { "epoch": 1.8239999999999998, "grad_norm": 1.397634927032295, "learning_rate": 3.335902277385067e-05, "loss": 0.338, 
"step": 1140 }, { "epoch": 1.8399999999999999, "grad_norm": 1.8570000554042922, "learning_rate": 3.257139763390925e-05, "loss": 0.3915, "step": 1150 }, { "epoch": 1.8559999999999999, "grad_norm": 1.0694824076607796, "learning_rate": 3.178866520540509e-05, "loss": 0.332, "step": 1160 }, { "epoch": 1.8719999999999999, "grad_norm": 1.9227266591369816, "learning_rate": 3.101104522390995e-05, "loss": 0.3734, "step": 1170 }, { "epoch": 1.888, "grad_norm": 1.4206748536497713, "learning_rate": 3.023875598978419e-05, "loss": 0.3409, "step": 1180 }, { "epoch": 1.904, "grad_norm": 1.4359353896901201, "learning_rate": 2.9472014306893603e-05, "loss": 0.3333, "step": 1190 }, { "epoch": 1.92, "grad_norm": 1.2517700720828175, "learning_rate": 2.8711035421746367e-05, "loss": 0.369, "step": 1200 }, { "epoch": 1.936, "grad_norm": 1.3801398040671706, "learning_rate": 2.795603296306708e-05, "loss": 0.3449, "step": 1210 }, { "epoch": 1.952, "grad_norm": 1.8786886188992349, "learning_rate": 2.7207218881825014e-05, "loss": 0.3662, "step": 1220 }, { "epoch": 1.968, "grad_norm": 1.2874259666120602, "learning_rate": 2.6464803391733374e-05, "loss": 0.3773, "step": 1230 }, { "epoch": 1.984, "grad_norm": 1.1788438157215508, "learning_rate": 2.5728994910236304e-05, "loss": 0.3245, "step": 1240 }, { "epoch": 2.0, "grad_norm": 1.5631045640027712, "learning_rate": 2.500000000000001e-05, "loss": 0.3412, "step": 1250 }, { "epoch": 2.016, "grad_norm": 1.4435057320180134, "learning_rate": 2.4278023310924673e-05, "loss": 0.3027, "step": 1260 }, { "epoch": 2.032, "grad_norm": 1.690073924846414, "learning_rate": 2.3563267522693415e-05, "loss": 0.2745, "step": 1270 }, { "epoch": 2.048, "grad_norm": 1.571802508559373, "learning_rate": 2.2855933287874138e-05, "loss": 0.2686, "step": 1280 }, { "epoch": 2.064, "grad_norm": 1.2829706151133036, "learning_rate": 2.215621917559062e-05, "loss": 0.2628, "step": 1290 }, { "epoch": 2.08, "grad_norm": 1.5440500775973827, "learning_rate": 2.1464321615778422e-05, "loss": 0.274, "step": 1300 }, { "epoch": 2.096, "grad_norm": 2.0539984960007627, "learning_rate": 2.07804348440414e-05, "loss": 0.2711, "step": 1310 }, { "epoch": 2.112, "grad_norm": 1.6138729128750948, "learning_rate": 2.0104750847124075e-05, "loss": 0.2662, "step": 1320 }, { "epoch": 2.128, "grad_norm": 1.4560623181879653, "learning_rate": 1.9437459309015427e-05, "loss": 0.2722, "step": 1330 }, { "epoch": 2.144, "grad_norm": 1.4659611175991962, "learning_rate": 1.8778747557699224e-05, "loss": 0.2965, "step": 1340 }, { "epoch": 2.16, "grad_norm": 1.375382006257737, "learning_rate": 1.8128800512565513e-05, "loss": 0.2789, "step": 1350 }, { "epoch": 2.176, "grad_norm": 1.599179489252925, "learning_rate": 1.7487800632498545e-05, "loss": 0.2727, "step": 1360 }, { "epoch": 2.192, "grad_norm": 1.517535594054785, "learning_rate": 1.685592786465524e-05, "loss": 0.2579, "step": 1370 }, { "epoch": 2.208, "grad_norm": 2.1592276480492414, "learning_rate": 1.6233359593948777e-05, "loss": 0.2639, "step": 1380 }, { "epoch": 2.224, "grad_norm": 1.7544518130502498, "learning_rate": 1.5620270593251635e-05, "loss": 0.2909, "step": 1390 }, { "epoch": 2.24, "grad_norm": 1.44885168688834, "learning_rate": 1.5016832974331724e-05, "loss": 0.2675, "step": 1400 }, { "epoch": 2.2560000000000002, "grad_norm": 1.6827124662772142, "learning_rate": 1.4423216139535734e-05, "loss": 0.2501, "step": 1410 }, { "epoch": 2.2720000000000002, "grad_norm": 1.8488651881141411, "learning_rate": 1.3839586734232906e-05, "loss": 0.28, "step": 1420 }, { "epoch": 2.288, 
"grad_norm": 1.8044696205561654, "learning_rate": 1.3266108600032929e-05, "loss": 0.2946, "step": 1430 }, { "epoch": 2.304, "grad_norm": 1.5680732030217428, "learning_rate": 1.2702942728790895e-05, "loss": 0.2801, "step": 1440 }, { "epoch": 2.32, "grad_norm": 1.8180785380951152, "learning_rate": 1.2150247217412186e-05, "loss": 0.2686, "step": 1450 }, { "epoch": 2.336, "grad_norm": 1.4756615703317884, "learning_rate": 1.160817722347014e-05, "loss": 0.2623, "step": 1460 }, { "epoch": 2.352, "grad_norm": 1.4951599689749675, "learning_rate": 1.1076884921648834e-05, "loss": 0.2865, "step": 1470 }, { "epoch": 2.368, "grad_norm": 1.8073936077115729, "learning_rate": 1.0556519461023301e-05, "loss": 0.2677, "step": 1480 }, { "epoch": 2.384, "grad_norm": 1.6787640384880986, "learning_rate": 1.0047226923189024e-05, "loss": 0.2821, "step": 1490 }, { "epoch": 2.4, "grad_norm": 1.7736081937737844, "learning_rate": 9.549150281252633e-06, "loss": 0.2628, "step": 1500 }, { "epoch": 2.416, "grad_norm": 1.8163690827077064, "learning_rate": 9.06242935969528e-06, "loss": 0.2648, "step": 1510 }, { "epoch": 2.432, "grad_norm": 1.914925459991879, "learning_rate": 8.587200795119793e-06, "loss": 0.2718, "step": 1520 }, { "epoch": 2.448, "grad_norm": 1.557572030989793, "learning_rate": 8.123597997892918e-06, "loss": 0.2779, "step": 1530 }, { "epoch": 2.464, "grad_norm": 2.4189475486471412, "learning_rate": 7.671751114693104e-06, "loss": 0.2522, "step": 1540 }, { "epoch": 2.48, "grad_norm": 1.5979978703748492, "learning_rate": 7.2317869919746705e-06, "loss": 0.2668, "step": 1550 }, { "epoch": 2.496, "grad_norm": 1.5867221943137244, "learning_rate": 6.803829140358237e-06, "loss": 0.2582, "step": 1560 }, { "epoch": 2.512, "grad_norm": 2.0088219455242866, "learning_rate": 6.3879976999578154e-06, "loss": 0.2509, "step": 1570 }, { "epoch": 2.528, "grad_norm": 1.7960925687722837, "learning_rate": 5.98440940665399e-06, "loss": 0.2743, "step": 1580 }, { "epoch": 2.544, "grad_norm": 1.5572986827037758, "learning_rate": 5.593177559322777e-06, "loss": 0.2833, "step": 1590 }, { "epoch": 2.56, "grad_norm": 1.370850993490327, "learning_rate": 5.214411988029355e-06, "loss": 0.2693, "step": 1600 }, { "epoch": 2.576, "grad_norm": 1.3820976570272918, "learning_rate": 4.848219023195644e-06, "loss": 0.2607, "step": 1610 }, { "epoch": 2.592, "grad_norm": 1.7956675550522907, "learning_rate": 4.494701465750217e-06, "loss": 0.2632, "step": 1620 }, { "epoch": 2.608, "grad_norm": 1.657852197945445, "learning_rate": 4.153958558269189e-06, "loss": 0.2573, "step": 1630 }, { "epoch": 2.624, "grad_norm": 1.6754996668537725, "learning_rate": 3.826085957115888e-06, "loss": 0.2411, "step": 1640 }, { "epoch": 2.64, "grad_norm": 1.3160903872279537, "learning_rate": 3.511175705587433e-06, "loss": 0.2601, "step": 1650 }, { "epoch": 2.656, "grad_norm": 1.4417765456611342, "learning_rate": 3.2093162080754637e-06, "loss": 0.2832, "step": 1660 }, { "epoch": 2.672, "grad_norm": 1.5418721821515773, "learning_rate": 2.9205922052484958e-06, "loss": 0.2725, "step": 1670 }, { "epoch": 2.6879999999999997, "grad_norm": 1.3747599439593585, "learning_rate": 2.6450847502627884e-06, "loss": 0.2654, "step": 1680 }, { "epoch": 2.7039999999999997, "grad_norm": 1.528133223907777, "learning_rate": 2.3828711860083674e-06, "loss": 0.2784, "step": 1690 }, { "epoch": 2.7199999999999998, "grad_norm": 2.3538184370373534, "learning_rate": 2.134025123396638e-06, "loss": 0.2758, "step": 1700 }, { "epoch": 2.7359999999999998, "grad_norm": 1.6198355766213741, "learning_rate": 
1.8986164206957035e-06, "loss": 0.2533, "step": 1710 }, { "epoch": 2.752, "grad_norm": 1.8976100233670337, "learning_rate": 1.6767111639191202e-06, "loss": 0.268, "step": 1720 }, { "epoch": 2.768, "grad_norm": 1.7822120577723535, "learning_rate": 1.4683716482736366e-06, "loss": 0.28, "step": 1730 }, { "epoch": 2.784, "grad_norm": 2.2850018497482183, "learning_rate": 1.2736563606711382e-06, "loss": 0.2797, "step": 1740 }, { "epoch": 2.8, "grad_norm": 1.79121827664844, "learning_rate": 1.0926199633097157e-06, "loss": 0.2831, "step": 1750 }, { "epoch": 2.816, "grad_norm": 1.7925072153699964, "learning_rate": 9.253132783283547e-07, "loss": 0.2558, "step": 1760 }, { "epoch": 2.832, "grad_norm": 1.2894709082534288, "learning_rate": 7.717832735397335e-07, "loss": 0.2635, "step": 1770 }, { "epoch": 2.848, "grad_norm": 1.4531115959909715, "learning_rate": 6.3207304924498e-07, "loss": 0.2639, "step": 1780 }, { "epoch": 2.864, "grad_norm": 1.9570336366099832, "learning_rate": 5.062218261342122e-07, "loss": 0.2532, "step": 1790 }, { "epoch": 2.88, "grad_norm": 1.3421965286430086, "learning_rate": 3.9426493427611177e-07, "loss": 0.2327, "step": 1800 }, { "epoch": 2.896, "grad_norm": 2.3252685172502465, "learning_rate": 2.962338031997691e-07, "loss": 0.2621, "step": 1810 }, { "epoch": 2.912, "grad_norm": 1.7539361758237508, "learning_rate": 2.1215595307154667e-07, "loss": 0.256, "step": 1820 }, { "epoch": 2.928, "grad_norm": 1.417074579458341, "learning_rate": 1.420549869693033e-07, "loss": 0.2545, "step": 1830 }, { "epoch": 2.944, "grad_norm": 1.7016791883476743, "learning_rate": 8.595058425640013e-08, "loss": 0.2596, "step": 1840 }, { "epoch": 2.96, "grad_norm": 1.6335011497462837, "learning_rate": 4.385849505708084e-08, "loss": 0.2584, "step": 1850 }, { "epoch": 2.976, "grad_norm": 1.7816152560274576, "learning_rate": 1.5790535835003008e-08, "loss": 0.2752, "step": 1860 }, { "epoch": 2.992, "grad_norm": 1.579627845758394, "learning_rate": 1.7545860759693445e-09, "loss": 0.2778, "step": 1870 }, { "epoch": 3.0, "step": 1875, "total_flos": 299543991877632.0, "train_loss": 0.3980048195521037, "train_runtime": 15877.4798, "train_samples_per_second": 0.945, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 1875, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 299543991877632.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }
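
Note (not part of the original trainer state): the JSON above is a Hugging Face Trainer state log for a 3-epoch, 1875-step run; its "log_history" records the training loss falling from about 1.67 at step 10 to roughly 0.26-0.28 near the end, while "learning_rate" follows a decaying schedule from ~1e-4 toward ~0. The snippet below is a minimal sketch of how such a file could be inspected, assuming it is saved as trainer_state.json; the file name and plotting choices are illustrative assumptions, not something defined by the log itself.

# Minimal sketch (assumed usage): load this trainer state and plot loss and
# learning rate against step. "log_history", "loss", "step", and
# "learning_rate" are the keys present in the JSON above; the path below is
# an assumption.
import json
import matplotlib.pyplot as plt

with open("trainer_state.json") as f:  # assumed location of the JSON above
    state = json.load(f)

# Keep only per-step logging entries; the final entry carries run summaries
# (train_runtime, total_flos, train_loss) and has no "loss" key.
logs = [e for e in state["log_history"] if "loss" in e]

steps = [e["step"] for e in logs]
loss = [e["loss"] for e in logs]
lr = [e["learning_rate"] for e in logs]

fig, ax1 = plt.subplots()
ax1.plot(steps, loss, label="training loss")
ax1.set_xlabel("step")
ax1.set_ylabel("loss")

ax2 = ax1.twinx()  # second y-axis for the learning-rate schedule
ax2.plot(steps, lr, color="tab:orange", label="learning rate")
ax2.set_ylabel("learning rate")

fig.tight_layout()
plt.show()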