diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20081 @@ +{ + "best_metric": 0.934053361415863, + "best_model_checkpoint": "data/output/20240919-195438_qwen2.5-14_full_v4-sft-1e-5/checkpoint-28000", + "epoch": 1.9475551227655283, + "eval_steps": 500, + "global_step": 28000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006955554009876886, + "grad_norm": 71.94125186212713, + "learning_rate": 2.3180343069077426e-08, + "loss": 4.1325, + "step": 10 + }, + { + "epoch": 0.0013911108019753773, + "grad_norm": 74.8369077582018, + "learning_rate": 4.636068613815485e-08, + "loss": 3.5941, + "step": 20 + }, + { + "epoch": 0.002086666202963066, + "grad_norm": 77.60826795700011, + "learning_rate": 6.954102920723227e-08, + "loss": 3.3158, + "step": 30 + }, + { + "epoch": 0.0027822216039507545, + "grad_norm": 114.1277923397916, + "learning_rate": 9.27213722763097e-08, + "loss": 3.4595, + "step": 40 + }, + { + "epoch": 0.0034777770049384434, + "grad_norm": 109.95661724658716, + "learning_rate": 1.1590171534538712e-07, + "loss": 3.2352, + "step": 50 + }, + { + "epoch": 0.004173332405926132, + "grad_norm": 193.99408630609778, + "learning_rate": 1.3908205841446455e-07, + "loss": 3.5715, + "step": 60 + }, + { + "epoch": 0.004868887806913821, + "grad_norm": 67.42578442185231, + "learning_rate": 1.6226240148354197e-07, + "loss": 3.136, + "step": 70 + }, + { + "epoch": 0.005564443207901509, + "grad_norm": 53.40944914408296, + "learning_rate": 1.854427445526194e-07, + "loss": 2.4554, + "step": 80 + }, + { + "epoch": 0.006259998608889198, + "grad_norm": 222.64975047017927, + "learning_rate": 2.0862308762169682e-07, + "loss": 2.7649, + "step": 90 + }, + { + "epoch": 0.006955554009876887, + "grad_norm": 60.46022152369629, + "learning_rate": 2.3180343069077424e-07, + "loss": 2.4185, + "step": 100 + }, + { + "epoch": 0.0076511094108645756, + "grad_norm": 39.12465391275975, + "learning_rate": 2.5498377375985166e-07, + "loss": 1.8801, + "step": 110 + }, + { + "epoch": 0.008346664811852264, + "grad_norm": 27.76922153377231, + "learning_rate": 2.781641168289291e-07, + "loss": 1.8454, + "step": 120 + }, + { + "epoch": 0.009042220212839952, + "grad_norm": 14.286226916320414, + "learning_rate": 3.0134445989800654e-07, + "loss": 1.428, + "step": 130 + }, + { + "epoch": 0.009737775613827642, + "grad_norm": 6.4635030962641835, + "learning_rate": 3.2452480296708393e-07, + "loss": 1.4213, + "step": 140 + }, + { + "epoch": 0.01043333101481533, + "grad_norm": 20.61070279830427, + "learning_rate": 3.477051460361613e-07, + "loss": 1.4122, + "step": 150 + }, + { + "epoch": 0.011128886415803018, + "grad_norm": 34.1144019089677, + "learning_rate": 3.708854891052388e-07, + "loss": 1.4272, + "step": 160 + }, + { + "epoch": 0.011824441816790708, + "grad_norm": 90.88941682430695, + "learning_rate": 3.940658321743162e-07, + "loss": 1.3438, + "step": 170 + }, + { + "epoch": 0.012519997217778396, + "grad_norm": 4.693911515204152, + "learning_rate": 4.1724617524339365e-07, + "loss": 1.285, + "step": 180 + }, + { + "epoch": 0.013215552618766085, + "grad_norm": 4.132441857808432, + "learning_rate": 4.4042651831247104e-07, + "loss": 1.3482, + "step": 190 + }, + { + "epoch": 0.013911108019753773, + "grad_norm": 5.864181785614484, + "learning_rate": 4.636068613815485e-07, + "loss": 1.3356, + "step": 200 + }, + { + "epoch": 0.014606663420741461, + "grad_norm": 15.27692293522563, + "learning_rate": 4.867872044506259e-07, + "loss": 1.3874, + "step": 210 + }, + { + "epoch": 0.015302218821729151, + "grad_norm": 25.138127162865405, + "learning_rate": 5.099675475197033e-07, + "loss": 1.4074, + "step": 220 + }, + { + "epoch": 0.01599777422271684, + "grad_norm": 3.831193162960591, + "learning_rate": 5.331478905887808e-07, + "loss": 1.2638, + "step": 230 + }, + { + "epoch": 0.01669332962370453, + "grad_norm": 12.464017608121788, + "learning_rate": 5.563282336578582e-07, + "loss": 1.2784, + "step": 240 + }, + { + "epoch": 0.017388885024692217, + "grad_norm": 6.558159382736273, + "learning_rate": 5.795085767269356e-07, + "loss": 1.277, + "step": 250 + }, + { + "epoch": 0.018084440425679905, + "grad_norm": 4.476109318805263, + "learning_rate": 6.026889197960131e-07, + "loss": 1.2341, + "step": 260 + }, + { + "epoch": 0.018779995826667593, + "grad_norm": 4.125188855868361, + "learning_rate": 6.258692628650904e-07, + "loss": 1.1715, + "step": 270 + }, + { + "epoch": 0.019475551227655284, + "grad_norm": 5.298797104178416, + "learning_rate": 6.490496059341679e-07, + "loss": 1.283, + "step": 280 + }, + { + "epoch": 0.020171106628642972, + "grad_norm": 4.8242423728855375, + "learning_rate": 6.722299490032454e-07, + "loss": 1.2307, + "step": 290 + }, + { + "epoch": 0.02086666202963066, + "grad_norm": 6.606351102868575, + "learning_rate": 6.954102920723226e-07, + "loss": 1.2918, + "step": 300 + }, + { + "epoch": 0.021562217430618348, + "grad_norm": 3.535686669100287, + "learning_rate": 7.185906351414002e-07, + "loss": 1.2489, + "step": 310 + }, + { + "epoch": 0.022257772831606036, + "grad_norm": 11.018573728249391, + "learning_rate": 7.417709782104776e-07, + "loss": 1.2761, + "step": 320 + }, + { + "epoch": 0.022953328232593728, + "grad_norm": 4.729597892137768, + "learning_rate": 7.64951321279555e-07, + "loss": 1.2878, + "step": 330 + }, + { + "epoch": 0.023648883633581416, + "grad_norm": 13.48558994617877, + "learning_rate": 7.881316643486324e-07, + "loss": 1.2067, + "step": 340 + }, + { + "epoch": 0.024344439034569103, + "grad_norm": 3.5160970889278187, + "learning_rate": 8.113120074177099e-07, + "loss": 1.232, + "step": 350 + }, + { + "epoch": 0.02503999443555679, + "grad_norm": 4.539228133688202, + "learning_rate": 8.344923504867873e-07, + "loss": 1.2051, + "step": 360 + }, + { + "epoch": 0.02573554983654448, + "grad_norm": 9.328760421076003, + "learning_rate": 8.576726935558646e-07, + "loss": 1.2461, + "step": 370 + }, + { + "epoch": 0.02643110523753217, + "grad_norm": 9.039365501147607, + "learning_rate": 8.808530366249421e-07, + "loss": 1.1789, + "step": 380 + }, + { + "epoch": 0.02712666063851986, + "grad_norm": 4.674438800713769, + "learning_rate": 9.040333796940195e-07, + "loss": 1.1513, + "step": 390 + }, + { + "epoch": 0.027822216039507547, + "grad_norm": 4.461718247054742, + "learning_rate": 9.27213722763097e-07, + "loss": 1.1275, + "step": 400 + }, + { + "epoch": 0.028517771440495235, + "grad_norm": 4.565142649249895, + "learning_rate": 9.503940658321743e-07, + "loss": 1.1352, + "step": 410 + }, + { + "epoch": 0.029213326841482923, + "grad_norm": 12.685534036584528, + "learning_rate": 9.735744089012517e-07, + "loss": 1.106, + "step": 420 + }, + { + "epoch": 0.029908882242470614, + "grad_norm": 3.607629213578401, + "learning_rate": 9.967547519703292e-07, + "loss": 1.1523, + "step": 430 + }, + { + "epoch": 0.030604437643458302, + "grad_norm": 6.325071866727276, + "learning_rate": 1.0199350950394066e-06, + "loss": 1.155, + "step": 440 + }, + { + "epoch": 0.03129999304444599, + "grad_norm": 4.660434429722089, + "learning_rate": 1.043115438108484e-06, + "loss": 1.1998, + "step": 450 + }, + { + "epoch": 0.03199554844543368, + "grad_norm": 3.6523267528106236, + "learning_rate": 1.0662957811775615e-06, + "loss": 1.1963, + "step": 460 + }, + { + "epoch": 0.03269110384642137, + "grad_norm": 4.082494969893831, + "learning_rate": 1.089476124246639e-06, + "loss": 1.1745, + "step": 470 + }, + { + "epoch": 0.03338665924740906, + "grad_norm": 4.2777417674574645, + "learning_rate": 1.1126564673157164e-06, + "loss": 1.1378, + "step": 480 + }, + { + "epoch": 0.034082214648396746, + "grad_norm": 3.6762319349557804, + "learning_rate": 1.1358368103847938e-06, + "loss": 1.1128, + "step": 490 + }, + { + "epoch": 0.034777770049384434, + "grad_norm": 5.07670848589415, + "learning_rate": 1.1590171534538713e-06, + "loss": 1.1416, + "step": 500 + }, + { + "epoch": 0.034777770049384434, + "eval_loss": 1.1608619689941406, + "eval_runtime": 1329.0977, + "eval_samples_per_second": 13.664, + "eval_steps_per_second": 2.277, + "step": 500 + }, + { + "epoch": 0.03547332545037212, + "grad_norm": 4.20297140207172, + "learning_rate": 1.1821974965229487e-06, + "loss": 1.2091, + "step": 510 + }, + { + "epoch": 0.03616888085135981, + "grad_norm": 3.399668583855367, + "learning_rate": 1.2053778395920262e-06, + "loss": 1.1862, + "step": 520 + }, + { + "epoch": 0.0368644362523475, + "grad_norm": 4.217174411165483, + "learning_rate": 1.2285581826611034e-06, + "loss": 1.1605, + "step": 530 + }, + { + "epoch": 0.037559991653335185, + "grad_norm": 4.753373663751844, + "learning_rate": 1.2517385257301808e-06, + "loss": 1.2063, + "step": 540 + }, + { + "epoch": 0.03825554705432287, + "grad_norm": 15.18654271805812, + "learning_rate": 1.2749188687992583e-06, + "loss": 1.1328, + "step": 550 + }, + { + "epoch": 0.03895110245531057, + "grad_norm": 4.025940797005834, + "learning_rate": 1.2980992118683357e-06, + "loss": 1.159, + "step": 560 + }, + { + "epoch": 0.039646657856298256, + "grad_norm": 7.452243164756006, + "learning_rate": 1.3212795549374134e-06, + "loss": 1.1956, + "step": 570 + }, + { + "epoch": 0.040342213257285944, + "grad_norm": 3.369372106751049, + "learning_rate": 1.3444598980064908e-06, + "loss": 1.1278, + "step": 580 + }, + { + "epoch": 0.04103776865827363, + "grad_norm": 7.336034688912176, + "learning_rate": 1.3676402410755678e-06, + "loss": 1.1683, + "step": 590 + }, + { + "epoch": 0.04173332405926132, + "grad_norm": 4.609464134473135, + "learning_rate": 1.3908205841446453e-06, + "loss": 1.0925, + "step": 600 + }, + { + "epoch": 0.04242887946024901, + "grad_norm": 3.717037978652359, + "learning_rate": 1.414000927213723e-06, + "loss": 1.125, + "step": 610 + }, + { + "epoch": 0.043124434861236696, + "grad_norm": 4.196301110411812, + "learning_rate": 1.4371812702828004e-06, + "loss": 1.1584, + "step": 620 + }, + { + "epoch": 0.043819990262224384, + "grad_norm": 5.486863873426256, + "learning_rate": 1.4603616133518778e-06, + "loss": 1.1455, + "step": 630 + }, + { + "epoch": 0.04451554566321207, + "grad_norm": 4.858186902965923, + "learning_rate": 1.4835419564209553e-06, + "loss": 1.1288, + "step": 640 + }, + { + "epoch": 0.04521110106419976, + "grad_norm": 3.8122229848232974, + "learning_rate": 1.5067222994900327e-06, + "loss": 1.125, + "step": 650 + }, + { + "epoch": 0.045906656465187455, + "grad_norm": 14.229965347378094, + "learning_rate": 1.52990264255911e-06, + "loss": 1.1478, + "step": 660 + }, + { + "epoch": 0.04660221186617514, + "grad_norm": 3.8682586331574482, + "learning_rate": 1.5530829856281874e-06, + "loss": 1.0764, + "step": 670 + }, + { + "epoch": 0.04729776726716283, + "grad_norm": 15.480311853564283, + "learning_rate": 1.5762633286972648e-06, + "loss": 1.1825, + "step": 680 + }, + { + "epoch": 0.04799332266815052, + "grad_norm": 3.9046987672904487, + "learning_rate": 1.5994436717663423e-06, + "loss": 1.0574, + "step": 690 + }, + { + "epoch": 0.04868887806913821, + "grad_norm": 4.417750272162218, + "learning_rate": 1.6226240148354197e-06, + "loss": 1.1582, + "step": 700 + }, + { + "epoch": 0.049384433470125895, + "grad_norm": 4.942121683358699, + "learning_rate": 1.6458043579044972e-06, + "loss": 1.1144, + "step": 710 + }, + { + "epoch": 0.05007998887111358, + "grad_norm": 3.3738879128287658, + "learning_rate": 1.6689847009735746e-06, + "loss": 1.068, + "step": 720 + }, + { + "epoch": 0.05077554427210127, + "grad_norm": 5.920179680092115, + "learning_rate": 1.692165044042652e-06, + "loss": 1.1085, + "step": 730 + }, + { + "epoch": 0.05147109967308896, + "grad_norm": 4.079691638982447, + "learning_rate": 1.7153453871117293e-06, + "loss": 1.0896, + "step": 740 + }, + { + "epoch": 0.05216665507407665, + "grad_norm": 7.40216926130164, + "learning_rate": 1.7385257301808067e-06, + "loss": 1.083, + "step": 750 + }, + { + "epoch": 0.05286221047506434, + "grad_norm": 4.391375581767902, + "learning_rate": 1.7617060732498842e-06, + "loss": 1.1516, + "step": 760 + }, + { + "epoch": 0.05355776587605203, + "grad_norm": 8.051708020431825, + "learning_rate": 1.7848864163189616e-06, + "loss": 1.0714, + "step": 770 + }, + { + "epoch": 0.05425332127703972, + "grad_norm": 8.54131953437199, + "learning_rate": 1.808066759388039e-06, + "loss": 1.0639, + "step": 780 + }, + { + "epoch": 0.054948876678027406, + "grad_norm": 4.59244969719458, + "learning_rate": 1.8312471024571165e-06, + "loss": 1.1029, + "step": 790 + }, + { + "epoch": 0.055644432079015094, + "grad_norm": 4.435906933097953, + "learning_rate": 1.854427445526194e-06, + "loss": 1.0743, + "step": 800 + }, + { + "epoch": 0.05633998748000278, + "grad_norm": 5.1662320118548894, + "learning_rate": 1.8776077885952716e-06, + "loss": 1.135, + "step": 810 + }, + { + "epoch": 0.05703554288099047, + "grad_norm": 5.0493661912158805, + "learning_rate": 1.9007881316643486e-06, + "loss": 1.0907, + "step": 820 + }, + { + "epoch": 0.05773109828197816, + "grad_norm": 10.949617078652706, + "learning_rate": 1.9239684747334263e-06, + "loss": 1.0434, + "step": 830 + }, + { + "epoch": 0.058426653682965846, + "grad_norm": 6.187550988701232, + "learning_rate": 1.9471488178025035e-06, + "loss": 1.0528, + "step": 840 + }, + { + "epoch": 0.059122209083953534, + "grad_norm": 3.1253121989612884, + "learning_rate": 1.970329160871581e-06, + "loss": 1.0895, + "step": 850 + }, + { + "epoch": 0.05981776448494123, + "grad_norm": 3.054235910762966, + "learning_rate": 1.9935095039406584e-06, + "loss": 1.0689, + "step": 860 + }, + { + "epoch": 0.060513319885928916, + "grad_norm": 4.136423010166038, + "learning_rate": 2.016689847009736e-06, + "loss": 1.1949, + "step": 870 + }, + { + "epoch": 0.061208875286916604, + "grad_norm": 4.902423398240502, + "learning_rate": 2.0398701900788133e-06, + "loss": 1.0952, + "step": 880 + }, + { + "epoch": 0.06190443068790429, + "grad_norm": 5.879684437628109, + "learning_rate": 2.063050533147891e-06, + "loss": 1.171, + "step": 890 + }, + { + "epoch": 0.06259998608889197, + "grad_norm": 4.1335237025985405, + "learning_rate": 2.086230876216968e-06, + "loss": 1.1525, + "step": 900 + }, + { + "epoch": 0.06329554148987968, + "grad_norm": 3.31880428688954, + "learning_rate": 2.1094112192860454e-06, + "loss": 1.1228, + "step": 910 + }, + { + "epoch": 0.06399109689086736, + "grad_norm": 23.082175053747566, + "learning_rate": 2.132591562355123e-06, + "loss": 1.1599, + "step": 920 + }, + { + "epoch": 0.06468665229185505, + "grad_norm": 5.0955552035300915, + "learning_rate": 2.1557719054242003e-06, + "loss": 1.0651, + "step": 930 + }, + { + "epoch": 0.06538220769284274, + "grad_norm": 9.305910214633359, + "learning_rate": 2.178952248493278e-06, + "loss": 1.0874, + "step": 940 + }, + { + "epoch": 0.06607776309383043, + "grad_norm": 4.068946413040955, + "learning_rate": 2.2021325915623556e-06, + "loss": 1.0665, + "step": 950 + }, + { + "epoch": 0.06677331849481812, + "grad_norm": 3.3675311068396376, + "learning_rate": 2.225312934631433e-06, + "loss": 1.1088, + "step": 960 + }, + { + "epoch": 0.0674688738958058, + "grad_norm": 3.915657433096321, + "learning_rate": 2.24849327770051e-06, + "loss": 1.0653, + "step": 970 + }, + { + "epoch": 0.06816442929679349, + "grad_norm": 3.3918117454405596, + "learning_rate": 2.2716736207695877e-06, + "loss": 1.0909, + "step": 980 + }, + { + "epoch": 0.06885998469778118, + "grad_norm": 3.247036542175465, + "learning_rate": 2.294853963838665e-06, + "loss": 1.0663, + "step": 990 + }, + { + "epoch": 0.06955554009876887, + "grad_norm": 7.801554416680361, + "learning_rate": 2.3180343069077426e-06, + "loss": 1.1538, + "step": 1000 + }, + { + "epoch": 0.06955554009876887, + "eval_loss": 1.091132640838623, + "eval_runtime": 1328.0537, + "eval_samples_per_second": 13.675, + "eval_steps_per_second": 2.279, + "step": 1000 + }, + { + "epoch": 0.07025109549975656, + "grad_norm": 5.629336569234828, + "learning_rate": 2.34121464997682e-06, + "loss": 1.0773, + "step": 1010 + }, + { + "epoch": 0.07094665090074424, + "grad_norm": 3.84287801372759, + "learning_rate": 2.3643949930458975e-06, + "loss": 1.0599, + "step": 1020 + }, + { + "epoch": 0.07164220630173193, + "grad_norm": 5.025374746213098, + "learning_rate": 2.3875753361149747e-06, + "loss": 1.057, + "step": 1030 + }, + { + "epoch": 0.07233776170271962, + "grad_norm": 2.878294363346734, + "learning_rate": 2.4107556791840523e-06, + "loss": 1.0652, + "step": 1040 + }, + { + "epoch": 0.07303331710370731, + "grad_norm": 5.558987457613281, + "learning_rate": 2.4339360222531296e-06, + "loss": 1.0897, + "step": 1050 + }, + { + "epoch": 0.073728872504695, + "grad_norm": 3.534659392793332, + "learning_rate": 2.457116365322207e-06, + "loss": 1.0896, + "step": 1060 + }, + { + "epoch": 0.07442442790568268, + "grad_norm": 3.725282341773033, + "learning_rate": 2.4802967083912845e-06, + "loss": 1.0789, + "step": 1070 + }, + { + "epoch": 0.07511998330667037, + "grad_norm": 2.9459451330853685, + "learning_rate": 2.5034770514603617e-06, + "loss": 1.1268, + "step": 1080 + }, + { + "epoch": 0.07581553870765806, + "grad_norm": 4.371089491177268, + "learning_rate": 2.5266573945294393e-06, + "loss": 1.1302, + "step": 1090 + }, + { + "epoch": 0.07651109410864575, + "grad_norm": 4.1276842326410526, + "learning_rate": 2.5498377375985166e-06, + "loss": 1.0765, + "step": 1100 + }, + { + "epoch": 0.07720664950963345, + "grad_norm": 4.535372128548419, + "learning_rate": 2.5730180806675942e-06, + "loss": 1.1459, + "step": 1110 + }, + { + "epoch": 0.07790220491062114, + "grad_norm": 3.8656540521472644, + "learning_rate": 2.5961984237366715e-06, + "loss": 1.0793, + "step": 1120 + }, + { + "epoch": 0.07859776031160882, + "grad_norm": 4.128217853422419, + "learning_rate": 2.619378766805749e-06, + "loss": 1.075, + "step": 1130 + }, + { + "epoch": 0.07929331571259651, + "grad_norm": 5.788527413388564, + "learning_rate": 2.6425591098748268e-06, + "loss": 1.0133, + "step": 1140 + }, + { + "epoch": 0.0799888711135842, + "grad_norm": 4.215342719233836, + "learning_rate": 2.665739452943904e-06, + "loss": 1.116, + "step": 1150 + }, + { + "epoch": 0.08068442651457189, + "grad_norm": 13.236929802358423, + "learning_rate": 2.6889197960129816e-06, + "loss": 1.0763, + "step": 1160 + }, + { + "epoch": 0.08137998191555958, + "grad_norm": 4.186733374980703, + "learning_rate": 2.7121001390820585e-06, + "loss": 1.0924, + "step": 1170 + }, + { + "epoch": 0.08207553731654726, + "grad_norm": 4.7674995225294525, + "learning_rate": 2.7352804821511357e-06, + "loss": 1.082, + "step": 1180 + }, + { + "epoch": 0.08277109271753495, + "grad_norm": 4.4134510885078, + "learning_rate": 2.7584608252202133e-06, + "loss": 1.0381, + "step": 1190 + }, + { + "epoch": 0.08346664811852264, + "grad_norm": 11.595211607628576, + "learning_rate": 2.7816411682892906e-06, + "loss": 1.0709, + "step": 1200 + }, + { + "epoch": 0.08416220351951033, + "grad_norm": 10.602218981492141, + "learning_rate": 2.8048215113583682e-06, + "loss": 1.1064, + "step": 1210 + }, + { + "epoch": 0.08485775892049802, + "grad_norm": 3.8401739497676033, + "learning_rate": 2.828001854427446e-06, + "loss": 1.0532, + "step": 1220 + }, + { + "epoch": 0.0855533143214857, + "grad_norm": 3.435984696989646, + "learning_rate": 2.851182197496523e-06, + "loss": 1.1052, + "step": 1230 + }, + { + "epoch": 0.08624886972247339, + "grad_norm": 4.439108848700687, + "learning_rate": 2.8743625405656008e-06, + "loss": 1.0645, + "step": 1240 + }, + { + "epoch": 0.08694442512346108, + "grad_norm": 4.123159332852383, + "learning_rate": 2.897542883634678e-06, + "loss": 1.0065, + "step": 1250 + }, + { + "epoch": 0.08763998052444877, + "grad_norm": 5.954565243778203, + "learning_rate": 2.9207232267037557e-06, + "loss": 1.047, + "step": 1260 + }, + { + "epoch": 0.08833553592543646, + "grad_norm": 3.965930075073621, + "learning_rate": 2.943903569772833e-06, + "loss": 1.1097, + "step": 1270 + }, + { + "epoch": 0.08903109132642414, + "grad_norm": 6.497459338922845, + "learning_rate": 2.9670839128419105e-06, + "loss": 1.1488, + "step": 1280 + }, + { + "epoch": 0.08972664672741183, + "grad_norm": 4.504164171669045, + "learning_rate": 2.9902642559109878e-06, + "loss": 1.0515, + "step": 1290 + }, + { + "epoch": 0.09042220212839952, + "grad_norm": 3.5895422917588, + "learning_rate": 3.0134445989800654e-06, + "loss": 1.0729, + "step": 1300 + }, + { + "epoch": 0.09111775752938722, + "grad_norm": 8.191286005665773, + "learning_rate": 3.0366249420491427e-06, + "loss": 1.133, + "step": 1310 + }, + { + "epoch": 0.09181331293037491, + "grad_norm": 3.061812128875411, + "learning_rate": 3.05980528511822e-06, + "loss": 1.0611, + "step": 1320 + }, + { + "epoch": 0.0925088683313626, + "grad_norm": 3.9523917495652188, + "learning_rate": 3.082985628187297e-06, + "loss": 1.0287, + "step": 1330 + }, + { + "epoch": 0.09320442373235029, + "grad_norm": 3.59942889903288, + "learning_rate": 3.1061659712563748e-06, + "loss": 1.1247, + "step": 1340 + }, + { + "epoch": 0.09389997913333797, + "grad_norm": 4.0245013195989525, + "learning_rate": 3.129346314325452e-06, + "loss": 1.0898, + "step": 1350 + }, + { + "epoch": 0.09459553453432566, + "grad_norm": 3.934680149713043, + "learning_rate": 3.1525266573945297e-06, + "loss": 1.0977, + "step": 1360 + }, + { + "epoch": 0.09529108993531335, + "grad_norm": 3.6770538632419143, + "learning_rate": 3.175707000463607e-06, + "loss": 1.1288, + "step": 1370 + }, + { + "epoch": 0.09598664533630104, + "grad_norm": 3.1337164805385465, + "learning_rate": 3.1988873435326845e-06, + "loss": 1.0507, + "step": 1380 + }, + { + "epoch": 0.09668220073728873, + "grad_norm": 4.284816022717549, + "learning_rate": 3.2220676866017618e-06, + "loss": 1.0199, + "step": 1390 + }, + { + "epoch": 0.09737775613827641, + "grad_norm": 4.17161225285423, + "learning_rate": 3.2452480296708394e-06, + "loss": 1.1341, + "step": 1400 + }, + { + "epoch": 0.0980733115392641, + "grad_norm": 4.025784325261557, + "learning_rate": 3.268428372739917e-06, + "loss": 1.0543, + "step": 1410 + }, + { + "epoch": 0.09876886694025179, + "grad_norm": 3.2234427700459283, + "learning_rate": 3.2916087158089943e-06, + "loss": 1.0448, + "step": 1420 + }, + { + "epoch": 0.09946442234123948, + "grad_norm": 5.3567850049332195, + "learning_rate": 3.314789058878072e-06, + "loss": 1.0625, + "step": 1430 + }, + { + "epoch": 0.10015997774222717, + "grad_norm": 3.6455322564383312, + "learning_rate": 3.337969401947149e-06, + "loss": 1.042, + "step": 1440 + }, + { + "epoch": 0.10085553314321485, + "grad_norm": 3.983629478274342, + "learning_rate": 3.361149745016227e-06, + "loss": 1.0837, + "step": 1450 + }, + { + "epoch": 0.10155108854420254, + "grad_norm": 3.370347247056451, + "learning_rate": 3.384330088085304e-06, + "loss": 1.0794, + "step": 1460 + }, + { + "epoch": 0.10224664394519023, + "grad_norm": 5.784870046877274, + "learning_rate": 3.4075104311543817e-06, + "loss": 0.9923, + "step": 1470 + }, + { + "epoch": 0.10294219934617792, + "grad_norm": 3.8865345481376594, + "learning_rate": 3.4306907742234585e-06, + "loss": 1.0936, + "step": 1480 + }, + { + "epoch": 0.1036377547471656, + "grad_norm": 4.445483017054741, + "learning_rate": 3.453871117292536e-06, + "loss": 1.1002, + "step": 1490 + }, + { + "epoch": 0.1043333101481533, + "grad_norm": 5.255987572052633, + "learning_rate": 3.4770514603616134e-06, + "loss": 1.1151, + "step": 1500 + }, + { + "epoch": 0.1043333101481533, + "eval_loss": 1.0752936601638794, + "eval_runtime": 1325.8921, + "eval_samples_per_second": 13.697, + "eval_steps_per_second": 2.283, + "step": 1500 + }, + { + "epoch": 0.105028865549141, + "grad_norm": 4.333704522369647, + "learning_rate": 3.500231803430691e-06, + "loss": 1.1004, + "step": 1510 + }, + { + "epoch": 0.10572442095012868, + "grad_norm": 4.8656069348586914, + "learning_rate": 3.5234121464997683e-06, + "loss": 1.1045, + "step": 1520 + }, + { + "epoch": 0.10641997635111637, + "grad_norm": 4.480958334403094, + "learning_rate": 3.546592489568846e-06, + "loss": 1.056, + "step": 1530 + }, + { + "epoch": 0.10711553175210406, + "grad_norm": 13.673974366934772, + "learning_rate": 3.569772832637923e-06, + "loss": 1.0705, + "step": 1540 + }, + { + "epoch": 0.10781108715309175, + "grad_norm": 3.4775830641032432, + "learning_rate": 3.592953175707001e-06, + "loss": 1.0943, + "step": 1550 + }, + { + "epoch": 0.10850664255407944, + "grad_norm": 4.214045840844501, + "learning_rate": 3.616133518776078e-06, + "loss": 1.0715, + "step": 1560 + }, + { + "epoch": 0.10920219795506712, + "grad_norm": 4.1854819921304385, + "learning_rate": 3.6393138618451557e-06, + "loss": 1.0594, + "step": 1570 + }, + { + "epoch": 0.10989775335605481, + "grad_norm": 4.69866797362075, + "learning_rate": 3.662494204914233e-06, + "loss": 1.1025, + "step": 1580 + }, + { + "epoch": 0.1105933087570425, + "grad_norm": 8.72171959408076, + "learning_rate": 3.6856745479833106e-06, + "loss": 1.1339, + "step": 1590 + }, + { + "epoch": 0.11128886415803019, + "grad_norm": 6.485562591845707, + "learning_rate": 3.708854891052388e-06, + "loss": 1.0751, + "step": 1600 + }, + { + "epoch": 0.11198441955901788, + "grad_norm": 3.875318647768489, + "learning_rate": 3.7320352341214655e-06, + "loss": 1.0389, + "step": 1610 + }, + { + "epoch": 0.11267997496000556, + "grad_norm": 3.994299900281713, + "learning_rate": 3.755215577190543e-06, + "loss": 1.1467, + "step": 1620 + }, + { + "epoch": 0.11337553036099325, + "grad_norm": 3.5047107429883577, + "learning_rate": 3.77839592025962e-06, + "loss": 1.0703, + "step": 1630 + }, + { + "epoch": 0.11407108576198094, + "grad_norm": 3.0110253543770713, + "learning_rate": 3.801576263328697e-06, + "loss": 1.0616, + "step": 1640 + }, + { + "epoch": 0.11476664116296863, + "grad_norm": 4.861207684453612, + "learning_rate": 3.824756606397775e-06, + "loss": 1.1103, + "step": 1650 + }, + { + "epoch": 0.11546219656395632, + "grad_norm": 10.280188296096748, + "learning_rate": 3.8479369494668525e-06, + "loss": 1.0428, + "step": 1660 + }, + { + "epoch": 0.116157751964944, + "grad_norm": 3.079730536821372, + "learning_rate": 3.871117292535929e-06, + "loss": 0.9963, + "step": 1670 + }, + { + "epoch": 0.11685330736593169, + "grad_norm": 3.540092215949589, + "learning_rate": 3.894297635605007e-06, + "loss": 1.0193, + "step": 1680 + }, + { + "epoch": 0.11754886276691938, + "grad_norm": 3.062613191358516, + "learning_rate": 3.917477978674085e-06, + "loss": 1.0522, + "step": 1690 + }, + { + "epoch": 0.11824441816790707, + "grad_norm": 3.049017213492556, + "learning_rate": 3.940658321743162e-06, + "loss": 1.0278, + "step": 1700 + }, + { + "epoch": 0.11893997356889477, + "grad_norm": 3.6700191996058527, + "learning_rate": 3.96383866481224e-06, + "loss": 1.0648, + "step": 1710 + }, + { + "epoch": 0.11963552896988246, + "grad_norm": 3.4545461380369056, + "learning_rate": 3.987019007881317e-06, + "loss": 1.0466, + "step": 1720 + }, + { + "epoch": 0.12033108437087014, + "grad_norm": 3.5092159246029815, + "learning_rate": 4.010199350950394e-06, + "loss": 1.059, + "step": 1730 + }, + { + "epoch": 0.12102663977185783, + "grad_norm": 4.397593667262636, + "learning_rate": 4.033379694019472e-06, + "loss": 1.1254, + "step": 1740 + }, + { + "epoch": 0.12172219517284552, + "grad_norm": 3.6497385462779715, + "learning_rate": 4.05656003708855e-06, + "loss": 1.0665, + "step": 1750 + }, + { + "epoch": 0.12241775057383321, + "grad_norm": 3.431984821190286, + "learning_rate": 4.0797403801576265e-06, + "loss": 1.0331, + "step": 1760 + }, + { + "epoch": 0.1231133059748209, + "grad_norm": 3.051515255369257, + "learning_rate": 4.102920723226704e-06, + "loss": 1.0633, + "step": 1770 + }, + { + "epoch": 0.12380886137580858, + "grad_norm": 7.535228688712293, + "learning_rate": 4.126101066295782e-06, + "loss": 1.0716, + "step": 1780 + }, + { + "epoch": 0.12450441677679627, + "grad_norm": 3.4896507742937475, + "learning_rate": 4.149281409364859e-06, + "loss": 1.0985, + "step": 1790 + }, + { + "epoch": 0.12519997217778395, + "grad_norm": 3.1355424982535514, + "learning_rate": 4.172461752433936e-06, + "loss": 1.0374, + "step": 1800 + }, + { + "epoch": 0.12589552757877165, + "grad_norm": 16.685636824550922, + "learning_rate": 4.195642095503014e-06, + "loss": 1.0416, + "step": 1810 + }, + { + "epoch": 0.12659108297975935, + "grad_norm": 4.015629083094722, + "learning_rate": 4.218822438572091e-06, + "loss": 1.0487, + "step": 1820 + }, + { + "epoch": 0.12728663838074702, + "grad_norm": 6.471513568876525, + "learning_rate": 4.242002781641168e-06, + "loss": 1.1297, + "step": 1830 + }, + { + "epoch": 0.12798219378173473, + "grad_norm": 8.306196436789273, + "learning_rate": 4.265183124710246e-06, + "loss": 1.044, + "step": 1840 + }, + { + "epoch": 0.1286777491827224, + "grad_norm": 4.053533475042931, + "learning_rate": 4.288363467779324e-06, + "loss": 1.1015, + "step": 1850 + }, + { + "epoch": 0.1293733045837101, + "grad_norm": 3.519093588192627, + "learning_rate": 4.3115438108484005e-06, + "loss": 1.025, + "step": 1860 + }, + { + "epoch": 0.13006885998469778, + "grad_norm": 3.196154974457117, + "learning_rate": 4.334724153917478e-06, + "loss": 1.1408, + "step": 1870 + }, + { + "epoch": 0.13076441538568548, + "grad_norm": 3.666893419506706, + "learning_rate": 4.357904496986556e-06, + "loss": 1.1191, + "step": 1880 + }, + { + "epoch": 0.13145997078667315, + "grad_norm": 3.6277285241694943, + "learning_rate": 4.3810848400556335e-06, + "loss": 1.0651, + "step": 1890 + }, + { + "epoch": 0.13215552618766085, + "grad_norm": 4.0564176949965365, + "learning_rate": 4.404265183124711e-06, + "loss": 1.0316, + "step": 1900 + }, + { + "epoch": 0.13285108158864853, + "grad_norm": 4.065373128038651, + "learning_rate": 4.427445526193788e-06, + "loss": 1.0456, + "step": 1910 + }, + { + "epoch": 0.13354663698963623, + "grad_norm": 4.290077257783666, + "learning_rate": 4.450625869262866e-06, + "loss": 1.0384, + "step": 1920 + }, + { + "epoch": 0.1342421923906239, + "grad_norm": 7.489408002045181, + "learning_rate": 4.473806212331943e-06, + "loss": 1.0399, + "step": 1930 + }, + { + "epoch": 0.1349377477916116, + "grad_norm": 2.613915253627605, + "learning_rate": 4.49698655540102e-06, + "loss": 1.0179, + "step": 1940 + }, + { + "epoch": 0.13563330319259928, + "grad_norm": 11.750825471211268, + "learning_rate": 4.520166898470098e-06, + "loss": 1.0048, + "step": 1950 + }, + { + "epoch": 0.13632885859358698, + "grad_norm": 4.415607715055371, + "learning_rate": 4.543347241539175e-06, + "loss": 1.0717, + "step": 1960 + }, + { + "epoch": 0.13702441399457466, + "grad_norm": 3.6266687484554723, + "learning_rate": 4.566527584608252e-06, + "loss": 1.1268, + "step": 1970 + }, + { + "epoch": 0.13771996939556236, + "grad_norm": 3.398865624308442, + "learning_rate": 4.58970792767733e-06, + "loss": 1.092, + "step": 1980 + }, + { + "epoch": 0.13841552479655003, + "grad_norm": 2.3663251420998495, + "learning_rate": 4.6128882707464075e-06, + "loss": 1.0916, + "step": 1990 + }, + { + "epoch": 0.13911108019753773, + "grad_norm": 4.304792419271643, + "learning_rate": 4.636068613815485e-06, + "loss": 1.0187, + "step": 2000 + }, + { + "epoch": 0.13911108019753773, + "eval_loss": 1.0618764162063599, + "eval_runtime": 1327.3733, + "eval_samples_per_second": 13.682, + "eval_steps_per_second": 2.28, + "step": 2000 + }, + { + "epoch": 0.13980663559852544, + "grad_norm": 2.9067395465567243, + "learning_rate": 4.659248956884562e-06, + "loss": 1.0347, + "step": 2010 + }, + { + "epoch": 0.1405021909995131, + "grad_norm": 4.4718974989494225, + "learning_rate": 4.68242929995364e-06, + "loss": 1.0613, + "step": 2020 + }, + { + "epoch": 0.1411977464005008, + "grad_norm": 5.093360608315384, + "learning_rate": 4.705609643022717e-06, + "loss": 1.0429, + "step": 2030 + }, + { + "epoch": 0.1418933018014885, + "grad_norm": 3.8243124512374593, + "learning_rate": 4.728789986091795e-06, + "loss": 1.0304, + "step": 2040 + }, + { + "epoch": 0.1425888572024762, + "grad_norm": 4.243954748940069, + "learning_rate": 4.751970329160872e-06, + "loss": 1.0652, + "step": 2050 + }, + { + "epoch": 0.14328441260346386, + "grad_norm": 3.1893821023746463, + "learning_rate": 4.775150672229949e-06, + "loss": 1.0632, + "step": 2060 + }, + { + "epoch": 0.14397996800445156, + "grad_norm": 6.93748491255458, + "learning_rate": 4.798331015299027e-06, + "loss": 1.0648, + "step": 2070 + }, + { + "epoch": 0.14467552340543924, + "grad_norm": 2.9155418416483494, + "learning_rate": 4.821511358368105e-06, + "loss": 1.0849, + "step": 2080 + }, + { + "epoch": 0.14537107880642694, + "grad_norm": 3.4366870295928975, + "learning_rate": 4.844691701437182e-06, + "loss": 1.0296, + "step": 2090 + }, + { + "epoch": 0.14606663420741461, + "grad_norm": 4.081208731435202, + "learning_rate": 4.867872044506259e-06, + "loss": 1.1094, + "step": 2100 + }, + { + "epoch": 0.14676218960840232, + "grad_norm": 3.308563013826008, + "learning_rate": 4.891052387575336e-06, + "loss": 1.0767, + "step": 2110 + }, + { + "epoch": 0.14745774500939, + "grad_norm": 5.267940931404248, + "learning_rate": 4.914232730644414e-06, + "loss": 1.0621, + "step": 2120 + }, + { + "epoch": 0.1481533004103777, + "grad_norm": 3.7628128616549215, + "learning_rate": 4.937413073713491e-06, + "loss": 1.0537, + "step": 2130 + }, + { + "epoch": 0.14884885581136537, + "grad_norm": 4.059513147212732, + "learning_rate": 4.960593416782569e-06, + "loss": 1.113, + "step": 2140 + }, + { + "epoch": 0.14954441121235307, + "grad_norm": 2.908932115104416, + "learning_rate": 4.983773759851646e-06, + "loss": 1.079, + "step": 2150 + }, + { + "epoch": 0.15023996661334074, + "grad_norm": 3.2670321256502977, + "learning_rate": 5.006954102920723e-06, + "loss": 1.1203, + "step": 2160 + }, + { + "epoch": 0.15093552201432844, + "grad_norm": 2.6905398739406516, + "learning_rate": 5.030134445989802e-06, + "loss": 1.0186, + "step": 2170 + }, + { + "epoch": 0.15163107741531612, + "grad_norm": 12.501698607098879, + "learning_rate": 5.053314789058879e-06, + "loss": 1.0947, + "step": 2180 + }, + { + "epoch": 0.15232663281630382, + "grad_norm": 2.9482685992791886, + "learning_rate": 5.0764951321279555e-06, + "loss": 1.0461, + "step": 2190 + }, + { + "epoch": 0.1530221882172915, + "grad_norm": 5.625663702244715, + "learning_rate": 5.099675475197033e-06, + "loss": 1.0341, + "step": 2200 + }, + { + "epoch": 0.1537177436182792, + "grad_norm": 3.4289462834415834, + "learning_rate": 5.12285581826611e-06, + "loss": 1.0059, + "step": 2210 + }, + { + "epoch": 0.1544132990192669, + "grad_norm": 4.370533176394325, + "learning_rate": 5.1460361613351884e-06, + "loss": 1.0743, + "step": 2220 + }, + { + "epoch": 0.15510885442025457, + "grad_norm": 3.779088621658577, + "learning_rate": 5.169216504404265e-06, + "loss": 1.0566, + "step": 2230 + }, + { + "epoch": 0.15580440982124227, + "grad_norm": 2.6309605795295763, + "learning_rate": 5.192396847473343e-06, + "loss": 1.1118, + "step": 2240 + }, + { + "epoch": 0.15649996522222995, + "grad_norm": 3.350868522650498, + "learning_rate": 5.2155771905424206e-06, + "loss": 1.0796, + "step": 2250 + }, + { + "epoch": 0.15719552062321765, + "grad_norm": 3.7739915686010175, + "learning_rate": 5.238757533611498e-06, + "loss": 1.0818, + "step": 2260 + }, + { + "epoch": 0.15789107602420532, + "grad_norm": 2.436431230555693, + "learning_rate": 5.261937876680575e-06, + "loss": 1.0652, + "step": 2270 + }, + { + "epoch": 0.15858663142519303, + "grad_norm": 3.1362475748972014, + "learning_rate": 5.2851182197496535e-06, + "loss": 1.0518, + "step": 2280 + }, + { + "epoch": 0.1592821868261807, + "grad_norm": 2.9414783357873024, + "learning_rate": 5.30829856281873e-06, + "loss": 1.0299, + "step": 2290 + }, + { + "epoch": 0.1599777422271684, + "grad_norm": 12.121814006646256, + "learning_rate": 5.331478905887808e-06, + "loss": 1.1245, + "step": 2300 + }, + { + "epoch": 0.16067329762815608, + "grad_norm": 3.770012131730106, + "learning_rate": 5.354659248956885e-06, + "loss": 0.9972, + "step": 2310 + }, + { + "epoch": 0.16136885302914378, + "grad_norm": 9.794855473580656, + "learning_rate": 5.377839592025963e-06, + "loss": 1.0707, + "step": 2320 + }, + { + "epoch": 0.16206440843013145, + "grad_norm": 4.050572489825119, + "learning_rate": 5.40101993509504e-06, + "loss": 1.0547, + "step": 2330 + }, + { + "epoch": 0.16275996383111915, + "grad_norm": 8.236017420524046, + "learning_rate": 5.424200278164117e-06, + "loss": 1.0318, + "step": 2340 + }, + { + "epoch": 0.16345551923210683, + "grad_norm": 3.027901539761808, + "learning_rate": 5.4473806212331946e-06, + "loss": 1.0416, + "step": 2350 + }, + { + "epoch": 0.16415107463309453, + "grad_norm": 4.41408812559083, + "learning_rate": 5.470560964302271e-06, + "loss": 1.0604, + "step": 2360 + }, + { + "epoch": 0.1648466300340822, + "grad_norm": 2.9780969525574865, + "learning_rate": 5.49374130737135e-06, + "loss": 1.0608, + "step": 2370 + }, + { + "epoch": 0.1655421854350699, + "grad_norm": 3.0671773318616227, + "learning_rate": 5.516921650440427e-06, + "loss": 1.0868, + "step": 2380 + }, + { + "epoch": 0.16623774083605758, + "grad_norm": 4.118851560262518, + "learning_rate": 5.540101993509504e-06, + "loss": 1.0868, + "step": 2390 + }, + { + "epoch": 0.16693329623704528, + "grad_norm": 4.232046634800152, + "learning_rate": 5.563282336578581e-06, + "loss": 1.0384, + "step": 2400 + }, + { + "epoch": 0.16762885163803298, + "grad_norm": 4.348527302438795, + "learning_rate": 5.58646267964766e-06, + "loss": 1.097, + "step": 2410 + }, + { + "epoch": 0.16832440703902066, + "grad_norm": 3.827970470480688, + "learning_rate": 5.6096430227167365e-06, + "loss": 1.106, + "step": 2420 + }, + { + "epoch": 0.16901996244000836, + "grad_norm": 3.220365339781202, + "learning_rate": 5.632823365785814e-06, + "loss": 1.0698, + "step": 2430 + }, + { + "epoch": 0.16971551784099603, + "grad_norm": 4.574703543970943, + "learning_rate": 5.656003708854892e-06, + "loss": 1.0482, + "step": 2440 + }, + { + "epoch": 0.17041107324198373, + "grad_norm": 5.198211739741746, + "learning_rate": 5.679184051923969e-06, + "loss": 1.0863, + "step": 2450 + }, + { + "epoch": 0.1711066286429714, + "grad_norm": 7.952029078225997, + "learning_rate": 5.702364394993046e-06, + "loss": 1.0289, + "step": 2460 + }, + { + "epoch": 0.1718021840439591, + "grad_norm": 3.717910670753866, + "learning_rate": 5.725544738062124e-06, + "loss": 1.0449, + "step": 2470 + }, + { + "epoch": 0.17249773944494678, + "grad_norm": 2.839405187112438, + "learning_rate": 5.7487250811312015e-06, + "loss": 1.1054, + "step": 2480 + }, + { + "epoch": 0.1731932948459345, + "grad_norm": 5.100028632889747, + "learning_rate": 5.771905424200278e-06, + "loss": 1.0364, + "step": 2490 + }, + { + "epoch": 0.17388885024692216, + "grad_norm": 6.259391445905808, + "learning_rate": 5.795085767269356e-06, + "loss": 1.1028, + "step": 2500 + }, + { + "epoch": 0.17388885024692216, + "eval_loss": 1.0674511194229126, + "eval_runtime": 1328.7845, + "eval_samples_per_second": 13.667, + "eval_steps_per_second": 2.278, + "step": 2500 + }, + { + "epoch": 0.17458440564790986, + "grad_norm": 2.763982569170532, + "learning_rate": 5.818266110338433e-06, + "loss": 1.0008, + "step": 2510 + }, + { + "epoch": 0.17527996104889754, + "grad_norm": 4.13392380506925, + "learning_rate": 5.841446453407511e-06, + "loss": 1.0199, + "step": 2520 + }, + { + "epoch": 0.17597551644988524, + "grad_norm": 2.718979718627869, + "learning_rate": 5.864626796476588e-06, + "loss": 1.0481, + "step": 2530 + }, + { + "epoch": 0.1766710718508729, + "grad_norm": 3.544175706074825, + "learning_rate": 5.887807139545666e-06, + "loss": 1.0705, + "step": 2540 + }, + { + "epoch": 0.17736662725186061, + "grad_norm": 2.369641133247102, + "learning_rate": 5.9109874826147426e-06, + "loss": 1.0019, + "step": 2550 + }, + { + "epoch": 0.1780621826528483, + "grad_norm": 3.2491288177142374, + "learning_rate": 5.934167825683821e-06, + "loss": 1.075, + "step": 2560 + }, + { + "epoch": 0.178757738053836, + "grad_norm": 4.049292553224335, + "learning_rate": 5.957348168752898e-06, + "loss": 1.0782, + "step": 2570 + }, + { + "epoch": 0.17945329345482366, + "grad_norm": 3.1450505684314214, + "learning_rate": 5.9805285118219755e-06, + "loss": 1.0648, + "step": 2580 + }, + { + "epoch": 0.18014884885581137, + "grad_norm": 3.1113775595823405, + "learning_rate": 6.003708854891052e-06, + "loss": 1.0922, + "step": 2590 + }, + { + "epoch": 0.18084440425679904, + "grad_norm": 5.155189429879514, + "learning_rate": 6.026889197960131e-06, + "loss": 1.1121, + "step": 2600 + }, + { + "epoch": 0.18153995965778674, + "grad_norm": 4.07511593643901, + "learning_rate": 6.050069541029208e-06, + "loss": 1.1111, + "step": 2610 + }, + { + "epoch": 0.18223551505877444, + "grad_norm": 3.82251950163841, + "learning_rate": 6.073249884098285e-06, + "loss": 1.0696, + "step": 2620 + }, + { + "epoch": 0.18293107045976212, + "grad_norm": 3.8031073285007997, + "learning_rate": 6.096430227167363e-06, + "loss": 1.0633, + "step": 2630 + }, + { + "epoch": 0.18362662586074982, + "grad_norm": 3.240212419015874, + "learning_rate": 6.11961057023644e-06, + "loss": 1.0891, + "step": 2640 + }, + { + "epoch": 0.1843221812617375, + "grad_norm": 2.667500239439651, + "learning_rate": 6.142790913305517e-06, + "loss": 1.0629, + "step": 2650 + }, + { + "epoch": 0.1850177366627252, + "grad_norm": 5.699130663613962, + "learning_rate": 6.165971256374594e-06, + "loss": 1.0784, + "step": 2660 + }, + { + "epoch": 0.18571329206371287, + "grad_norm": 5.25584976214573, + "learning_rate": 6.189151599443673e-06, + "loss": 1.054, + "step": 2670 + }, + { + "epoch": 0.18640884746470057, + "grad_norm": 7.129532835607845, + "learning_rate": 6.2123319425127495e-06, + "loss": 0.9971, + "step": 2680 + }, + { + "epoch": 0.18710440286568825, + "grad_norm": 4.4629396158382795, + "learning_rate": 6.235512285581827e-06, + "loss": 1.0788, + "step": 2690 + }, + { + "epoch": 0.18779995826667595, + "grad_norm": 3.909205800283168, + "learning_rate": 6.258692628650904e-06, + "loss": 1.0927, + "step": 2700 + }, + { + "epoch": 0.18849551366766362, + "grad_norm": 3.155099572520692, + "learning_rate": 6.2818729717199825e-06, + "loss": 1.0977, + "step": 2710 + }, + { + "epoch": 0.18919106906865132, + "grad_norm": 3.5517893908691067, + "learning_rate": 6.305053314789059e-06, + "loss": 1.0754, + "step": 2720 + }, + { + "epoch": 0.189886624469639, + "grad_norm": 2.6969013297115696, + "learning_rate": 6.328233657858137e-06, + "loss": 0.9871, + "step": 2730 + }, + { + "epoch": 0.1905821798706267, + "grad_norm": 3.0514550315300353, + "learning_rate": 6.351414000927214e-06, + "loss": 1.035, + "step": 2740 + }, + { + "epoch": 0.19127773527161437, + "grad_norm": 3.1613622453724806, + "learning_rate": 6.374594343996292e-06, + "loss": 1.0828, + "step": 2750 + }, + { + "epoch": 0.19197329067260208, + "grad_norm": 3.461002916100869, + "learning_rate": 6.397774687065369e-06, + "loss": 1.0698, + "step": 2760 + }, + { + "epoch": 0.19266884607358975, + "grad_norm": 8.041096720956304, + "learning_rate": 6.420955030134447e-06, + "loss": 1.0628, + "step": 2770 + }, + { + "epoch": 0.19336440147457745, + "grad_norm": 3.483464401584098, + "learning_rate": 6.4441353732035235e-06, + "loss": 1.0531, + "step": 2780 + }, + { + "epoch": 0.19405995687556513, + "grad_norm": 2.9440019535611177, + "learning_rate": 6.467315716272602e-06, + "loss": 1.0536, + "step": 2790 + }, + { + "epoch": 0.19475551227655283, + "grad_norm": 3.207209246776207, + "learning_rate": 6.490496059341679e-06, + "loss": 1.0264, + "step": 2800 + }, + { + "epoch": 0.19545106767754053, + "grad_norm": 5.235464616480115, + "learning_rate": 6.513676402410756e-06, + "loss": 1.0922, + "step": 2810 + }, + { + "epoch": 0.1961466230785282, + "grad_norm": 2.9189755647590956, + "learning_rate": 6.536856745479834e-06, + "loss": 1.0612, + "step": 2820 + }, + { + "epoch": 0.1968421784795159, + "grad_norm": 5.911521977969626, + "learning_rate": 6.560037088548911e-06, + "loss": 1.0088, + "step": 2830 + }, + { + "epoch": 0.19753773388050358, + "grad_norm": 3.6441221504438084, + "learning_rate": 6.583217431617989e-06, + "loss": 1.1006, + "step": 2840 + }, + { + "epoch": 0.19823328928149128, + "grad_norm": 2.6809143001921476, + "learning_rate": 6.6063977746870654e-06, + "loss": 1.1107, + "step": 2850 + }, + { + "epoch": 0.19892884468247896, + "grad_norm": 4.786872594989275, + "learning_rate": 6.629578117756144e-06, + "loss": 1.0225, + "step": 2860 + }, + { + "epoch": 0.19962440008346666, + "grad_norm": 5.075575195197206, + "learning_rate": 6.652758460825221e-06, + "loss": 1.0641, + "step": 2870 + }, + { + "epoch": 0.20031995548445433, + "grad_norm": 2.4663275796267508, + "learning_rate": 6.675938803894298e-06, + "loss": 1.0465, + "step": 2880 + }, + { + "epoch": 0.20101551088544203, + "grad_norm": 4.861471470277184, + "learning_rate": 6.699119146963375e-06, + "loss": 1.0409, + "step": 2890 + }, + { + "epoch": 0.2017110662864297, + "grad_norm": 2.862300294756334, + "learning_rate": 6.722299490032454e-06, + "loss": 1.0358, + "step": 2900 + }, + { + "epoch": 0.2024066216874174, + "grad_norm": 2.933938165433661, + "learning_rate": 6.7454798331015305e-06, + "loss": 1.08, + "step": 2910 + }, + { + "epoch": 0.20310217708840508, + "grad_norm": 3.8676319324274813, + "learning_rate": 6.768660176170608e-06, + "loss": 1.0462, + "step": 2920 + }, + { + "epoch": 0.20379773248939279, + "grad_norm": 3.284666136429242, + "learning_rate": 6.791840519239685e-06, + "loss": 1.0112, + "step": 2930 + }, + { + "epoch": 0.20449328789038046, + "grad_norm": 4.841254828698239, + "learning_rate": 6.8150208623087635e-06, + "loss": 1.0549, + "step": 2940 + }, + { + "epoch": 0.20518884329136816, + "grad_norm": 6.263563213811836, + "learning_rate": 6.83820120537784e-06, + "loss": 1.0912, + "step": 2950 + }, + { + "epoch": 0.20588439869235584, + "grad_norm": 2.4926369651624753, + "learning_rate": 6.861381548446917e-06, + "loss": 1.1184, + "step": 2960 + }, + { + "epoch": 0.20657995409334354, + "grad_norm": 10.345369663188581, + "learning_rate": 6.884561891515995e-06, + "loss": 1.0748, + "step": 2970 + }, + { + "epoch": 0.2072755094943312, + "grad_norm": 3.3495180724473257, + "learning_rate": 6.907742234585072e-06, + "loss": 1.0587, + "step": 2980 + }, + { + "epoch": 0.2079710648953189, + "grad_norm": 4.226520552647976, + "learning_rate": 6.93092257765415e-06, + "loss": 1.0578, + "step": 2990 + }, + { + "epoch": 0.2086666202963066, + "grad_norm": 2.823434755096611, + "learning_rate": 6.954102920723227e-06, + "loss": 1.0466, + "step": 3000 + }, + { + "epoch": 0.2086666202963066, + "eval_loss": 1.0648584365844727, + "eval_runtime": 1324.3441, + "eval_samples_per_second": 13.713, + "eval_steps_per_second": 2.286, + "step": 3000 + }, + { + "epoch": 0.2093621756972943, + "grad_norm": 4.6381395642949865, + "learning_rate": 6.9772832637923045e-06, + "loss": 1.1044, + "step": 3010 + }, + { + "epoch": 0.210057731098282, + "grad_norm": 3.211113204841403, + "learning_rate": 7.000463606861382e-06, + "loss": 1.1307, + "step": 3020 + }, + { + "epoch": 0.21075328649926967, + "grad_norm": 3.2133128295007176, + "learning_rate": 7.02364394993046e-06, + "loss": 1.0058, + "step": 3030 + }, + { + "epoch": 0.21144884190025737, + "grad_norm": 3.745047547820964, + "learning_rate": 7.046824292999537e-06, + "loss": 1.0546, + "step": 3040 + }, + { + "epoch": 0.21214439730124504, + "grad_norm": 3.5088859134141774, + "learning_rate": 7.070004636068615e-06, + "loss": 1.0431, + "step": 3050 + }, + { + "epoch": 0.21283995270223274, + "grad_norm": 4.0373669795089375, + "learning_rate": 7.093184979137692e-06, + "loss": 1.0902, + "step": 3060 + }, + { + "epoch": 0.21353550810322042, + "grad_norm": 2.518725427803631, + "learning_rate": 7.11636532220677e-06, + "loss": 1.0726, + "step": 3070 + }, + { + "epoch": 0.21423106350420812, + "grad_norm": 11.9637717050546, + "learning_rate": 7.139545665275846e-06, + "loss": 1.0604, + "step": 3080 + }, + { + "epoch": 0.2149266189051958, + "grad_norm": 5.258980899255202, + "learning_rate": 7.162726008344925e-06, + "loss": 1.0493, + "step": 3090 + }, + { + "epoch": 0.2156221743061835, + "grad_norm": 5.262411462891559, + "learning_rate": 7.185906351414002e-06, + "loss": 1.077, + "step": 3100 + }, + { + "epoch": 0.21631772970717117, + "grad_norm": 5.703524384554722, + "learning_rate": 7.2090866944830785e-06, + "loss": 1.1303, + "step": 3110 + }, + { + "epoch": 0.21701328510815887, + "grad_norm": 2.920182195097713, + "learning_rate": 7.232267037552156e-06, + "loss": 1.056, + "step": 3120 + }, + { + "epoch": 0.21770884050914655, + "grad_norm": 7.510781923632586, + "learning_rate": 7.255447380621233e-06, + "loss": 1.073, + "step": 3130 + }, + { + "epoch": 0.21840439591013425, + "grad_norm": 2.9670736218755094, + "learning_rate": 7.2786277236903115e-06, + "loss": 1.0462, + "step": 3140 + }, + { + "epoch": 0.21909995131112192, + "grad_norm": 2.799403048797392, + "learning_rate": 7.301808066759388e-06, + "loss": 1.0633, + "step": 3150 + }, + { + "epoch": 0.21979550671210962, + "grad_norm": 3.468853043515575, + "learning_rate": 7.324988409828466e-06, + "loss": 1.1189, + "step": 3160 + }, + { + "epoch": 0.2204910621130973, + "grad_norm": 4.0862433135053235, + "learning_rate": 7.348168752897544e-06, + "loss": 1.0405, + "step": 3170 + }, + { + "epoch": 0.221186617514085, + "grad_norm": 24.64781598980598, + "learning_rate": 7.371349095966621e-06, + "loss": 1.0497, + "step": 3180 + }, + { + "epoch": 0.22188217291507267, + "grad_norm": 3.410626085306715, + "learning_rate": 7.394529439035698e-06, + "loss": 1.0774, + "step": 3190 + }, + { + "epoch": 0.22257772831606037, + "grad_norm": 3.284815439391705, + "learning_rate": 7.417709782104776e-06, + "loss": 1.065, + "step": 3200 + }, + { + "epoch": 0.22327328371704808, + "grad_norm": 3.979434278263933, + "learning_rate": 7.440890125173853e-06, + "loss": 1.1064, + "step": 3210 + }, + { + "epoch": 0.22396883911803575, + "grad_norm": 3.6056575531065795, + "learning_rate": 7.464070468242931e-06, + "loss": 1.0656, + "step": 3220 + }, + { + "epoch": 0.22466439451902345, + "grad_norm": 14.166001547471382, + "learning_rate": 7.487250811312008e-06, + "loss": 1.0739, + "step": 3230 + }, + { + "epoch": 0.22535994992001113, + "grad_norm": 4.057676012592112, + "learning_rate": 7.510431154381086e-06, + "loss": 1.0573, + "step": 3240 + }, + { + "epoch": 0.22605550532099883, + "grad_norm": 6.932792932282011, + "learning_rate": 7.533611497450163e-06, + "loss": 1.0451, + "step": 3250 + }, + { + "epoch": 0.2267510607219865, + "grad_norm": 3.5319319427498277, + "learning_rate": 7.55679184051924e-06, + "loss": 1.0371, + "step": 3260 + }, + { + "epoch": 0.2274466161229742, + "grad_norm": 3.823037794178361, + "learning_rate": 7.579972183588318e-06, + "loss": 1.0289, + "step": 3270 + }, + { + "epoch": 0.22814217152396188, + "grad_norm": 3.1971296752989558, + "learning_rate": 7.603152526657394e-06, + "loss": 1.0408, + "step": 3280 + }, + { + "epoch": 0.22883772692494958, + "grad_norm": 18.743835158526338, + "learning_rate": 7.626332869726473e-06, + "loss": 1.031, + "step": 3290 + }, + { + "epoch": 0.22953328232593725, + "grad_norm": 3.748810420383887, + "learning_rate": 7.64951321279555e-06, + "loss": 1.0736, + "step": 3300 + }, + { + "epoch": 0.23022883772692496, + "grad_norm": 3.063766796874128, + "learning_rate": 7.672693555864627e-06, + "loss": 1.0833, + "step": 3310 + }, + { + "epoch": 0.23092439312791263, + "grad_norm": 3.516045409440455, + "learning_rate": 7.695873898933705e-06, + "loss": 1.0439, + "step": 3320 + }, + { + "epoch": 0.23161994852890033, + "grad_norm": 3.378030453662337, + "learning_rate": 7.719054242002783e-06, + "loss": 1.0586, + "step": 3330 + }, + { + "epoch": 0.232315503929888, + "grad_norm": 3.547073697919409, + "learning_rate": 7.742234585071859e-06, + "loss": 1.0749, + "step": 3340 + }, + { + "epoch": 0.2330110593308757, + "grad_norm": 3.389764510695452, + "learning_rate": 7.765414928140938e-06, + "loss": 1.0677, + "step": 3350 + }, + { + "epoch": 0.23370661473186338, + "grad_norm": 3.343664667551203, + "learning_rate": 7.788595271210014e-06, + "loss": 1.0301, + "step": 3360 + }, + { + "epoch": 0.23440217013285108, + "grad_norm": 3.306253310887168, + "learning_rate": 7.811775614279092e-06, + "loss": 1.1026, + "step": 3370 + }, + { + "epoch": 0.23509772553383876, + "grad_norm": 3.3528886918227823, + "learning_rate": 7.83495595734817e-06, + "loss": 1.0322, + "step": 3380 + }, + { + "epoch": 0.23579328093482646, + "grad_norm": 2.526605206101494, + "learning_rate": 7.858136300417247e-06, + "loss": 1.1146, + "step": 3390 + }, + { + "epoch": 0.23648883633581413, + "grad_norm": 2.4599280771944056, + "learning_rate": 7.881316643486325e-06, + "loss": 1.0799, + "step": 3400 + }, + { + "epoch": 0.23718439173680184, + "grad_norm": 2.7852649693807776, + "learning_rate": 7.904496986555402e-06, + "loss": 1.047, + "step": 3410 + }, + { + "epoch": 0.23787994713778954, + "grad_norm": 4.36390715326572, + "learning_rate": 7.92767732962448e-06, + "loss": 1.0576, + "step": 3420 + }, + { + "epoch": 0.2385755025387772, + "grad_norm": 4.17667348146727, + "learning_rate": 7.950857672693556e-06, + "loss": 1.1443, + "step": 3430 + }, + { + "epoch": 0.23927105793976491, + "grad_norm": 4.258317299120901, + "learning_rate": 7.974038015762633e-06, + "loss": 1.0311, + "step": 3440 + }, + { + "epoch": 0.2399666133407526, + "grad_norm": 5.124988681678754, + "learning_rate": 7.997218358831711e-06, + "loss": 1.1141, + "step": 3450 + }, + { + "epoch": 0.2406621687417403, + "grad_norm": 3.2018063380362394, + "learning_rate": 8.020398701900789e-06, + "loss": 1.0385, + "step": 3460 + }, + { + "epoch": 0.24135772414272796, + "grad_norm": 4.173481356448509, + "learning_rate": 8.043579044969866e-06, + "loss": 1.1136, + "step": 3470 + }, + { + "epoch": 0.24205327954371567, + "grad_norm": 4.294103202239798, + "learning_rate": 8.066759388038944e-06, + "loss": 1.0519, + "step": 3480 + }, + { + "epoch": 0.24274883494470334, + "grad_norm": 3.9679266061328393, + "learning_rate": 8.08993973110802e-06, + "loss": 1.0365, + "step": 3490 + }, + { + "epoch": 0.24344439034569104, + "grad_norm": 3.680079592022548, + "learning_rate": 8.1131200741771e-06, + "loss": 1.0874, + "step": 3500 + }, + { + "epoch": 0.24344439034569104, + "eval_loss": 1.0708545446395874, + "eval_runtime": 1324.2946, + "eval_samples_per_second": 13.714, + "eval_steps_per_second": 2.286, + "step": 3500 + }, + { + "epoch": 0.24413994574667872, + "grad_norm": 6.897912532693131, + "learning_rate": 8.136300417246175e-06, + "loss": 1.0606, + "step": 3510 + }, + { + "epoch": 0.24483550114766642, + "grad_norm": 3.323290366873813, + "learning_rate": 8.159480760315253e-06, + "loss": 1.1241, + "step": 3520 + }, + { + "epoch": 0.2455310565486541, + "grad_norm": 4.65122678229039, + "learning_rate": 8.18266110338433e-06, + "loss": 1.027, + "step": 3530 + }, + { + "epoch": 0.2462266119496418, + "grad_norm": 4.00539780698972, + "learning_rate": 8.205841446453408e-06, + "loss": 1.0694, + "step": 3540 + }, + { + "epoch": 0.24692216735062947, + "grad_norm": 5.930767835343031, + "learning_rate": 8.229021789522486e-06, + "loss": 1.1084, + "step": 3550 + }, + { + "epoch": 0.24761772275161717, + "grad_norm": 2.366566370106512, + "learning_rate": 8.252202132591564e-06, + "loss": 1.0439, + "step": 3560 + }, + { + "epoch": 0.24831327815260484, + "grad_norm": 5.324397509879531, + "learning_rate": 8.275382475660641e-06, + "loss": 1.0154, + "step": 3570 + }, + { + "epoch": 0.24900883355359255, + "grad_norm": 2.9450336117914686, + "learning_rate": 8.298562818729717e-06, + "loss": 1.1194, + "step": 3580 + }, + { + "epoch": 0.24970438895458022, + "grad_norm": 2.5042074820565823, + "learning_rate": 8.321743161798795e-06, + "loss": 1.0104, + "step": 3590 + }, + { + "epoch": 0.2503999443555679, + "grad_norm": 3.3086383723903543, + "learning_rate": 8.344923504867873e-06, + "loss": 1.1153, + "step": 3600 + }, + { + "epoch": 0.2510954997565556, + "grad_norm": 8.365521047621579, + "learning_rate": 8.36810384793695e-06, + "loss": 1.1319, + "step": 3610 + }, + { + "epoch": 0.2517910551575433, + "grad_norm": 3.1239555989631733, + "learning_rate": 8.391284191006028e-06, + "loss": 1.0498, + "step": 3620 + }, + { + "epoch": 0.252486610558531, + "grad_norm": 3.403487099180578, + "learning_rate": 8.414464534075106e-06, + "loss": 1.1436, + "step": 3630 + }, + { + "epoch": 0.2531821659595187, + "grad_norm": 7.36165664124148, + "learning_rate": 8.437644877144181e-06, + "loss": 1.0714, + "step": 3640 + }, + { + "epoch": 0.25387772136050635, + "grad_norm": 3.1604618362111587, + "learning_rate": 8.46082522021326e-06, + "loss": 1.1049, + "step": 3650 + }, + { + "epoch": 0.25457327676149405, + "grad_norm": 2.9303847541121124, + "learning_rate": 8.484005563282337e-06, + "loss": 1.0229, + "step": 3660 + }, + { + "epoch": 0.25526883216248175, + "grad_norm": 6.015422493780352, + "learning_rate": 8.507185906351414e-06, + "loss": 1.04, + "step": 3670 + }, + { + "epoch": 0.25596438756346945, + "grad_norm": 5.859910471746404, + "learning_rate": 8.530366249420492e-06, + "loss": 1.0333, + "step": 3680 + }, + { + "epoch": 0.2566599429644571, + "grad_norm": 7.284936853333558, + "learning_rate": 8.55354659248957e-06, + "loss": 1.0512, + "step": 3690 + }, + { + "epoch": 0.2573554983654448, + "grad_norm": 4.180577879330297, + "learning_rate": 8.576726935558647e-06, + "loss": 1.0495, + "step": 3700 + }, + { + "epoch": 0.2580510537664325, + "grad_norm": 4.591655663738542, + "learning_rate": 8.599907278627725e-06, + "loss": 1.0966, + "step": 3710 + }, + { + "epoch": 0.2587466091674202, + "grad_norm": 2.6398518824987205, + "learning_rate": 8.623087621696801e-06, + "loss": 1.0676, + "step": 3720 + }, + { + "epoch": 0.25944216456840785, + "grad_norm": 3.9085160431341444, + "learning_rate": 8.646267964765879e-06, + "loss": 1.0925, + "step": 3730 + }, + { + "epoch": 0.26013771996939555, + "grad_norm": 3.219473487227351, + "learning_rate": 8.669448307834956e-06, + "loss": 1.0751, + "step": 3740 + }, + { + "epoch": 0.26083327537038326, + "grad_norm": 3.297607468342328, + "learning_rate": 8.692628650904034e-06, + "loss": 1.0895, + "step": 3750 + }, + { + "epoch": 0.26152883077137096, + "grad_norm": 3.307694288337221, + "learning_rate": 8.715808993973112e-06, + "loss": 1.0452, + "step": 3760 + }, + { + "epoch": 0.2622243861723586, + "grad_norm": 18.761206896008495, + "learning_rate": 8.73898933704219e-06, + "loss": 1.0734, + "step": 3770 + }, + { + "epoch": 0.2629199415733463, + "grad_norm": 7.9338582213999205, + "learning_rate": 8.762169680111267e-06, + "loss": 1.0892, + "step": 3780 + }, + { + "epoch": 0.263615496974334, + "grad_norm": 3.585492237169441, + "learning_rate": 8.785350023180343e-06, + "loss": 1.0166, + "step": 3790 + }, + { + "epoch": 0.2643110523753217, + "grad_norm": 2.8307760989241366, + "learning_rate": 8.808530366249422e-06, + "loss": 1.0569, + "step": 3800 + }, + { + "epoch": 0.2650066077763094, + "grad_norm": 2.732313592745699, + "learning_rate": 8.831710709318498e-06, + "loss": 1.0404, + "step": 3810 + }, + { + "epoch": 0.26570216317729706, + "grad_norm": 3.022940417963089, + "learning_rate": 8.854891052387576e-06, + "loss": 1.0488, + "step": 3820 + }, + { + "epoch": 0.26639771857828476, + "grad_norm": 8.075073859854212, + "learning_rate": 8.878071395456654e-06, + "loss": 1.096, + "step": 3830 + }, + { + "epoch": 0.26709327397927246, + "grad_norm": 4.103849882196253, + "learning_rate": 8.901251738525731e-06, + "loss": 1.0023, + "step": 3840 + }, + { + "epoch": 0.26778882938026016, + "grad_norm": 5.798330501595627, + "learning_rate": 8.924432081594809e-06, + "loss": 1.08, + "step": 3850 + }, + { + "epoch": 0.2684843847812478, + "grad_norm": 5.24320266021797, + "learning_rate": 8.947612424663886e-06, + "loss": 1.0809, + "step": 3860 + }, + { + "epoch": 0.2691799401822355, + "grad_norm": 4.370234338310967, + "learning_rate": 8.970792767732962e-06, + "loss": 1.0802, + "step": 3870 + }, + { + "epoch": 0.2698754955832232, + "grad_norm": 2.9388111669122847, + "learning_rate": 8.99397311080204e-06, + "loss": 1.0993, + "step": 3880 + }, + { + "epoch": 0.2705710509842109, + "grad_norm": 4.582983781050352, + "learning_rate": 9.017153453871118e-06, + "loss": 1.0565, + "step": 3890 + }, + { + "epoch": 0.27126660638519856, + "grad_norm": 2.7598136981564374, + "learning_rate": 9.040333796940195e-06, + "loss": 1.0518, + "step": 3900 + }, + { + "epoch": 0.27196216178618626, + "grad_norm": 5.310272387350867, + "learning_rate": 9.063514140009273e-06, + "loss": 1.056, + "step": 3910 + }, + { + "epoch": 0.27265771718717396, + "grad_norm": 3.618540733821931, + "learning_rate": 9.08669448307835e-06, + "loss": 1.0698, + "step": 3920 + }, + { + "epoch": 0.27335327258816167, + "grad_norm": 3.2768500780266296, + "learning_rate": 9.109874826147428e-06, + "loss": 1.1052, + "step": 3930 + }, + { + "epoch": 0.2740488279891493, + "grad_norm": 2.898723761952049, + "learning_rate": 9.133055169216504e-06, + "loss": 1.0811, + "step": 3940 + }, + { + "epoch": 0.274744383390137, + "grad_norm": 3.381086882253515, + "learning_rate": 9.156235512285582e-06, + "loss": 1.102, + "step": 3950 + }, + { + "epoch": 0.2754399387911247, + "grad_norm": 2.857666927326934, + "learning_rate": 9.17941585535466e-06, + "loss": 1.0692, + "step": 3960 + }, + { + "epoch": 0.2761354941921124, + "grad_norm": 2.844431377889455, + "learning_rate": 9.202596198423737e-06, + "loss": 1.0656, + "step": 3970 + }, + { + "epoch": 0.27683104959310006, + "grad_norm": 3.5406070954445714, + "learning_rate": 9.225776541492815e-06, + "loss": 1.067, + "step": 3980 + }, + { + "epoch": 0.27752660499408777, + "grad_norm": 2.640755242986449, + "learning_rate": 9.248956884561893e-06, + "loss": 1.0534, + "step": 3990 + }, + { + "epoch": 0.27822216039507547, + "grad_norm": 3.157994632735133, + "learning_rate": 9.27213722763097e-06, + "loss": 1.0277, + "step": 4000 + }, + { + "epoch": 0.27822216039507547, + "eval_loss": 1.0788345336914062, + "eval_runtime": 1320.9241, + "eval_samples_per_second": 13.749, + "eval_steps_per_second": 2.292, + "step": 4000 + }, + { + "epoch": 0.27891771579606317, + "grad_norm": 4.779455467125114, + "learning_rate": 9.295317570700048e-06, + "loss": 1.1007, + "step": 4010 + }, + { + "epoch": 0.27961327119705087, + "grad_norm": 5.743679886281473, + "learning_rate": 9.318497913769124e-06, + "loss": 1.1345, + "step": 4020 + }, + { + "epoch": 0.2803088265980385, + "grad_norm": 4.127704545503805, + "learning_rate": 9.341678256838203e-06, + "loss": 1.0773, + "step": 4030 + }, + { + "epoch": 0.2810043819990262, + "grad_norm": 5.124341026288114, + "learning_rate": 9.36485859990728e-06, + "loss": 1.0751, + "step": 4040 + }, + { + "epoch": 0.2816999374000139, + "grad_norm": 2.831938100378156, + "learning_rate": 9.388038942976357e-06, + "loss": 1.1104, + "step": 4050 + }, + { + "epoch": 0.2823954928010016, + "grad_norm": 2.8164785952525215, + "learning_rate": 9.411219286045434e-06, + "loss": 1.0183, + "step": 4060 + }, + { + "epoch": 0.28309104820198927, + "grad_norm": 6.273010074814853, + "learning_rate": 9.43439962911451e-06, + "loss": 1.1253, + "step": 4070 + }, + { + "epoch": 0.283786603602977, + "grad_norm": 2.9362626881916496, + "learning_rate": 9.45757997218359e-06, + "loss": 1.0701, + "step": 4080 + }, + { + "epoch": 0.2844821590039647, + "grad_norm": 3.2233188905363774, + "learning_rate": 9.480760315252666e-06, + "loss": 1.1022, + "step": 4090 + }, + { + "epoch": 0.2851777144049524, + "grad_norm": 2.6442570793457794, + "learning_rate": 9.503940658321743e-06, + "loss": 1.1319, + "step": 4100 + }, + { + "epoch": 0.28587326980594, + "grad_norm": 2.541623013627999, + "learning_rate": 9.527121001390821e-06, + "loss": 1.0421, + "step": 4110 + }, + { + "epoch": 0.2865688252069277, + "grad_norm": 2.586820775477875, + "learning_rate": 9.550301344459899e-06, + "loss": 1.0585, + "step": 4120 + }, + { + "epoch": 0.2872643806079154, + "grad_norm": 2.230030356398589, + "learning_rate": 9.573481687528976e-06, + "loss": 1.034, + "step": 4130 + }, + { + "epoch": 0.28795993600890313, + "grad_norm": 2.5893033082811865, + "learning_rate": 9.596662030598054e-06, + "loss": 1.0792, + "step": 4140 + }, + { + "epoch": 0.2886554914098908, + "grad_norm": 5.079600239344667, + "learning_rate": 9.619842373667132e-06, + "loss": 1.1007, + "step": 4150 + }, + { + "epoch": 0.2893510468108785, + "grad_norm": 3.5956196701475407, + "learning_rate": 9.64302271673621e-06, + "loss": 1.0577, + "step": 4160 + }, + { + "epoch": 0.2900466022118662, + "grad_norm": 2.5595806488222554, + "learning_rate": 9.666203059805285e-06, + "loss": 1.0925, + "step": 4170 + }, + { + "epoch": 0.2907421576128539, + "grad_norm": 3.2423606776707525, + "learning_rate": 9.689383402874365e-06, + "loss": 1.0705, + "step": 4180 + }, + { + "epoch": 0.2914377130138415, + "grad_norm": 5.348299916040205, + "learning_rate": 9.71256374594344e-06, + "loss": 1.0484, + "step": 4190 + }, + { + "epoch": 0.29213326841482923, + "grad_norm": 2.1169297924210757, + "learning_rate": 9.735744089012518e-06, + "loss": 1.033, + "step": 4200 + }, + { + "epoch": 0.29282882381581693, + "grad_norm": 5.274431068524173, + "learning_rate": 9.758924432081596e-06, + "loss": 1.0538, + "step": 4210 + }, + { + "epoch": 0.29352437921680463, + "grad_norm": 3.287147447367767, + "learning_rate": 9.782104775150672e-06, + "loss": 1.0974, + "step": 4220 + }, + { + "epoch": 0.29421993461779233, + "grad_norm": 2.6789472440908906, + "learning_rate": 9.805285118219751e-06, + "loss": 1.1553, + "step": 4230 + }, + { + "epoch": 0.29491549001878, + "grad_norm": 8.522603522424022, + "learning_rate": 9.828465461288827e-06, + "loss": 1.0456, + "step": 4240 + }, + { + "epoch": 0.2956110454197677, + "grad_norm": 3.015342677407744, + "learning_rate": 9.851645804357905e-06, + "loss": 1.0666, + "step": 4250 + }, + { + "epoch": 0.2963066008207554, + "grad_norm": 4.343449548366913, + "learning_rate": 9.874826147426983e-06, + "loss": 1.0887, + "step": 4260 + }, + { + "epoch": 0.2970021562217431, + "grad_norm": 2.555854072105505, + "learning_rate": 9.89800649049606e-06, + "loss": 1.0535, + "step": 4270 + }, + { + "epoch": 0.29769771162273073, + "grad_norm": 4.647100746652741, + "learning_rate": 9.921186833565138e-06, + "loss": 1.0876, + "step": 4280 + }, + { + "epoch": 0.29839326702371843, + "grad_norm": 6.682275532040958, + "learning_rate": 9.944367176634215e-06, + "loss": 1.0363, + "step": 4290 + }, + { + "epoch": 0.29908882242470614, + "grad_norm": 5.7114784529268885, + "learning_rate": 9.967547519703291e-06, + "loss": 1.0832, + "step": 4300 + }, + { + "epoch": 0.29978437782569384, + "grad_norm": 5.3057266208115825, + "learning_rate": 9.99072786277237e-06, + "loss": 1.0625, + "step": 4310 + }, + { + "epoch": 0.3004799332266815, + "grad_norm": 2.2483188905770937, + "learning_rate": 9.999999410480316e-06, + "loss": 1.0143, + "step": 4320 + }, + { + "epoch": 0.3011754886276692, + "grad_norm": 2.9061264698559977, + "learning_rate": 9.999995807860525e-06, + "loss": 1.0844, + "step": 4330 + }, + { + "epoch": 0.3018710440286569, + "grad_norm": 2.252936895659207, + "learning_rate": 9.999988930134236e-06, + "loss": 1.0808, + "step": 4340 + }, + { + "epoch": 0.3025665994296446, + "grad_norm": 15.419177919098633, + "learning_rate": 9.999978777305955e-06, + "loss": 1.1113, + "step": 4350 + }, + { + "epoch": 0.30326215483063224, + "grad_norm": 2.825208694740048, + "learning_rate": 9.999965349382327e-06, + "loss": 1.0934, + "step": 4360 + }, + { + "epoch": 0.30395771023161994, + "grad_norm": 2.0587142394670352, + "learning_rate": 9.999948646372155e-06, + "loss": 1.0613, + "step": 4370 + }, + { + "epoch": 0.30465326563260764, + "grad_norm": 3.6176632242530444, + "learning_rate": 9.999928668286377e-06, + "loss": 1.0905, + "step": 4380 + }, + { + "epoch": 0.30534882103359534, + "grad_norm": 5.125471744457271, + "learning_rate": 9.999905415138079e-06, + "loss": 1.0282, + "step": 4390 + }, + { + "epoch": 0.306044376434583, + "grad_norm": 3.471981314943399, + "learning_rate": 9.999878886942489e-06, + "loss": 1.141, + "step": 4400 + }, + { + "epoch": 0.3067399318355707, + "grad_norm": 5.273975195503492, + "learning_rate": 9.999849083716989e-06, + "loss": 1.132, + "step": 4410 + }, + { + "epoch": 0.3074354872365584, + "grad_norm": 3.1066894145666097, + "learning_rate": 9.999816005481097e-06, + "loss": 1.0752, + "step": 4420 + }, + { + "epoch": 0.3081310426375461, + "grad_norm": 3.932538321451269, + "learning_rate": 9.99977965225648e-06, + "loss": 1.1106, + "step": 4430 + }, + { + "epoch": 0.3088265980385338, + "grad_norm": 2.9448240164844126, + "learning_rate": 9.999740024066955e-06, + "loss": 1.084, + "step": 4440 + }, + { + "epoch": 0.30952215343952144, + "grad_norm": 2.681350498613468, + "learning_rate": 9.999697120938473e-06, + "loss": 1.0331, + "step": 4450 + }, + { + "epoch": 0.31021770884050914, + "grad_norm": 2.5255885502817996, + "learning_rate": 9.99965094289914e-06, + "loss": 1.0713, + "step": 4460 + }, + { + "epoch": 0.31091326424149685, + "grad_norm": 2.64656076720824, + "learning_rate": 9.999601489979203e-06, + "loss": 1.0957, + "step": 4470 + }, + { + "epoch": 0.31160881964248455, + "grad_norm": 6.0403160135576215, + "learning_rate": 9.999548762211055e-06, + "loss": 1.0873, + "step": 4480 + }, + { + "epoch": 0.3123043750434722, + "grad_norm": 3.3898342265109394, + "learning_rate": 9.999492759629233e-06, + "loss": 1.0552, + "step": 4490 + }, + { + "epoch": 0.3129999304444599, + "grad_norm": 2.6618662660093784, + "learning_rate": 9.999433482270419e-06, + "loss": 1.0044, + "step": 4500 + }, + { + "epoch": 0.3129999304444599, + "eval_loss": 1.0833879709243774, + "eval_runtime": 1322.8495, + "eval_samples_per_second": 13.729, + "eval_steps_per_second": 2.288, + "step": 4500 + }, + { + "epoch": 0.3136954858454476, + "grad_norm": 2.350689740942198, + "learning_rate": 9.999370930173445e-06, + "loss": 1.055, + "step": 4510 + }, + { + "epoch": 0.3143910412464353, + "grad_norm": 2.4680560352620753, + "learning_rate": 9.99930510337928e-06, + "loss": 1.0534, + "step": 4520 + }, + { + "epoch": 0.31508659664742295, + "grad_norm": 9.309709380061202, + "learning_rate": 9.999236001931043e-06, + "loss": 1.0471, + "step": 4530 + }, + { + "epoch": 0.31578215204841065, + "grad_norm": 2.669550194254662, + "learning_rate": 9.999163625873998e-06, + "loss": 1.0571, + "step": 4540 + }, + { + "epoch": 0.31647770744939835, + "grad_norm": 2.680150879986522, + "learning_rate": 9.99908797525555e-06, + "loss": 1.0355, + "step": 4550 + }, + { + "epoch": 0.31717326285038605, + "grad_norm": 3.6394010841983295, + "learning_rate": 9.999009050125257e-06, + "loss": 1.0125, + "step": 4560 + }, + { + "epoch": 0.3178688182513737, + "grad_norm": 2.5664565582093233, + "learning_rate": 9.998926850534811e-06, + "loss": 1.0921, + "step": 4570 + }, + { + "epoch": 0.3185643736523614, + "grad_norm": 3.5094737032101695, + "learning_rate": 9.998841376538058e-06, + "loss": 1.1522, + "step": 4580 + }, + { + "epoch": 0.3192599290533491, + "grad_norm": 2.6264258708676906, + "learning_rate": 9.998752628190986e-06, + "loss": 1.086, + "step": 4590 + }, + { + "epoch": 0.3199554844543368, + "grad_norm": 2.85202172934704, + "learning_rate": 9.998660605551725e-06, + "loss": 1.1215, + "step": 4600 + }, + { + "epoch": 0.32065103985532445, + "grad_norm": 3.3727574529035103, + "learning_rate": 9.998565308680551e-06, + "loss": 1.1077, + "step": 4610 + }, + { + "epoch": 0.32134659525631215, + "grad_norm": 3.084110823456356, + "learning_rate": 9.998466737639889e-06, + "loss": 1.0965, + "step": 4620 + }, + { + "epoch": 0.32204215065729985, + "grad_norm": 6.1011501730715105, + "learning_rate": 9.9983648924943e-06, + "loss": 1.0549, + "step": 4630 + }, + { + "epoch": 0.32273770605828755, + "grad_norm": 4.378677777122456, + "learning_rate": 9.998259773310501e-06, + "loss": 1.0772, + "step": 4640 + }, + { + "epoch": 0.32343326145927526, + "grad_norm": 3.8395083623363337, + "learning_rate": 9.998151380157344e-06, + "loss": 1.1424, + "step": 4650 + }, + { + "epoch": 0.3241288168602629, + "grad_norm": 2.542111749486303, + "learning_rate": 9.998039713105827e-06, + "loss": 1.0718, + "step": 4660 + }, + { + "epoch": 0.3248243722612506, + "grad_norm": 2.7789787361244604, + "learning_rate": 9.9979247722291e-06, + "loss": 1.1228, + "step": 4670 + }, + { + "epoch": 0.3255199276622383, + "grad_norm": 2.582470403935361, + "learning_rate": 9.997806557602446e-06, + "loss": 1.1571, + "step": 4680 + }, + { + "epoch": 0.326215483063226, + "grad_norm": 3.30159135508484, + "learning_rate": 9.9976850693033e-06, + "loss": 1.1078, + "step": 4690 + }, + { + "epoch": 0.32691103846421365, + "grad_norm": 6.441445406521351, + "learning_rate": 9.997560307411241e-06, + "loss": 1.0985, + "step": 4700 + }, + { + "epoch": 0.32760659386520136, + "grad_norm": 4.205924625096802, + "learning_rate": 9.99743227200799e-06, + "loss": 1.0427, + "step": 4710 + }, + { + "epoch": 0.32830214926618906, + "grad_norm": 2.92835064146378, + "learning_rate": 9.997300963177412e-06, + "loss": 1.0701, + "step": 4720 + }, + { + "epoch": 0.32899770466717676, + "grad_norm": 5.36427800075514, + "learning_rate": 9.997166381005518e-06, + "loss": 1.0889, + "step": 4730 + }, + { + "epoch": 0.3296932600681644, + "grad_norm": 3.2122209454029864, + "learning_rate": 9.997028525580463e-06, + "loss": 1.0649, + "step": 4740 + }, + { + "epoch": 0.3303888154691521, + "grad_norm": 3.7952899054039597, + "learning_rate": 9.996887396992545e-06, + "loss": 1.0369, + "step": 4750 + }, + { + "epoch": 0.3310843708701398, + "grad_norm": 3.501545809321222, + "learning_rate": 9.996742995334205e-06, + "loss": 1.0926, + "step": 4760 + }, + { + "epoch": 0.3317799262711275, + "grad_norm": 2.1268393802163037, + "learning_rate": 9.99659532070003e-06, + "loss": 1.0208, + "step": 4770 + }, + { + "epoch": 0.33247548167211516, + "grad_norm": 1.7633333044842892, + "learning_rate": 9.99644437318675e-06, + "loss": 1.0987, + "step": 4780 + }, + { + "epoch": 0.33317103707310286, + "grad_norm": 2.7871039963982662, + "learning_rate": 9.996290152893239e-06, + "loss": 1.0542, + "step": 4790 + }, + { + "epoch": 0.33386659247409056, + "grad_norm": 3.291703279131844, + "learning_rate": 9.996132659920515e-06, + "loss": 1.0717, + "step": 4800 + }, + { + "epoch": 0.33456214787507826, + "grad_norm": 7.222261693705646, + "learning_rate": 9.99597189437174e-06, + "loss": 1.1112, + "step": 4810 + }, + { + "epoch": 0.33525770327606597, + "grad_norm": 4.11639709030618, + "learning_rate": 9.995807856352215e-06, + "loss": 1.0889, + "step": 4820 + }, + { + "epoch": 0.3359532586770536, + "grad_norm": 2.757518977374051, + "learning_rate": 9.995640545969393e-06, + "loss": 1.0725, + "step": 4830 + }, + { + "epoch": 0.3366488140780413, + "grad_norm": 2.831418364203189, + "learning_rate": 9.995469963332866e-06, + "loss": 1.1719, + "step": 4840 + }, + { + "epoch": 0.337344369479029, + "grad_norm": 3.661524455515929, + "learning_rate": 9.995296108554367e-06, + "loss": 1.136, + "step": 4850 + }, + { + "epoch": 0.3380399248800167, + "grad_norm": 3.0316747007359437, + "learning_rate": 9.995118981747775e-06, + "loss": 1.0928, + "step": 4860 + }, + { + "epoch": 0.33873548028100436, + "grad_norm": 3.271039973533199, + "learning_rate": 9.994938583029112e-06, + "loss": 1.0951, + "step": 4870 + }, + { + "epoch": 0.33943103568199207, + "grad_norm": 3.616500160879331, + "learning_rate": 9.994754912516545e-06, + "loss": 1.1681, + "step": 4880 + }, + { + "epoch": 0.34012659108297977, + "grad_norm": 2.916880865242579, + "learning_rate": 9.99456797033038e-06, + "loss": 1.1068, + "step": 4890 + }, + { + "epoch": 0.34082214648396747, + "grad_norm": 3.8997554291449736, + "learning_rate": 9.994377756593069e-06, + "loss": 1.0289, + "step": 4900 + }, + { + "epoch": 0.3415177018849551, + "grad_norm": 3.509463857887857, + "learning_rate": 9.994184271429205e-06, + "loss": 1.0403, + "step": 4910 + }, + { + "epoch": 0.3422132572859428, + "grad_norm": 4.126334618907005, + "learning_rate": 9.99398751496553e-06, + "loss": 1.0319, + "step": 4920 + }, + { + "epoch": 0.3429088126869305, + "grad_norm": 3.9874188857999444, + "learning_rate": 9.993787487330915e-06, + "loss": 1.0608, + "step": 4930 + }, + { + "epoch": 0.3436043680879182, + "grad_norm": 6.0466707604176975, + "learning_rate": 9.993584188656389e-06, + "loss": 1.0595, + "step": 4940 + }, + { + "epoch": 0.34429992348890587, + "grad_norm": 5.199949838999424, + "learning_rate": 9.993377619075116e-06, + "loss": 1.0863, + "step": 4950 + }, + { + "epoch": 0.34499547888989357, + "grad_norm": 2.347384603695045, + "learning_rate": 9.993167778722402e-06, + "loss": 1.0494, + "step": 4960 + }, + { + "epoch": 0.34569103429088127, + "grad_norm": 2.779423642033522, + "learning_rate": 9.9929546677357e-06, + "loss": 1.1016, + "step": 4970 + }, + { + "epoch": 0.346386589691869, + "grad_norm": 2.4521298428673908, + "learning_rate": 9.992738286254599e-06, + "loss": 1.1112, + "step": 4980 + }, + { + "epoch": 0.3470821450928566, + "grad_norm": 2.6533236151801147, + "learning_rate": 9.992518634420834e-06, + "loss": 1.0829, + "step": 4990 + }, + { + "epoch": 0.3477777004938443, + "grad_norm": 8.076971871984446, + "learning_rate": 9.992295712378284e-06, + "loss": 1.1612, + "step": 5000 + }, + { + "epoch": 0.3477777004938443, + "eval_loss": 1.0821527242660522, + "eval_runtime": 1320.3483, + "eval_samples_per_second": 13.755, + "eval_steps_per_second": 2.293, + "step": 5000 + }, + { + "epoch": 0.348473255894832, + "grad_norm": 6.9797768712606825, + "learning_rate": 9.992069520272967e-06, + "loss": 1.1588, + "step": 5010 + }, + { + "epoch": 0.3491688112958197, + "grad_norm": 3.758689422133329, + "learning_rate": 9.991840058253044e-06, + "loss": 1.0648, + "step": 5020 + }, + { + "epoch": 0.3498643666968074, + "grad_norm": 5.252303696262916, + "learning_rate": 9.991607326468816e-06, + "loss": 1.1078, + "step": 5030 + }, + { + "epoch": 0.3505599220977951, + "grad_norm": 2.527281734051451, + "learning_rate": 9.991371325072727e-06, + "loss": 1.0969, + "step": 5040 + }, + { + "epoch": 0.3512554774987828, + "grad_norm": 3.562365905843142, + "learning_rate": 9.991132054219366e-06, + "loss": 1.0842, + "step": 5050 + }, + { + "epoch": 0.3519510328997705, + "grad_norm": 2.9303568001225835, + "learning_rate": 9.990889514065459e-06, + "loss": 1.0986, + "step": 5060 + }, + { + "epoch": 0.3526465883007582, + "grad_norm": 3.5333170388846873, + "learning_rate": 9.990643704769874e-06, + "loss": 1.0044, + "step": 5070 + }, + { + "epoch": 0.3533421437017458, + "grad_norm": 2.476950473867391, + "learning_rate": 9.990394626493622e-06, + "loss": 1.0831, + "step": 5080 + }, + { + "epoch": 0.3540376991027335, + "grad_norm": 2.693687614834126, + "learning_rate": 9.990142279399856e-06, + "loss": 1.0775, + "step": 5090 + }, + { + "epoch": 0.35473325450372123, + "grad_norm": 3.1443060911192418, + "learning_rate": 9.989886663653869e-06, + "loss": 1.0938, + "step": 5100 + }, + { + "epoch": 0.35542880990470893, + "grad_norm": 3.737833084276326, + "learning_rate": 9.989627779423095e-06, + "loss": 1.0968, + "step": 5110 + }, + { + "epoch": 0.3561243653056966, + "grad_norm": 5.156006727085497, + "learning_rate": 9.989365626877106e-06, + "loss": 1.0843, + "step": 5120 + }, + { + "epoch": 0.3568199207066843, + "grad_norm": 2.00220483367296, + "learning_rate": 9.98910020618762e-06, + "loss": 1.0969, + "step": 5130 + }, + { + "epoch": 0.357515476107672, + "grad_norm": 12.90994874080209, + "learning_rate": 9.988831517528494e-06, + "loss": 1.0881, + "step": 5140 + }, + { + "epoch": 0.3582110315086597, + "grad_norm": 9.302622065157742, + "learning_rate": 9.988559561075723e-06, + "loss": 1.0707, + "step": 5150 + }, + { + "epoch": 0.35890658690964733, + "grad_norm": 2.39505673506412, + "learning_rate": 9.988284337007445e-06, + "loss": 1.0854, + "step": 5160 + }, + { + "epoch": 0.35960214231063503, + "grad_norm": 2.880893651672593, + "learning_rate": 9.98800584550394e-06, + "loss": 1.0857, + "step": 5170 + }, + { + "epoch": 0.36029769771162273, + "grad_norm": 2.649035427546175, + "learning_rate": 9.987724086747622e-06, + "loss": 1.1082, + "step": 5180 + }, + { + "epoch": 0.36099325311261043, + "grad_norm": 2.6081112671319295, + "learning_rate": 9.987439060923052e-06, + "loss": 1.1122, + "step": 5190 + }, + { + "epoch": 0.3616888085135981, + "grad_norm": 3.3718748870733957, + "learning_rate": 9.987150768216926e-06, + "loss": 1.0991, + "step": 5200 + }, + { + "epoch": 0.3623843639145858, + "grad_norm": 3.5422841652736743, + "learning_rate": 9.986859208818086e-06, + "loss": 1.0247, + "step": 5210 + }, + { + "epoch": 0.3630799193155735, + "grad_norm": 2.8524246880138056, + "learning_rate": 9.986564382917505e-06, + "loss": 1.1148, + "step": 5220 + }, + { + "epoch": 0.3637754747165612, + "grad_norm": 2.0259439123712917, + "learning_rate": 9.986266290708304e-06, + "loss": 1.0444, + "step": 5230 + }, + { + "epoch": 0.3644710301175489, + "grad_norm": 2.086459979420223, + "learning_rate": 9.985964932385737e-06, + "loss": 1.0436, + "step": 5240 + }, + { + "epoch": 0.36516658551853654, + "grad_norm": 2.869741727872723, + "learning_rate": 9.985660308147202e-06, + "loss": 1.0722, + "step": 5250 + }, + { + "epoch": 0.36586214091952424, + "grad_norm": 2.9391744035280736, + "learning_rate": 9.985352418192236e-06, + "loss": 1.0239, + "step": 5260 + }, + { + "epoch": 0.36655769632051194, + "grad_norm": 2.844867168656337, + "learning_rate": 9.98504126272251e-06, + "loss": 1.0513, + "step": 5270 + }, + { + "epoch": 0.36725325172149964, + "grad_norm": 2.6122326876741777, + "learning_rate": 9.984726841941841e-06, + "loss": 1.0886, + "step": 5280 + }, + { + "epoch": 0.3679488071224873, + "grad_norm": 3.0566255140913334, + "learning_rate": 9.98440915605618e-06, + "loss": 1.0594, + "step": 5290 + }, + { + "epoch": 0.368644362523475, + "grad_norm": 3.4099709423729294, + "learning_rate": 9.984088205273617e-06, + "loss": 1.075, + "step": 5300 + }, + { + "epoch": 0.3693399179244627, + "grad_norm": 2.306380892927756, + "learning_rate": 9.983763989804384e-06, + "loss": 1.0983, + "step": 5310 + }, + { + "epoch": 0.3700354733254504, + "grad_norm": 2.1242800875876733, + "learning_rate": 9.983436509860847e-06, + "loss": 0.9991, + "step": 5320 + }, + { + "epoch": 0.37073102872643804, + "grad_norm": 2.8096194351264123, + "learning_rate": 9.983105765657514e-06, + "loss": 1.0654, + "step": 5330 + }, + { + "epoch": 0.37142658412742574, + "grad_norm": 5.296984254294115, + "learning_rate": 9.982771757411032e-06, + "loss": 1.0974, + "step": 5340 + }, + { + "epoch": 0.37212213952841344, + "grad_norm": 2.840467116724936, + "learning_rate": 9.982434485340178e-06, + "loss": 1.0787, + "step": 5350 + }, + { + "epoch": 0.37281769492940114, + "grad_norm": 6.87367479933938, + "learning_rate": 9.982093949665876e-06, + "loss": 1.1397, + "step": 5360 + }, + { + "epoch": 0.3735132503303888, + "grad_norm": 2.876873712574703, + "learning_rate": 9.981750150611187e-06, + "loss": 1.0773, + "step": 5370 + }, + { + "epoch": 0.3742088057313765, + "grad_norm": 2.5114466921193612, + "learning_rate": 9.9814030884013e-06, + "loss": 1.016, + "step": 5380 + }, + { + "epoch": 0.3749043611323642, + "grad_norm": 3.9246814354674764, + "learning_rate": 9.981052763263554e-06, + "loss": 1.0932, + "step": 5390 + }, + { + "epoch": 0.3755999165333519, + "grad_norm": 2.6058302822025383, + "learning_rate": 9.98069917542742e-06, + "loss": 1.0597, + "step": 5400 + }, + { + "epoch": 0.37629547193433954, + "grad_norm": 7.5108825233648115, + "learning_rate": 9.980342325124501e-06, + "loss": 1.0248, + "step": 5410 + }, + { + "epoch": 0.37699102733532724, + "grad_norm": 8.300288604540437, + "learning_rate": 9.979982212588544e-06, + "loss": 1.0962, + "step": 5420 + }, + { + "epoch": 0.37768658273631495, + "grad_norm": 2.7378090455833086, + "learning_rate": 9.97961883805543e-06, + "loss": 1.0781, + "step": 5430 + }, + { + "epoch": 0.37838213813730265, + "grad_norm": 2.5999481284069783, + "learning_rate": 9.979252201763182e-06, + "loss": 1.0174, + "step": 5440 + }, + { + "epoch": 0.37907769353829035, + "grad_norm": 2.4321320825547477, + "learning_rate": 9.978882303951948e-06, + "loss": 1.0706, + "step": 5450 + }, + { + "epoch": 0.379773248939278, + "grad_norm": 2.925559464115409, + "learning_rate": 9.978509144864024e-06, + "loss": 1.0363, + "step": 5460 + }, + { + "epoch": 0.3804688043402657, + "grad_norm": 2.0664488783531123, + "learning_rate": 9.978132724743835e-06, + "loss": 1.112, + "step": 5470 + }, + { + "epoch": 0.3811643597412534, + "grad_norm": 4.894824512545068, + "learning_rate": 9.977753043837944e-06, + "loss": 1.0701, + "step": 5480 + }, + { + "epoch": 0.3818599151422411, + "grad_norm": 5.412656846773486, + "learning_rate": 9.977370102395052e-06, + "loss": 1.0291, + "step": 5490 + }, + { + "epoch": 0.38255547054322875, + "grad_norm": 2.9690312329576356, + "learning_rate": 9.976983900665992e-06, + "loss": 1.0147, + "step": 5500 + }, + { + "epoch": 0.38255547054322875, + "eval_loss": 1.0726287364959717, + "eval_runtime": 1322.7365, + "eval_samples_per_second": 13.73, + "eval_steps_per_second": 2.288, + "step": 5500 + }, + { + "epoch": 0.38325102594421645, + "grad_norm": 18.477096039636073, + "learning_rate": 9.976594438903737e-06, + "loss": 1.0452, + "step": 5510 + }, + { + "epoch": 0.38394658134520415, + "grad_norm": 2.8903615959280406, + "learning_rate": 9.976201717363391e-06, + "loss": 1.1294, + "step": 5520 + }, + { + "epoch": 0.38464213674619185, + "grad_norm": 2.968339590420217, + "learning_rate": 9.975805736302198e-06, + "loss": 1.1126, + "step": 5530 + }, + { + "epoch": 0.3853376921471795, + "grad_norm": 6.804917389770404, + "learning_rate": 9.97540649597953e-06, + "loss": 1.0785, + "step": 5540 + }, + { + "epoch": 0.3860332475481672, + "grad_norm": 4.458045891734725, + "learning_rate": 9.9750039966569e-06, + "loss": 1.1046, + "step": 5550 + }, + { + "epoch": 0.3867288029491549, + "grad_norm": 2.963251059834069, + "learning_rate": 9.974598238597955e-06, + "loss": 1.0614, + "step": 5560 + }, + { + "epoch": 0.3874243583501426, + "grad_norm": 2.9174136462852522, + "learning_rate": 9.974189222068476e-06, + "loss": 1.0968, + "step": 5570 + }, + { + "epoch": 0.38811991375113025, + "grad_norm": 2.704417390354094, + "learning_rate": 9.973776947336373e-06, + "loss": 1.0225, + "step": 5580 + }, + { + "epoch": 0.38881546915211795, + "grad_norm": 2.085415891046657, + "learning_rate": 9.9733614146717e-06, + "loss": 1.044, + "step": 5590 + }, + { + "epoch": 0.38951102455310566, + "grad_norm": 6.2426474447048745, + "learning_rate": 9.97294262434664e-06, + "loss": 1.0666, + "step": 5600 + }, + { + "epoch": 0.39020657995409336, + "grad_norm": 2.5812313924461905, + "learning_rate": 9.972520576635505e-06, + "loss": 1.0459, + "step": 5610 + }, + { + "epoch": 0.39090213535508106, + "grad_norm": 2.131714195844742, + "learning_rate": 9.97209527181475e-06, + "loss": 0.9875, + "step": 5620 + }, + { + "epoch": 0.3915976907560687, + "grad_norm": 5.713629700955489, + "learning_rate": 9.971666710162957e-06, + "loss": 1.0933, + "step": 5630 + }, + { + "epoch": 0.3922932461570564, + "grad_norm": 2.7799328679138195, + "learning_rate": 9.971234891960844e-06, + "loss": 1.0548, + "step": 5640 + }, + { + "epoch": 0.3929888015580441, + "grad_norm": 2.725965658929567, + "learning_rate": 9.97079981749126e-06, + "loss": 1.0605, + "step": 5650 + }, + { + "epoch": 0.3936843569590318, + "grad_norm": 2.0788316350593177, + "learning_rate": 9.970361487039191e-06, + "loss": 1.0445, + "step": 5660 + }, + { + "epoch": 0.39437991236001946, + "grad_norm": 2.326017164914017, + "learning_rate": 9.969919900891752e-06, + "loss": 1.0667, + "step": 5670 + }, + { + "epoch": 0.39507546776100716, + "grad_norm": 2.312555513788338, + "learning_rate": 9.96947505933819e-06, + "loss": 1.1029, + "step": 5680 + }, + { + "epoch": 0.39577102316199486, + "grad_norm": 2.6776398615375934, + "learning_rate": 9.969026962669885e-06, + "loss": 1.1356, + "step": 5690 + }, + { + "epoch": 0.39646657856298256, + "grad_norm": 2.4241134060983884, + "learning_rate": 9.968575611180355e-06, + "loss": 1.0875, + "step": 5700 + }, + { + "epoch": 0.3971621339639702, + "grad_norm": 3.3345449034224925, + "learning_rate": 9.96812100516524e-06, + "loss": 1.0337, + "step": 5710 + }, + { + "epoch": 0.3978576893649579, + "grad_norm": 2.9645664669939205, + "learning_rate": 9.96766314492232e-06, + "loss": 1.0759, + "step": 5720 + }, + { + "epoch": 0.3985532447659456, + "grad_norm": 2.97452835545531, + "learning_rate": 9.967202030751501e-06, + "loss": 1.1452, + "step": 5730 + }, + { + "epoch": 0.3992488001669333, + "grad_norm": 5.426244012254976, + "learning_rate": 9.966737662954826e-06, + "loss": 1.0119, + "step": 5740 + }, + { + "epoch": 0.39994435556792096, + "grad_norm": 2.9670723324432062, + "learning_rate": 9.966270041836463e-06, + "loss": 1.0996, + "step": 5750 + }, + { + "epoch": 0.40063991096890866, + "grad_norm": 3.4720090909121075, + "learning_rate": 9.965799167702716e-06, + "loss": 1.1261, + "step": 5760 + }, + { + "epoch": 0.40133546636989637, + "grad_norm": 5.241535084091025, + "learning_rate": 9.965325040862019e-06, + "loss": 1.0886, + "step": 5770 + }, + { + "epoch": 0.40203102177088407, + "grad_norm": 4.417670841381001, + "learning_rate": 9.964847661624931e-06, + "loss": 1.0269, + "step": 5780 + }, + { + "epoch": 0.4027265771718717, + "grad_norm": 3.3531063997613098, + "learning_rate": 9.964367030304149e-06, + "loss": 1.0637, + "step": 5790 + }, + { + "epoch": 0.4034221325728594, + "grad_norm": 2.6325671928171768, + "learning_rate": 9.963883147214497e-06, + "loss": 1.0783, + "step": 5800 + }, + { + "epoch": 0.4041176879738471, + "grad_norm": 3.622457980101284, + "learning_rate": 9.963396012672928e-06, + "loss": 1.0691, + "step": 5810 + }, + { + "epoch": 0.4048132433748348, + "grad_norm": 5.45068275612328, + "learning_rate": 9.962905626998529e-06, + "loss": 1.0383, + "step": 5820 + }, + { + "epoch": 0.4055087987758225, + "grad_norm": 3.0878838781103117, + "learning_rate": 9.962411990512507e-06, + "loss": 1.0838, + "step": 5830 + }, + { + "epoch": 0.40620435417681017, + "grad_norm": 2.0704319239808795, + "learning_rate": 9.96191510353821e-06, + "loss": 0.9937, + "step": 5840 + }, + { + "epoch": 0.40689990957779787, + "grad_norm": 2.139628616068829, + "learning_rate": 9.961414966401109e-06, + "loss": 1.0554, + "step": 5850 + }, + { + "epoch": 0.40759546497878557, + "grad_norm": 3.267521385384432, + "learning_rate": 9.960911579428802e-06, + "loss": 1.0285, + "step": 5860 + }, + { + "epoch": 0.4082910203797733, + "grad_norm": 2.4868412624076255, + "learning_rate": 9.96040494295102e-06, + "loss": 1.0978, + "step": 5870 + }, + { + "epoch": 0.4089865757807609, + "grad_norm": 2.6560908283550386, + "learning_rate": 9.959895057299623e-06, + "loss": 1.0375, + "step": 5880 + }, + { + "epoch": 0.4096821311817486, + "grad_norm": 2.6212901938425053, + "learning_rate": 9.959381922808594e-06, + "loss": 1.0291, + "step": 5890 + }, + { + "epoch": 0.4103776865827363, + "grad_norm": 6.5214182703959755, + "learning_rate": 9.95886553981405e-06, + "loss": 1.0298, + "step": 5900 + }, + { + "epoch": 0.411073241983724, + "grad_norm": 3.3095568519454566, + "learning_rate": 9.958345908654232e-06, + "loss": 1.0438, + "step": 5910 + }, + { + "epoch": 0.41176879738471167, + "grad_norm": 2.1383651454735713, + "learning_rate": 9.957823029669509e-06, + "loss": 1.0038, + "step": 5920 + }, + { + "epoch": 0.4124643527856994, + "grad_norm": 3.842362796164072, + "learning_rate": 9.95729690320238e-06, + "loss": 1.0693, + "step": 5930 + }, + { + "epoch": 0.4131599081866871, + "grad_norm": 4.270426655818027, + "learning_rate": 9.956767529597466e-06, + "loss": 1.0935, + "step": 5940 + }, + { + "epoch": 0.4138554635876748, + "grad_norm": 4.063758577918255, + "learning_rate": 9.956234909201523e-06, + "loss": 1.0596, + "step": 5950 + }, + { + "epoch": 0.4145510189886624, + "grad_norm": 10.586710073208788, + "learning_rate": 9.955699042363425e-06, + "loss": 1.0255, + "step": 5960 + }, + { + "epoch": 0.4152465743896501, + "grad_norm": 3.0209727451208295, + "learning_rate": 9.955159929434178e-06, + "loss": 1.014, + "step": 5970 + }, + { + "epoch": 0.4159421297906378, + "grad_norm": 2.967741700925254, + "learning_rate": 9.954617570766913e-06, + "loss": 1.0637, + "step": 5980 + }, + { + "epoch": 0.41663768519162553, + "grad_norm": 4.485511945524776, + "learning_rate": 9.954071966716887e-06, + "loss": 0.9971, + "step": 5990 + }, + { + "epoch": 0.4173332405926132, + "grad_norm": 4.714500822743432, + "learning_rate": 9.953523117641482e-06, + "loss": 1.0298, + "step": 6000 + }, + { + "epoch": 0.4173332405926132, + "eval_loss": 1.0646781921386719, + "eval_runtime": 1322.1856, + "eval_samples_per_second": 13.736, + "eval_steps_per_second": 2.289, + "step": 6000 + }, + { + "epoch": 0.4180287959936009, + "grad_norm": 2.992046363817223, + "learning_rate": 9.952971023900207e-06, + "loss": 1.0727, + "step": 6010 + }, + { + "epoch": 0.4187243513945886, + "grad_norm": 4.082679323378325, + "learning_rate": 9.952415685854692e-06, + "loss": 1.0715, + "step": 6020 + }, + { + "epoch": 0.4194199067955763, + "grad_norm": 2.342809829289365, + "learning_rate": 9.9518571038687e-06, + "loss": 1.0782, + "step": 6030 + }, + { + "epoch": 0.420115462196564, + "grad_norm": 2.378735517647908, + "learning_rate": 9.951295278308113e-06, + "loss": 1.0915, + "step": 6040 + }, + { + "epoch": 0.42081101759755163, + "grad_norm": 3.522479519441757, + "learning_rate": 9.95073020954094e-06, + "loss": 1.0539, + "step": 6050 + }, + { + "epoch": 0.42150657299853933, + "grad_norm": 2.3000806016819264, + "learning_rate": 9.95016189793731e-06, + "loss": 1.0723, + "step": 6060 + }, + { + "epoch": 0.42220212839952703, + "grad_norm": 2.7331354092175406, + "learning_rate": 9.949590343869483e-06, + "loss": 1.0211, + "step": 6070 + }, + { + "epoch": 0.42289768380051473, + "grad_norm": 2.26380730669913, + "learning_rate": 9.949015547711836e-06, + "loss": 1.0716, + "step": 6080 + }, + { + "epoch": 0.4235932392015024, + "grad_norm": 3.752052693521689, + "learning_rate": 9.948437509840877e-06, + "loss": 1.0497, + "step": 6090 + }, + { + "epoch": 0.4242887946024901, + "grad_norm": 4.589810910186348, + "learning_rate": 9.947856230635228e-06, + "loss": 1.0602, + "step": 6100 + }, + { + "epoch": 0.4249843500034778, + "grad_norm": 4.036920147614985, + "learning_rate": 9.947271710475647e-06, + "loss": 1.08, + "step": 6110 + }, + { + "epoch": 0.4256799054044655, + "grad_norm": 6.502540930857871, + "learning_rate": 9.946683949745002e-06, + "loss": 1.0701, + "step": 6120 + }, + { + "epoch": 0.42637546080545313, + "grad_norm": 1.9528421896138686, + "learning_rate": 9.94609294882829e-06, + "loss": 1.1237, + "step": 6130 + }, + { + "epoch": 0.42707101620644083, + "grad_norm": 5.887049566403137, + "learning_rate": 9.945498708112632e-06, + "loss": 1.0884, + "step": 6140 + }, + { + "epoch": 0.42776657160742854, + "grad_norm": 3.3710079647301376, + "learning_rate": 9.944901227987264e-06, + "loss": 0.9738, + "step": 6150 + }, + { + "epoch": 0.42846212700841624, + "grad_norm": 4.087247482192941, + "learning_rate": 9.944300508843555e-06, + "loss": 1.0197, + "step": 6160 + }, + { + "epoch": 0.4291576824094039, + "grad_norm": 2.5631502240661828, + "learning_rate": 9.943696551074982e-06, + "loss": 1.0279, + "step": 6170 + }, + { + "epoch": 0.4298532378103916, + "grad_norm": 1.917903864605394, + "learning_rate": 9.943089355077156e-06, + "loss": 1.0095, + "step": 6180 + }, + { + "epoch": 0.4305487932113793, + "grad_norm": 2.14245625584192, + "learning_rate": 9.9424789212478e-06, + "loss": 1.0516, + "step": 6190 + }, + { + "epoch": 0.431244348612367, + "grad_norm": 13.673192856199007, + "learning_rate": 9.941865249986765e-06, + "loss": 1.001, + "step": 6200 + }, + { + "epoch": 0.43193990401335464, + "grad_norm": 3.8306909890166634, + "learning_rate": 9.941248341696017e-06, + "loss": 1.0402, + "step": 6210 + }, + { + "epoch": 0.43263545941434234, + "grad_norm": 3.031794286200291, + "learning_rate": 9.940628196779644e-06, + "loss": 1.0467, + "step": 6220 + }, + { + "epoch": 0.43333101481533004, + "grad_norm": 2.3274604232145895, + "learning_rate": 9.940004815643855e-06, + "loss": 1.0132, + "step": 6230 + }, + { + "epoch": 0.43402657021631774, + "grad_norm": 3.4498033879498142, + "learning_rate": 9.939378198696978e-06, + "loss": 1.0359, + "step": 6240 + }, + { + "epoch": 0.43472212561730544, + "grad_norm": 2.5508440478923, + "learning_rate": 9.938748346349463e-06, + "loss": 1.1183, + "step": 6250 + }, + { + "epoch": 0.4354176810182931, + "grad_norm": 2.306156966436083, + "learning_rate": 9.938115259013875e-06, + "loss": 1.0432, + "step": 6260 + }, + { + "epoch": 0.4361132364192808, + "grad_norm": 3.2790064724674517, + "learning_rate": 9.937478937104899e-06, + "loss": 1.0378, + "step": 6270 + }, + { + "epoch": 0.4368087918202685, + "grad_norm": 3.273467508238954, + "learning_rate": 9.936839381039341e-06, + "loss": 1.0903, + "step": 6280 + }, + { + "epoch": 0.4375043472212562, + "grad_norm": 3.308852069296277, + "learning_rate": 9.936196591236125e-06, + "loss": 1.0553, + "step": 6290 + }, + { + "epoch": 0.43819990262224384, + "grad_norm": 2.6023753561486207, + "learning_rate": 9.93555056811629e-06, + "loss": 1.0534, + "step": 6300 + }, + { + "epoch": 0.43889545802323154, + "grad_norm": 3.544704035763468, + "learning_rate": 9.934901312103001e-06, + "loss": 1.0892, + "step": 6310 + }, + { + "epoch": 0.43959101342421925, + "grad_norm": 2.810017259909896, + "learning_rate": 9.934248823621526e-06, + "loss": 1.0838, + "step": 6320 + }, + { + "epoch": 0.44028656882520695, + "grad_norm": 1.5982331364657276, + "learning_rate": 9.933593103099266e-06, + "loss": 0.9949, + "step": 6330 + }, + { + "epoch": 0.4409821242261946, + "grad_norm": 2.797080058688697, + "learning_rate": 9.93293415096573e-06, + "loss": 1.1022, + "step": 6340 + }, + { + "epoch": 0.4416776796271823, + "grad_norm": 2.2162992095415843, + "learning_rate": 9.932271967652547e-06, + "loss": 1.0636, + "step": 6350 + }, + { + "epoch": 0.44237323502817, + "grad_norm": 4.269913644587961, + "learning_rate": 9.93160655359346e-06, + "loss": 1.0981, + "step": 6360 + }, + { + "epoch": 0.4430687904291577, + "grad_norm": 3.8649391318570467, + "learning_rate": 9.93093790922433e-06, + "loss": 1.121, + "step": 6370 + }, + { + "epoch": 0.44376434583014535, + "grad_norm": 2.4325608125538003, + "learning_rate": 9.930266034983134e-06, + "loss": 1.0988, + "step": 6380 + }, + { + "epoch": 0.44445990123113305, + "grad_norm": 3.4015388852261275, + "learning_rate": 9.929590931309967e-06, + "loss": 1.0404, + "step": 6390 + }, + { + "epoch": 0.44515545663212075, + "grad_norm": 3.328543569363123, + "learning_rate": 9.92891259864703e-06, + "loss": 1.0433, + "step": 6400 + }, + { + "epoch": 0.44585101203310845, + "grad_norm": 2.364876310455947, + "learning_rate": 9.928231037438654e-06, + "loss": 1.0524, + "step": 6410 + }, + { + "epoch": 0.44654656743409615, + "grad_norm": 3.8791572221088244, + "learning_rate": 9.92754624813127e-06, + "loss": 1.1138, + "step": 6420 + }, + { + "epoch": 0.4472421228350838, + "grad_norm": 1.8972330838716853, + "learning_rate": 9.926858231173435e-06, + "loss": 1.0207, + "step": 6430 + }, + { + "epoch": 0.4479376782360715, + "grad_norm": 3.950879631191104, + "learning_rate": 9.92616698701581e-06, + "loss": 1.0228, + "step": 6440 + }, + { + "epoch": 0.4486332336370592, + "grad_norm": 4.479772290407714, + "learning_rate": 9.925472516111178e-06, + "loss": 1.0345, + "step": 6450 + }, + { + "epoch": 0.4493287890380469, + "grad_norm": 3.563401991284036, + "learning_rate": 9.92477481891443e-06, + "loss": 1.0817, + "step": 6460 + }, + { + "epoch": 0.45002434443903455, + "grad_norm": 3.6332035535437623, + "learning_rate": 9.924073895882579e-06, + "loss": 1.0787, + "step": 6470 + }, + { + "epoch": 0.45071989984002225, + "grad_norm": 3.0153256301254454, + "learning_rate": 9.923369747474738e-06, + "loss": 1.0537, + "step": 6480 + }, + { + "epoch": 0.45141545524100996, + "grad_norm": 2.6893653067932313, + "learning_rate": 9.922662374152144e-06, + "loss": 1.0907, + "step": 6490 + }, + { + "epoch": 0.45211101064199766, + "grad_norm": 2.8437010073133897, + "learning_rate": 9.92195177637814e-06, + "loss": 1.0667, + "step": 6500 + }, + { + "epoch": 0.45211101064199766, + "eval_loss": 1.0582948923110962, + "eval_runtime": 1324.4484, + "eval_samples_per_second": 13.712, + "eval_steps_per_second": 2.285, + "step": 6500 + }, + { + "epoch": 0.4528065660429853, + "grad_norm": 3.385306808908416, + "learning_rate": 9.921237954618184e-06, + "loss": 1.0077, + "step": 6510 + }, + { + "epoch": 0.453502121443973, + "grad_norm": 4.000698366659246, + "learning_rate": 9.920520909339843e-06, + "loss": 1.0169, + "step": 6520 + }, + { + "epoch": 0.4541976768449607, + "grad_norm": 2.4775834692965626, + "learning_rate": 9.9198006410128e-06, + "loss": 0.9836, + "step": 6530 + }, + { + "epoch": 0.4548932322459484, + "grad_norm": 2.372882518575469, + "learning_rate": 9.919077150108846e-06, + "loss": 1.0495, + "step": 6540 + }, + { + "epoch": 0.45558878764693606, + "grad_norm": 2.298416554862528, + "learning_rate": 9.91835043710188e-06, + "loss": 1.0759, + "step": 6550 + }, + { + "epoch": 0.45628434304792376, + "grad_norm": 1.921901491226923, + "learning_rate": 9.917620502467921e-06, + "loss": 1.0534, + "step": 6560 + }, + { + "epoch": 0.45697989844891146, + "grad_norm": 5.92030487792017, + "learning_rate": 9.916887346685087e-06, + "loss": 1.1, + "step": 6570 + }, + { + "epoch": 0.45767545384989916, + "grad_norm": 2.4802719393351134, + "learning_rate": 9.916150970233612e-06, + "loss": 1.0133, + "step": 6580 + }, + { + "epoch": 0.4583710092508868, + "grad_norm": 2.447611803865586, + "learning_rate": 9.91541137359584e-06, + "loss": 1.0482, + "step": 6590 + }, + { + "epoch": 0.4590665646518745, + "grad_norm": 3.666141726965627, + "learning_rate": 9.914668557256221e-06, + "loss": 1.0111, + "step": 6600 + }, + { + "epoch": 0.4597621200528622, + "grad_norm": 8.039369461011216, + "learning_rate": 9.913922521701318e-06, + "loss": 1.0092, + "step": 6610 + }, + { + "epoch": 0.4604576754538499, + "grad_norm": 7.484121307679558, + "learning_rate": 9.9131732674198e-06, + "loss": 1.0458, + "step": 6620 + }, + { + "epoch": 0.4611532308548376, + "grad_norm": 2.592244055119813, + "learning_rate": 9.912420794902445e-06, + "loss": 0.9944, + "step": 6630 + }, + { + "epoch": 0.46184878625582526, + "grad_norm": 3.0483663835434682, + "learning_rate": 9.911665104642138e-06, + "loss": 1.0522, + "step": 6640 + }, + { + "epoch": 0.46254434165681296, + "grad_norm": 2.85475221311468, + "learning_rate": 9.910906197133874e-06, + "loss": 1.0797, + "step": 6650 + }, + { + "epoch": 0.46323989705780066, + "grad_norm": 2.192331392952616, + "learning_rate": 9.910144072874753e-06, + "loss": 1.0377, + "step": 6660 + }, + { + "epoch": 0.46393545245878837, + "grad_norm": 2.4051219760258262, + "learning_rate": 9.909378732363982e-06, + "loss": 1.0936, + "step": 6670 + }, + { + "epoch": 0.464631007859776, + "grad_norm": 3.1942771327771293, + "learning_rate": 9.908610176102879e-06, + "loss": 1.0716, + "step": 6680 + }, + { + "epoch": 0.4653265632607637, + "grad_norm": 3.944453399010348, + "learning_rate": 9.907838404594863e-06, + "loss": 0.9693, + "step": 6690 + }, + { + "epoch": 0.4660221186617514, + "grad_norm": 4.836134929521629, + "learning_rate": 9.90706341834546e-06, + "loss": 1.0737, + "step": 6700 + }, + { + "epoch": 0.4667176740627391, + "grad_norm": 2.717449621733962, + "learning_rate": 9.906285217862306e-06, + "loss": 1.0914, + "step": 6710 + }, + { + "epoch": 0.46741322946372676, + "grad_norm": 3.4733485878121604, + "learning_rate": 9.905503803655137e-06, + "loss": 1.0831, + "step": 6720 + }, + { + "epoch": 0.46810878486471447, + "grad_norm": 3.1901273389616263, + "learning_rate": 9.904719176235797e-06, + "loss": 1.0855, + "step": 6730 + }, + { + "epoch": 0.46880434026570217, + "grad_norm": 2.833679579394415, + "learning_rate": 9.903931336118233e-06, + "loss": 0.9995, + "step": 6740 + }, + { + "epoch": 0.46949989566668987, + "grad_norm": 3.5442943645763263, + "learning_rate": 9.9031402838185e-06, + "loss": 1.0237, + "step": 6750 + }, + { + "epoch": 0.4701954510676775, + "grad_norm": 2.3418764411387794, + "learning_rate": 9.902346019854753e-06, + "loss": 1.0512, + "step": 6760 + }, + { + "epoch": 0.4708910064686652, + "grad_norm": 2.255312479582044, + "learning_rate": 9.901548544747252e-06, + "loss": 1.066, + "step": 6770 + }, + { + "epoch": 0.4715865618696529, + "grad_norm": 3.910982259077785, + "learning_rate": 9.900747859018362e-06, + "loss": 1.0092, + "step": 6780 + }, + { + "epoch": 0.4722821172706406, + "grad_norm": 7.739445676571755, + "learning_rate": 9.89994396319255e-06, + "loss": 0.9755, + "step": 6790 + }, + { + "epoch": 0.47297767267162827, + "grad_norm": 4.02938375648387, + "learning_rate": 9.899136857796381e-06, + "loss": 1.1058, + "step": 6800 + }, + { + "epoch": 0.47367322807261597, + "grad_norm": 3.378544899678043, + "learning_rate": 9.898326543358531e-06, + "loss": 1.0963, + "step": 6810 + }, + { + "epoch": 0.4743687834736037, + "grad_norm": 2.9571055617026327, + "learning_rate": 9.897513020409772e-06, + "loss": 1.0965, + "step": 6820 + }, + { + "epoch": 0.4750643388745914, + "grad_norm": 2.8147099528066053, + "learning_rate": 9.896696289482982e-06, + "loss": 0.9869, + "step": 6830 + }, + { + "epoch": 0.4757598942755791, + "grad_norm": 2.3174655993696773, + "learning_rate": 9.895876351113131e-06, + "loss": 1.0903, + "step": 6840 + }, + { + "epoch": 0.4764554496765667, + "grad_norm": 5.128886822822465, + "learning_rate": 9.895053205837305e-06, + "loss": 1.0292, + "step": 6850 + }, + { + "epoch": 0.4771510050775544, + "grad_norm": 2.2450244408640283, + "learning_rate": 9.894226854194676e-06, + "loss": 1.0618, + "step": 6860 + }, + { + "epoch": 0.4778465604785421, + "grad_norm": 3.101699340368222, + "learning_rate": 9.893397296726523e-06, + "loss": 1.075, + "step": 6870 + }, + { + "epoch": 0.47854211587952983, + "grad_norm": 90.75420222309641, + "learning_rate": 9.892564533976228e-06, + "loss": 1.101, + "step": 6880 + }, + { + "epoch": 0.4792376712805175, + "grad_norm": 4.366403292579255, + "learning_rate": 9.891728566489264e-06, + "loss": 1.1247, + "step": 6890 + }, + { + "epoch": 0.4799332266815052, + "grad_norm": 3.6070469761555914, + "learning_rate": 9.890889394813214e-06, + "loss": 1.0862, + "step": 6900 + }, + { + "epoch": 0.4806287820824929, + "grad_norm": 2.2100658210557924, + "learning_rate": 9.890047019497747e-06, + "loss": 1.0736, + "step": 6910 + }, + { + "epoch": 0.4813243374834806, + "grad_norm": 2.7722864439915633, + "learning_rate": 9.889201441094643e-06, + "loss": 1.0614, + "step": 6920 + }, + { + "epoch": 0.4820198928844682, + "grad_norm": 2.740879380482433, + "learning_rate": 9.888352660157769e-06, + "loss": 1.0238, + "step": 6930 + }, + { + "epoch": 0.48271544828545593, + "grad_norm": 2.4966690211509173, + "learning_rate": 9.887500677243099e-06, + "loss": 1.0299, + "step": 6940 + }, + { + "epoch": 0.48341100368644363, + "grad_norm": 2.88289485817156, + "learning_rate": 9.8866454929087e-06, + "loss": 1.0288, + "step": 6950 + }, + { + "epoch": 0.48410655908743133, + "grad_norm": 3.7780949132315547, + "learning_rate": 9.885787107714734e-06, + "loss": 1.0757, + "step": 6960 + }, + { + "epoch": 0.484802114488419, + "grad_norm": 2.958426431293008, + "learning_rate": 9.884925522223463e-06, + "loss": 1.0994, + "step": 6970 + }, + { + "epoch": 0.4854976698894067, + "grad_norm": 4.18274998685782, + "learning_rate": 9.884060736999249e-06, + "loss": 1.0298, + "step": 6980 + }, + { + "epoch": 0.4861932252903944, + "grad_norm": 2.773191893610502, + "learning_rate": 9.883192752608537e-06, + "loss": 1.0784, + "step": 6990 + }, + { + "epoch": 0.4868887806913821, + "grad_norm": 2.399843479201011, + "learning_rate": 9.882321569619882e-06, + "loss": 1.0409, + "step": 7000 + }, + { + "epoch": 0.4868887806913821, + "eval_loss": 1.054239273071289, + "eval_runtime": 1321.4863, + "eval_samples_per_second": 13.743, + "eval_steps_per_second": 2.291, + "step": 7000 + }, + { + "epoch": 0.48758433609236973, + "grad_norm": 2.3304706873186736, + "learning_rate": 9.881447188603926e-06, + "loss": 1.0771, + "step": 7010 + }, + { + "epoch": 0.48827989149335743, + "grad_norm": 2.4150046038184523, + "learning_rate": 9.880569610133406e-06, + "loss": 1.1118, + "step": 7020 + }, + { + "epoch": 0.48897544689434513, + "grad_norm": 6.774934639546385, + "learning_rate": 9.879688834783159e-06, + "loss": 1.0327, + "step": 7030 + }, + { + "epoch": 0.48967100229533284, + "grad_norm": 2.9144460198676363, + "learning_rate": 9.878804863130107e-06, + "loss": 1.0749, + "step": 7040 + }, + { + "epoch": 0.49036655769632054, + "grad_norm": 1.8224843144925664, + "learning_rate": 9.877917695753275e-06, + "loss": 1.0701, + "step": 7050 + }, + { + "epoch": 0.4910621130973082, + "grad_norm": 2.8335142619370473, + "learning_rate": 9.877027333233776e-06, + "loss": 1.048, + "step": 7060 + }, + { + "epoch": 0.4917576684982959, + "grad_norm": 4.795727718883155, + "learning_rate": 9.876133776154815e-06, + "loss": 1.0845, + "step": 7070 + }, + { + "epoch": 0.4924532238992836, + "grad_norm": 2.409792024595756, + "learning_rate": 9.875237025101694e-06, + "loss": 1.0724, + "step": 7080 + }, + { + "epoch": 0.4931487793002713, + "grad_norm": 1.9892366354445605, + "learning_rate": 9.874337080661802e-06, + "loss": 1.0628, + "step": 7090 + }, + { + "epoch": 0.49384433470125894, + "grad_norm": 2.66653643676216, + "learning_rate": 9.873433943424624e-06, + "loss": 1.0623, + "step": 7100 + }, + { + "epoch": 0.49453989010224664, + "grad_norm": 2.179285180761321, + "learning_rate": 9.872527613981735e-06, + "loss": 1.0648, + "step": 7110 + }, + { + "epoch": 0.49523544550323434, + "grad_norm": 16.426364722914293, + "learning_rate": 9.871618092926799e-06, + "loss": 1.0325, + "step": 7120 + }, + { + "epoch": 0.49593100090422204, + "grad_norm": 2.463729527778021, + "learning_rate": 9.870705380855573e-06, + "loss": 1.0397, + "step": 7130 + }, + { + "epoch": 0.4966265563052097, + "grad_norm": 2.5211472221932287, + "learning_rate": 9.869789478365904e-06, + "loss": 1.0155, + "step": 7140 + }, + { + "epoch": 0.4973221117061974, + "grad_norm": 1.9838050916531509, + "learning_rate": 9.868870386057727e-06, + "loss": 1.0482, + "step": 7150 + }, + { + "epoch": 0.4980176671071851, + "grad_norm": 2.00851013507589, + "learning_rate": 9.867948104533067e-06, + "loss": 1.0279, + "step": 7160 + }, + { + "epoch": 0.4987132225081728, + "grad_norm": 2.2529441796404637, + "learning_rate": 9.86702263439604e-06, + "loss": 1.0596, + "step": 7170 + }, + { + "epoch": 0.49940877790916044, + "grad_norm": 2.9783695513366295, + "learning_rate": 9.86609397625285e-06, + "loss": 1.1005, + "step": 7180 + }, + { + "epoch": 0.5001043333101481, + "grad_norm": 2.77613765052381, + "learning_rate": 9.865162130711786e-06, + "loss": 1.0708, + "step": 7190 + }, + { + "epoch": 0.5007998887111358, + "grad_norm": 2.076274613617754, + "learning_rate": 9.86422709838323e-06, + "loss": 1.0078, + "step": 7200 + }, + { + "epoch": 0.5014954441121235, + "grad_norm": 3.6912228417701427, + "learning_rate": 9.863288879879645e-06, + "loss": 1.0758, + "step": 7210 + }, + { + "epoch": 0.5021909995131112, + "grad_norm": 1.6829158933903898, + "learning_rate": 9.862347475815585e-06, + "loss": 1.0481, + "step": 7220 + }, + { + "epoch": 0.502886554914099, + "grad_norm": 14.955855198733087, + "learning_rate": 9.861402886807694e-06, + "loss": 0.9939, + "step": 7230 + }, + { + "epoch": 0.5035821103150866, + "grad_norm": 2.884151152790379, + "learning_rate": 9.860455113474697e-06, + "loss": 1.0956, + "step": 7240 + }, + { + "epoch": 0.5042776657160742, + "grad_norm": 3.103421364992511, + "learning_rate": 9.859504156437402e-06, + "loss": 1.0733, + "step": 7250 + }, + { + "epoch": 0.504973221117062, + "grad_norm": 2.576241067584382, + "learning_rate": 9.858550016318714e-06, + "loss": 1.0354, + "step": 7260 + }, + { + "epoch": 0.5056687765180496, + "grad_norm": 5.25491326307299, + "learning_rate": 9.85759269374361e-06, + "loss": 1.0309, + "step": 7270 + }, + { + "epoch": 0.5063643319190374, + "grad_norm": 2.9238567090476058, + "learning_rate": 9.856632189339157e-06, + "loss": 1.1056, + "step": 7280 + }, + { + "epoch": 0.507059887320025, + "grad_norm": 10.01785302286799, + "learning_rate": 9.85566850373451e-06, + "loss": 1.0237, + "step": 7290 + }, + { + "epoch": 0.5077554427210127, + "grad_norm": 2.795921050758609, + "learning_rate": 9.854701637560902e-06, + "loss": 1.0868, + "step": 7300 + }, + { + "epoch": 0.5084509981220005, + "grad_norm": 1.7965927309902272, + "learning_rate": 9.853731591451652e-06, + "loss": 1.0023, + "step": 7310 + }, + { + "epoch": 0.5091465535229881, + "grad_norm": 1.6664719453139656, + "learning_rate": 9.852758366042161e-06, + "loss": 1.0123, + "step": 7320 + }, + { + "epoch": 0.5098421089239757, + "grad_norm": 2.4238981622108158, + "learning_rate": 9.851781961969913e-06, + "loss": 1.0902, + "step": 7330 + }, + { + "epoch": 0.5105376643249635, + "grad_norm": 4.061617869630927, + "learning_rate": 9.850802379874476e-06, + "loss": 1.0524, + "step": 7340 + }, + { + "epoch": 0.5112332197259511, + "grad_norm": 3.9759608771156687, + "learning_rate": 9.849819620397494e-06, + "loss": 1.0589, + "step": 7350 + }, + { + "epoch": 0.5119287751269389, + "grad_norm": 3.513342176462833, + "learning_rate": 9.848833684182698e-06, + "loss": 1.0561, + "step": 7360 + }, + { + "epoch": 0.5126243305279266, + "grad_norm": 2.058396588834767, + "learning_rate": 9.847844571875898e-06, + "loss": 1.0438, + "step": 7370 + }, + { + "epoch": 0.5133198859289142, + "grad_norm": 3.0793568179386126, + "learning_rate": 9.846852284124982e-06, + "loss": 1.0831, + "step": 7380 + }, + { + "epoch": 0.514015441329902, + "grad_norm": 5.816395575635206, + "learning_rate": 9.845856821579922e-06, + "loss": 1.0615, + "step": 7390 + }, + { + "epoch": 0.5147109967308896, + "grad_norm": 2.272976759211305, + "learning_rate": 9.844858184892769e-06, + "loss": 1.0174, + "step": 7400 + }, + { + "epoch": 0.5154065521318772, + "grad_norm": 2.8752265190974846, + "learning_rate": 9.84385637471765e-06, + "loss": 1.0603, + "step": 7410 + }, + { + "epoch": 0.516102107532865, + "grad_norm": 2.0546359939942134, + "learning_rate": 9.842851391710772e-06, + "loss": 1.0574, + "step": 7420 + }, + { + "epoch": 0.5167976629338527, + "grad_norm": 2.570738309973232, + "learning_rate": 9.841843236530424e-06, + "loss": 1.1295, + "step": 7430 + }, + { + "epoch": 0.5174932183348404, + "grad_norm": 2.0655473641140216, + "learning_rate": 9.840831909836965e-06, + "loss": 1.0666, + "step": 7440 + }, + { + "epoch": 0.5181887737358281, + "grad_norm": 2.0787874386224816, + "learning_rate": 9.83981741229284e-06, + "loss": 1.0589, + "step": 7450 + }, + { + "epoch": 0.5188843291368157, + "grad_norm": 3.6567009236805146, + "learning_rate": 9.838799744562564e-06, + "loss": 1.0069, + "step": 7460 + }, + { + "epoch": 0.5195798845378035, + "grad_norm": 2.322906892998821, + "learning_rate": 9.837778907312735e-06, + "loss": 1.0677, + "step": 7470 + }, + { + "epoch": 0.5202754399387911, + "grad_norm": 2.5994306375942537, + "learning_rate": 9.836754901212022e-06, + "loss": 1.0356, + "step": 7480 + }, + { + "epoch": 0.5209709953397789, + "grad_norm": 2.501514441466175, + "learning_rate": 9.83572772693117e-06, + "loss": 1.0551, + "step": 7490 + }, + { + "epoch": 0.5216665507407665, + "grad_norm": 3.6146473714914977, + "learning_rate": 9.834697385143002e-06, + "loss": 1.0719, + "step": 7500 + }, + { + "epoch": 0.5216665507407665, + "eval_loss": 1.0500108003616333, + "eval_runtime": 1323.8393, + "eval_samples_per_second": 13.718, + "eval_steps_per_second": 2.287, + "step": 7500 + }, + { + "epoch": 0.5223621061417542, + "grad_norm": 2.879022153110598, + "learning_rate": 9.833663876522415e-06, + "loss": 1.0892, + "step": 7510 + }, + { + "epoch": 0.5230576615427419, + "grad_norm": 3.1578531606759506, + "learning_rate": 9.832627201746377e-06, + "loss": 1.0341, + "step": 7520 + }, + { + "epoch": 0.5237532169437296, + "grad_norm": 2.331093204003521, + "learning_rate": 9.831587361493936e-06, + "loss": 1.0277, + "step": 7530 + }, + { + "epoch": 0.5244487723447172, + "grad_norm": 2.7454491806981753, + "learning_rate": 9.830544356446208e-06, + "loss": 1.0654, + "step": 7540 + }, + { + "epoch": 0.525144327745705, + "grad_norm": 2.620510564450273, + "learning_rate": 9.829498187286385e-06, + "loss": 1.1125, + "step": 7550 + }, + { + "epoch": 0.5258398831466926, + "grad_norm": 2.454256147927303, + "learning_rate": 9.828448854699732e-06, + "loss": 1.081, + "step": 7560 + }, + { + "epoch": 0.5265354385476804, + "grad_norm": 3.5055301953409264, + "learning_rate": 9.827396359373582e-06, + "loss": 1.0425, + "step": 7570 + }, + { + "epoch": 0.527230993948668, + "grad_norm": 2.186714145558953, + "learning_rate": 9.826340701997343e-06, + "loss": 1.1011, + "step": 7580 + }, + { + "epoch": 0.5279265493496557, + "grad_norm": 2.318885779115785, + "learning_rate": 9.825281883262497e-06, + "loss": 1.0655, + "step": 7590 + }, + { + "epoch": 0.5286221047506434, + "grad_norm": 3.176363778239221, + "learning_rate": 9.824219903862587e-06, + "loss": 1.0232, + "step": 7600 + }, + { + "epoch": 0.5293176601516311, + "grad_norm": 3.2074211683765586, + "learning_rate": 9.823154764493237e-06, + "loss": 1.0573, + "step": 7610 + }, + { + "epoch": 0.5300132155526188, + "grad_norm": 2.6661335527030325, + "learning_rate": 9.822086465852138e-06, + "loss": 1.0528, + "step": 7620 + }, + { + "epoch": 0.5307087709536065, + "grad_norm": 3.5004020588641636, + "learning_rate": 9.821015008639046e-06, + "loss": 1.094, + "step": 7630 + }, + { + "epoch": 0.5314043263545941, + "grad_norm": 3.337335748016239, + "learning_rate": 9.819940393555788e-06, + "loss": 1.031, + "step": 7640 + }, + { + "epoch": 0.5320998817555819, + "grad_norm": 2.255983720371493, + "learning_rate": 9.818862621306264e-06, + "loss": 1.0793, + "step": 7650 + }, + { + "epoch": 0.5327954371565695, + "grad_norm": 3.056374712171867, + "learning_rate": 9.817781692596438e-06, + "loss": 1.0958, + "step": 7660 + }, + { + "epoch": 0.5334909925575572, + "grad_norm": 5.399676754067343, + "learning_rate": 9.816697608134339e-06, + "loss": 1.0171, + "step": 7670 + }, + { + "epoch": 0.5341865479585449, + "grad_norm": 2.3084453823916604, + "learning_rate": 9.815610368630065e-06, + "loss": 1.0661, + "step": 7680 + }, + { + "epoch": 0.5348821033595326, + "grad_norm": 1.8573607696722092, + "learning_rate": 9.814519974795786e-06, + "loss": 1.0141, + "step": 7690 + }, + { + "epoch": 0.5355776587605203, + "grad_norm": 2.461776835656508, + "learning_rate": 9.813426427345733e-06, + "loss": 1.0296, + "step": 7700 + }, + { + "epoch": 0.536273214161508, + "grad_norm": 3.054508669117471, + "learning_rate": 9.812329726996202e-06, + "loss": 1.0, + "step": 7710 + }, + { + "epoch": 0.5369687695624956, + "grad_norm": 7.415985582789118, + "learning_rate": 9.811229874465554e-06, + "loss": 0.9876, + "step": 7720 + }, + { + "epoch": 0.5376643249634834, + "grad_norm": 1.926165949970257, + "learning_rate": 9.810126870474219e-06, + "loss": 1.0461, + "step": 7730 + }, + { + "epoch": 0.538359880364471, + "grad_norm": 2.0832318264089347, + "learning_rate": 9.80902071574469e-06, + "loss": 1.165, + "step": 7740 + }, + { + "epoch": 0.5390554357654587, + "grad_norm": 1.5739295739316188, + "learning_rate": 9.807911411001518e-06, + "loss": 0.976, + "step": 7750 + }, + { + "epoch": 0.5397509911664464, + "grad_norm": 2.8810779630208536, + "learning_rate": 9.806798956971327e-06, + "loss": 1.0637, + "step": 7760 + }, + { + "epoch": 0.5404465465674341, + "grad_norm": 3.446255290320476, + "learning_rate": 9.805683354382795e-06, + "loss": 1.1381, + "step": 7770 + }, + { + "epoch": 0.5411421019684218, + "grad_norm": 2.258971492391677, + "learning_rate": 9.80456460396667e-06, + "loss": 1.0714, + "step": 7780 + }, + { + "epoch": 0.5418376573694095, + "grad_norm": 4.325420002554537, + "learning_rate": 9.803442706455751e-06, + "loss": 1.069, + "step": 7790 + }, + { + "epoch": 0.5425332127703971, + "grad_norm": 3.9201539235544787, + "learning_rate": 9.802317662584912e-06, + "loss": 1.0489, + "step": 7800 + }, + { + "epoch": 0.5432287681713849, + "grad_norm": 2.8728716710864, + "learning_rate": 9.801189473091078e-06, + "loss": 1.0741, + "step": 7810 + }, + { + "epoch": 0.5439243235723725, + "grad_norm": 2.5817333650328016, + "learning_rate": 9.80005813871324e-06, + "loss": 1.0174, + "step": 7820 + }, + { + "epoch": 0.5446198789733603, + "grad_norm": 2.310619300228487, + "learning_rate": 9.798923660192444e-06, + "loss": 1.0691, + "step": 7830 + }, + { + "epoch": 0.5453154343743479, + "grad_norm": 2.6351173217868986, + "learning_rate": 9.797786038271801e-06, + "loss": 1.0049, + "step": 7840 + }, + { + "epoch": 0.5460109897753356, + "grad_norm": 2.447924739416393, + "learning_rate": 9.796645273696476e-06, + "loss": 1.0491, + "step": 7850 + }, + { + "epoch": 0.5467065451763233, + "grad_norm": 3.3662233517699627, + "learning_rate": 9.795501367213696e-06, + "loss": 1.0633, + "step": 7860 + }, + { + "epoch": 0.547402100577311, + "grad_norm": 3.5874298169770142, + "learning_rate": 9.794354319572742e-06, + "loss": 1.077, + "step": 7870 + }, + { + "epoch": 0.5480976559782986, + "grad_norm": 2.554244109403647, + "learning_rate": 9.793204131524961e-06, + "loss": 1.013, + "step": 7880 + }, + { + "epoch": 0.5487932113792864, + "grad_norm": 1.750292042471757, + "learning_rate": 9.792050803823747e-06, + "loss": 1.0274, + "step": 7890 + }, + { + "epoch": 0.549488766780274, + "grad_norm": 4.38209287739139, + "learning_rate": 9.790894337224555e-06, + "loss": 1.0548, + "step": 7900 + }, + { + "epoch": 0.5501843221812618, + "grad_norm": 3.740941856307656, + "learning_rate": 9.789734732484897e-06, + "loss": 1.0985, + "step": 7910 + }, + { + "epoch": 0.5508798775822494, + "grad_norm": 3.2922691318911674, + "learning_rate": 9.78857199036434e-06, + "loss": 0.9997, + "step": 7920 + }, + { + "epoch": 0.5515754329832371, + "grad_norm": 20.8819848928327, + "learning_rate": 9.787406111624504e-06, + "loss": 1.0236, + "step": 7930 + }, + { + "epoch": 0.5522709883842248, + "grad_norm": 2.1899541566985676, + "learning_rate": 9.786237097029065e-06, + "loss": 1.0118, + "step": 7940 + }, + { + "epoch": 0.5529665437852125, + "grad_norm": 8.356636957063943, + "learning_rate": 9.785064947343754e-06, + "loss": 1.0388, + "step": 7950 + }, + { + "epoch": 0.5536620991862001, + "grad_norm": 2.355741016077469, + "learning_rate": 9.783889663336356e-06, + "loss": 1.0246, + "step": 7960 + }, + { + "epoch": 0.5543576545871879, + "grad_norm": 1.7803851669485373, + "learning_rate": 9.782711245776703e-06, + "loss": 1.032, + "step": 7970 + }, + { + "epoch": 0.5550532099881755, + "grad_norm": 2.1268537745184912, + "learning_rate": 9.78152969543669e-06, + "loss": 1.0796, + "step": 7980 + }, + { + "epoch": 0.5557487653891633, + "grad_norm": 2.59888485709724, + "learning_rate": 9.780345013090255e-06, + "loss": 1.0543, + "step": 7990 + }, + { + "epoch": 0.5564443207901509, + "grad_norm": 2.5084177001876617, + "learning_rate": 9.779157199513392e-06, + "loss": 1.0292, + "step": 8000 + }, + { + "epoch": 0.5564443207901509, + "eval_loss": 1.047345519065857, + "eval_runtime": 1319.7527, + "eval_samples_per_second": 13.761, + "eval_steps_per_second": 2.294, + "step": 8000 + }, + { + "epoch": 0.5571398761911386, + "grad_norm": 2.380723254895733, + "learning_rate": 9.777966255484143e-06, + "loss": 1.0486, + "step": 8010 + }, + { + "epoch": 0.5578354315921263, + "grad_norm": 2.096499391308046, + "learning_rate": 9.776772181782604e-06, + "loss": 1.024, + "step": 8020 + }, + { + "epoch": 0.558530986993114, + "grad_norm": 5.579201690925586, + "learning_rate": 9.775574979190918e-06, + "loss": 1.0676, + "step": 8030 + }, + { + "epoch": 0.5592265423941017, + "grad_norm": 7.583034979991965, + "learning_rate": 9.774374648493281e-06, + "loss": 1.0621, + "step": 8040 + }, + { + "epoch": 0.5599220977950894, + "grad_norm": 2.112205675903228, + "learning_rate": 9.773171190475935e-06, + "loss": 1.0406, + "step": 8050 + }, + { + "epoch": 0.560617653196077, + "grad_norm": 2.9545571674336064, + "learning_rate": 9.77196460592717e-06, + "loss": 0.9558, + "step": 8060 + }, + { + "epoch": 0.5613132085970648, + "grad_norm": 3.414834919441167, + "learning_rate": 9.770754895637328e-06, + "loss": 0.9934, + "step": 8070 + }, + { + "epoch": 0.5620087639980524, + "grad_norm": 2.098085337714327, + "learning_rate": 9.769542060398794e-06, + "loss": 1.0457, + "step": 8080 + }, + { + "epoch": 0.5627043193990401, + "grad_norm": 2.6640921984897625, + "learning_rate": 9.768326101006e-06, + "loss": 1.084, + "step": 8090 + }, + { + "epoch": 0.5633998748000278, + "grad_norm": 2.8902895419010735, + "learning_rate": 9.767107018255428e-06, + "loss": 1.0902, + "step": 8100 + }, + { + "epoch": 0.5640954302010155, + "grad_norm": 2.658597971941556, + "learning_rate": 9.765884812945603e-06, + "loss": 1.0988, + "step": 8110 + }, + { + "epoch": 0.5647909856020032, + "grad_norm": 1.887285334802991, + "learning_rate": 9.764659485877095e-06, + "loss": 0.9785, + "step": 8120 + }, + { + "epoch": 0.5654865410029909, + "grad_norm": 3.068248219600333, + "learning_rate": 9.763431037852524e-06, + "loss": 1.0323, + "step": 8130 + }, + { + "epoch": 0.5661820964039785, + "grad_norm": 1.9405231246674146, + "learning_rate": 9.762199469676547e-06, + "loss": 1.0444, + "step": 8140 + }, + { + "epoch": 0.5668776518049663, + "grad_norm": 2.638098667809521, + "learning_rate": 9.760964782155868e-06, + "loss": 1.0907, + "step": 8150 + }, + { + "epoch": 0.567573207205954, + "grad_norm": 3.970136272040161, + "learning_rate": 9.759726976099237e-06, + "loss": 1.0628, + "step": 8160 + }, + { + "epoch": 0.5682687626069416, + "grad_norm": 6.742071248837605, + "learning_rate": 9.75848605231744e-06, + "loss": 1.0591, + "step": 8170 + }, + { + "epoch": 0.5689643180079293, + "grad_norm": 1.7269465031091806, + "learning_rate": 9.757242011623313e-06, + "loss": 1.0442, + "step": 8180 + }, + { + "epoch": 0.569659873408917, + "grad_norm": 2.640366825209659, + "learning_rate": 9.755994854831727e-06, + "loss": 1.066, + "step": 8190 + }, + { + "epoch": 0.5703554288099048, + "grad_norm": 3.7876405156901494, + "learning_rate": 9.754744582759598e-06, + "loss": 1.0059, + "step": 8200 + }, + { + "epoch": 0.5710509842108924, + "grad_norm": 4.259126623881852, + "learning_rate": 9.753491196225883e-06, + "loss": 1.0282, + "step": 8210 + }, + { + "epoch": 0.57174653961188, + "grad_norm": 2.362742963416628, + "learning_rate": 9.752234696051577e-06, + "loss": 1.0042, + "step": 8220 + }, + { + "epoch": 0.5724420950128678, + "grad_norm": 2.8813431154614224, + "learning_rate": 9.750975083059712e-06, + "loss": 1.0354, + "step": 8230 + }, + { + "epoch": 0.5731376504138554, + "grad_norm": 2.1053355562163745, + "learning_rate": 9.749712358075366e-06, + "loss": 1.0911, + "step": 8240 + }, + { + "epoch": 0.5738332058148432, + "grad_norm": 1.9605683532745135, + "learning_rate": 9.748446521925648e-06, + "loss": 0.9925, + "step": 8250 + }, + { + "epoch": 0.5745287612158309, + "grad_norm": 2.6071405497830464, + "learning_rate": 9.747177575439713e-06, + "loss": 1.0036, + "step": 8260 + }, + { + "epoch": 0.5752243166168185, + "grad_norm": 2.90897554028933, + "learning_rate": 9.745905519448743e-06, + "loss": 1.1036, + "step": 8270 + }, + { + "epoch": 0.5759198720178063, + "grad_norm": 3.8634597051849107, + "learning_rate": 9.744630354785967e-06, + "loss": 1.0406, + "step": 8280 + }, + { + "epoch": 0.5766154274187939, + "grad_norm": 1.9218697041269124, + "learning_rate": 9.743352082286641e-06, + "loss": 1.0202, + "step": 8290 + }, + { + "epoch": 0.5773109828197815, + "grad_norm": 2.1424444086070094, + "learning_rate": 9.742070702788067e-06, + "loss": 1.0083, + "step": 8300 + }, + { + "epoch": 0.5780065382207693, + "grad_norm": 3.0191697488317892, + "learning_rate": 9.740786217129574e-06, + "loss": 0.9882, + "step": 8310 + }, + { + "epoch": 0.578702093621757, + "grad_norm": 1.9639045729511668, + "learning_rate": 9.739498626152528e-06, + "loss": 0.9831, + "step": 8320 + }, + { + "epoch": 0.5793976490227447, + "grad_norm": 2.4577686776761953, + "learning_rate": 9.73820793070033e-06, + "loss": 1.1131, + "step": 8330 + }, + { + "epoch": 0.5800932044237324, + "grad_norm": 1.9755642203958492, + "learning_rate": 9.736914131618412e-06, + "loss": 1.0144, + "step": 8340 + }, + { + "epoch": 0.58078875982472, + "grad_norm": 3.8516897080276395, + "learning_rate": 9.73561722975424e-06, + "loss": 1.0396, + "step": 8350 + }, + { + "epoch": 0.5814843152257078, + "grad_norm": 2.825387612705438, + "learning_rate": 9.734317225957317e-06, + "loss": 1.0051, + "step": 8360 + }, + { + "epoch": 0.5821798706266954, + "grad_norm": 2.9384051452227147, + "learning_rate": 9.73301412107917e-06, + "loss": 0.9869, + "step": 8370 + }, + { + "epoch": 0.582875426027683, + "grad_norm": 2.20103542561724, + "learning_rate": 9.731707915973365e-06, + "loss": 0.9954, + "step": 8380 + }, + { + "epoch": 0.5835709814286708, + "grad_norm": 4.327244375771762, + "learning_rate": 9.730398611495492e-06, + "loss": 1.0548, + "step": 8390 + }, + { + "epoch": 0.5842665368296585, + "grad_norm": 1.8093775159519636, + "learning_rate": 9.729086208503174e-06, + "loss": 1.0411, + "step": 8400 + }, + { + "epoch": 0.5849620922306462, + "grad_norm": 2.785440832799896, + "learning_rate": 9.727770707856066e-06, + "loss": 1.0282, + "step": 8410 + }, + { + "epoch": 0.5856576476316339, + "grad_norm": 2.80385145956926, + "learning_rate": 9.726452110415847e-06, + "loss": 0.9969, + "step": 8420 + }, + { + "epoch": 0.5863532030326215, + "grad_norm": 141.97574589443073, + "learning_rate": 9.725130417046228e-06, + "loss": 0.9656, + "step": 8430 + }, + { + "epoch": 0.5870487584336093, + "grad_norm": 3.2745928382692777, + "learning_rate": 9.723805628612947e-06, + "loss": 1.0116, + "step": 8440 + }, + { + "epoch": 0.5877443138345969, + "grad_norm": 3.151221174418198, + "learning_rate": 9.72247774598377e-06, + "loss": 0.9809, + "step": 8450 + }, + { + "epoch": 0.5884398692355847, + "grad_norm": 1.8636929912146467, + "learning_rate": 9.721146770028489e-06, + "loss": 0.9344, + "step": 8460 + }, + { + "epoch": 0.5891354246365723, + "grad_norm": 3.4688360506000473, + "learning_rate": 9.719812701618921e-06, + "loss": 1.0304, + "step": 8470 + }, + { + "epoch": 0.58983098003756, + "grad_norm": 2.6242649095268074, + "learning_rate": 9.718475541628913e-06, + "loss": 1.0835, + "step": 8480 + }, + { + "epoch": 0.5905265354385477, + "grad_norm": 2.728913996806294, + "learning_rate": 9.71713529093433e-06, + "loss": 1.0606, + "step": 8490 + }, + { + "epoch": 0.5912220908395354, + "grad_norm": 1.8027928948437077, + "learning_rate": 9.71579195041307e-06, + "loss": 1.0183, + "step": 8500 + }, + { + "epoch": 0.5912220908395354, + "eval_loss": 1.0405319929122925, + "eval_runtime": 1322.8776, + "eval_samples_per_second": 13.728, + "eval_steps_per_second": 2.288, + "step": 8500 + }, + { + "epoch": 0.591917646240523, + "grad_norm": 2.8556584874599817, + "learning_rate": 9.714445520945045e-06, + "loss": 0.9658, + "step": 8510 + }, + { + "epoch": 0.5926132016415108, + "grad_norm": 2.8296826053889523, + "learning_rate": 9.7130960034122e-06, + "loss": 1.0412, + "step": 8520 + }, + { + "epoch": 0.5933087570424984, + "grad_norm": 1.8682802512051044, + "learning_rate": 9.711743398698496e-06, + "loss": 0.9793, + "step": 8530 + }, + { + "epoch": 0.5940043124434862, + "grad_norm": 2.784069262869355, + "learning_rate": 9.710387707689923e-06, + "loss": 1.0526, + "step": 8540 + }, + { + "epoch": 0.5946998678444738, + "grad_norm": 4.029957447205754, + "learning_rate": 9.709028931274482e-06, + "loss": 1.011, + "step": 8550 + }, + { + "epoch": 0.5953954232454615, + "grad_norm": 2.7271177859481663, + "learning_rate": 9.707667070342205e-06, + "loss": 1.0419, + "step": 8560 + }, + { + "epoch": 0.5960909786464492, + "grad_norm": 2.914663306524725, + "learning_rate": 9.706302125785139e-06, + "loss": 1.0407, + "step": 8570 + }, + { + "epoch": 0.5967865340474369, + "grad_norm": 2.0460284706631713, + "learning_rate": 9.704934098497356e-06, + "loss": 1.0047, + "step": 8580 + }, + { + "epoch": 0.5974820894484245, + "grad_norm": 4.309134370104355, + "learning_rate": 9.70356298937494e-06, + "loss": 0.9925, + "step": 8590 + }, + { + "epoch": 0.5981776448494123, + "grad_norm": 2.269808480699976, + "learning_rate": 9.702188799315997e-06, + "loss": 1.02, + "step": 8600 + }, + { + "epoch": 0.5988732002503999, + "grad_norm": 2.951830659406277, + "learning_rate": 9.700811529220653e-06, + "loss": 1.0013, + "step": 8610 + }, + { + "epoch": 0.5995687556513877, + "grad_norm": 7.008881954127553, + "learning_rate": 9.699431179991053e-06, + "loss": 1.0699, + "step": 8620 + }, + { + "epoch": 0.6002643110523753, + "grad_norm": 3.6939562256103975, + "learning_rate": 9.69804775253135e-06, + "loss": 0.9531, + "step": 8630 + }, + { + "epoch": 0.600959866453363, + "grad_norm": 1.9792111974346873, + "learning_rate": 9.696661247747723e-06, + "loss": 1.0262, + "step": 8640 + }, + { + "epoch": 0.6016554218543507, + "grad_norm": 2.1592293063413868, + "learning_rate": 9.695271666548362e-06, + "loss": 0.9794, + "step": 8650 + }, + { + "epoch": 0.6023509772553384, + "grad_norm": 2.068214640199246, + "learning_rate": 9.693879009843475e-06, + "loss": 0.9578, + "step": 8660 + }, + { + "epoch": 0.6030465326563261, + "grad_norm": 7.813700617205559, + "learning_rate": 9.69248327854528e-06, + "loss": 1.0436, + "step": 8670 + }, + { + "epoch": 0.6037420880573138, + "grad_norm": 2.580281708107549, + "learning_rate": 9.69108447356801e-06, + "loss": 1.0287, + "step": 8680 + }, + { + "epoch": 0.6044376434583014, + "grad_norm": 6.42280432681602, + "learning_rate": 9.689682595827919e-06, + "loss": 0.9743, + "step": 8690 + }, + { + "epoch": 0.6051331988592892, + "grad_norm": 2.649296759199103, + "learning_rate": 9.68827764624326e-06, + "loss": 1.0123, + "step": 8700 + }, + { + "epoch": 0.6058287542602768, + "grad_norm": 2.5485562935545865, + "learning_rate": 9.686869625734311e-06, + "loss": 1.0226, + "step": 8710 + }, + { + "epoch": 0.6065243096612645, + "grad_norm": 1.7553931327826786, + "learning_rate": 9.685458535223356e-06, + "loss": 1.0332, + "step": 8720 + }, + { + "epoch": 0.6072198650622522, + "grad_norm": 2.000709292652354, + "learning_rate": 9.684044375634687e-06, + "loss": 1.0257, + "step": 8730 + }, + { + "epoch": 0.6079154204632399, + "grad_norm": 2.46279154972546, + "learning_rate": 9.68262714789461e-06, + "loss": 1.0817, + "step": 8740 + }, + { + "epoch": 0.6086109758642276, + "grad_norm": 2.30622411851755, + "learning_rate": 9.681206852931442e-06, + "loss": 1.0929, + "step": 8750 + }, + { + "epoch": 0.6093065312652153, + "grad_norm": 3.293914131182769, + "learning_rate": 9.679783491675507e-06, + "loss": 1.0342, + "step": 8760 + }, + { + "epoch": 0.6100020866662029, + "grad_norm": 2.1737286681974317, + "learning_rate": 9.678357065059136e-06, + "loss": 1.0088, + "step": 8770 + }, + { + "epoch": 0.6106976420671907, + "grad_norm": 2.6101872275389075, + "learning_rate": 9.676927574016672e-06, + "loss": 1.0271, + "step": 8780 + }, + { + "epoch": 0.6113931974681783, + "grad_norm": 2.4656464123455306, + "learning_rate": 9.67549501948446e-06, + "loss": 1.0244, + "step": 8790 + }, + { + "epoch": 0.612088752869166, + "grad_norm": 11.59604123335081, + "learning_rate": 9.674059402400858e-06, + "loss": 1.0523, + "step": 8800 + }, + { + "epoch": 0.6127843082701537, + "grad_norm": 3.278023988534184, + "learning_rate": 9.672620723706223e-06, + "loss": 1.0811, + "step": 8810 + }, + { + "epoch": 0.6134798636711414, + "grad_norm": 7.551069231815679, + "learning_rate": 9.671178984342924e-06, + "loss": 1.0278, + "step": 8820 + }, + { + "epoch": 0.6141754190721291, + "grad_norm": 4.832046160794693, + "learning_rate": 9.669734185255331e-06, + "loss": 1.0526, + "step": 8830 + }, + { + "epoch": 0.6148709744731168, + "grad_norm": 4.113412390097194, + "learning_rate": 9.668286327389817e-06, + "loss": 1.0408, + "step": 8840 + }, + { + "epoch": 0.6155665298741044, + "grad_norm": 2.3694806391978847, + "learning_rate": 9.666835411694761e-06, + "loss": 1.0565, + "step": 8850 + }, + { + "epoch": 0.6162620852750922, + "grad_norm": 2.3134748288490004, + "learning_rate": 9.665381439120547e-06, + "loss": 1.1054, + "step": 8860 + }, + { + "epoch": 0.6169576406760798, + "grad_norm": 2.729173476192093, + "learning_rate": 9.663924410619556e-06, + "loss": 1.0665, + "step": 8870 + }, + { + "epoch": 0.6176531960770676, + "grad_norm": 2.330179996925811, + "learning_rate": 9.662464327146177e-06, + "loss": 1.022, + "step": 8880 + }, + { + "epoch": 0.6183487514780552, + "grad_norm": 3.001126215129907, + "learning_rate": 9.661001189656793e-06, + "loss": 1.0768, + "step": 8890 + }, + { + "epoch": 0.6190443068790429, + "grad_norm": 2.261374581485111, + "learning_rate": 9.659534999109792e-06, + "loss": 0.9821, + "step": 8900 + }, + { + "epoch": 0.6197398622800306, + "grad_norm": 2.4041153266216706, + "learning_rate": 9.658065756465563e-06, + "loss": 1.0715, + "step": 8910 + }, + { + "epoch": 0.6204354176810183, + "grad_norm": 3.166882291925204, + "learning_rate": 9.656593462686488e-06, + "loss": 1.0132, + "step": 8920 + }, + { + "epoch": 0.6211309730820059, + "grad_norm": 1.991687617919289, + "learning_rate": 9.655118118736954e-06, + "loss": 1.0602, + "step": 8930 + }, + { + "epoch": 0.6218265284829937, + "grad_norm": 2.985680110079251, + "learning_rate": 9.653639725583344e-06, + "loss": 1.1133, + "step": 8940 + }, + { + "epoch": 0.6225220838839813, + "grad_norm": 2.6604958490224764, + "learning_rate": 9.652158284194035e-06, + "loss": 1.075, + "step": 8950 + }, + { + "epoch": 0.6232176392849691, + "grad_norm": 1.8255910711452994, + "learning_rate": 9.650673795539409e-06, + "loss": 1.0583, + "step": 8960 + }, + { + "epoch": 0.6239131946859567, + "grad_norm": 3.0346600826607637, + "learning_rate": 9.649186260591833e-06, + "loss": 1.0519, + "step": 8970 + }, + { + "epoch": 0.6246087500869444, + "grad_norm": 2.419605006930221, + "learning_rate": 9.647695680325678e-06, + "loss": 1.013, + "step": 8980 + }, + { + "epoch": 0.6253043054879321, + "grad_norm": 1.8794123906843772, + "learning_rate": 9.646202055717304e-06, + "loss": 1.0354, + "step": 8990 + }, + { + "epoch": 0.6259998608889198, + "grad_norm": 2.29466452865579, + "learning_rate": 9.64470538774507e-06, + "loss": 1.0075, + "step": 9000 + }, + { + "epoch": 0.6259998608889198, + "eval_loss": 1.0422266721725464, + "eval_runtime": 1321.6472, + "eval_samples_per_second": 13.741, + "eval_steps_per_second": 2.29, + "step": 9000 + }, + { + "epoch": 0.6266954162899074, + "grad_norm": 3.261387549077601, + "learning_rate": 9.643205677389327e-06, + "loss": 1.0516, + "step": 9010 + }, + { + "epoch": 0.6273909716908952, + "grad_norm": 2.135585211677677, + "learning_rate": 9.641702925632418e-06, + "loss": 1.0353, + "step": 9020 + }, + { + "epoch": 0.6280865270918828, + "grad_norm": 3.0967602203030036, + "learning_rate": 9.640197133458674e-06, + "loss": 1.0082, + "step": 9030 + }, + { + "epoch": 0.6287820824928706, + "grad_norm": 7.277940701376121, + "learning_rate": 9.638688301854425e-06, + "loss": 0.9865, + "step": 9040 + }, + { + "epoch": 0.6294776378938582, + "grad_norm": 2.3165650859067153, + "learning_rate": 9.637176431807989e-06, + "loss": 1.0529, + "step": 9050 + }, + { + "epoch": 0.6301731932948459, + "grad_norm": 2.3273254923508877, + "learning_rate": 9.635661524309672e-06, + "loss": 0.9718, + "step": 9060 + }, + { + "epoch": 0.6308687486958336, + "grad_norm": 2.8994318751931663, + "learning_rate": 9.634143580351775e-06, + "loss": 1.0075, + "step": 9070 + }, + { + "epoch": 0.6315643040968213, + "grad_norm": 9.761165437625658, + "learning_rate": 9.63262260092858e-06, + "loss": 1.0064, + "step": 9080 + }, + { + "epoch": 0.632259859497809, + "grad_norm": 3.5735279564663607, + "learning_rate": 9.631098587036367e-06, + "loss": 1.0296, + "step": 9090 + }, + { + "epoch": 0.6329554148987967, + "grad_norm": 5.154414016334504, + "learning_rate": 9.629571539673392e-06, + "loss": 1.1059, + "step": 9100 + }, + { + "epoch": 0.6336509702997843, + "grad_norm": 2.1201537929462693, + "learning_rate": 9.62804145983991e-06, + "loss": 1.0216, + "step": 9110 + }, + { + "epoch": 0.6343465257007721, + "grad_norm": 4.2793719083041335, + "learning_rate": 9.626508348538153e-06, + "loss": 0.9657, + "step": 9120 + }, + { + "epoch": 0.6350420811017597, + "grad_norm": 11.508030152662005, + "learning_rate": 9.624972206772345e-06, + "loss": 1.0711, + "step": 9130 + }, + { + "epoch": 0.6357376365027474, + "grad_norm": 2.871542728220115, + "learning_rate": 9.62343303554869e-06, + "loss": 1.0876, + "step": 9140 + }, + { + "epoch": 0.6364331919037352, + "grad_norm": 4.420204487920676, + "learning_rate": 9.621890835875383e-06, + "loss": 1.1124, + "step": 9150 + }, + { + "epoch": 0.6371287473047228, + "grad_norm": 2.615936702734315, + "learning_rate": 9.620345608762593e-06, + "loss": 0.9879, + "step": 9160 + }, + { + "epoch": 0.6378243027057106, + "grad_norm": 2.8024536412933116, + "learning_rate": 9.61879735522248e-06, + "loss": 1.0115, + "step": 9170 + }, + { + "epoch": 0.6385198581066982, + "grad_norm": 3.378411699050181, + "learning_rate": 9.617246076269184e-06, + "loss": 1.0084, + "step": 9180 + }, + { + "epoch": 0.6392154135076858, + "grad_norm": 7.694532962803173, + "learning_rate": 9.615691772918829e-06, + "loss": 0.9565, + "step": 9190 + }, + { + "epoch": 0.6399109689086736, + "grad_norm": 2.6329474643542565, + "learning_rate": 9.614134446189512e-06, + "loss": 1.0265, + "step": 9200 + }, + { + "epoch": 0.6406065243096613, + "grad_norm": 4.881408815311737, + "learning_rate": 9.612574097101322e-06, + "loss": 1.0444, + "step": 9210 + }, + { + "epoch": 0.6413020797106489, + "grad_norm": 1.8156655098762537, + "learning_rate": 9.611010726676317e-06, + "loss": 1.0377, + "step": 9220 + }, + { + "epoch": 0.6419976351116367, + "grad_norm": 2.367588597878744, + "learning_rate": 9.609444335938542e-06, + "loss": 1.0212, + "step": 9230 + }, + { + "epoch": 0.6426931905126243, + "grad_norm": 2.335941469387071, + "learning_rate": 9.607874925914016e-06, + "loss": 0.987, + "step": 9240 + }, + { + "epoch": 0.6433887459136121, + "grad_norm": 3.039616527704691, + "learning_rate": 9.606302497630735e-06, + "loss": 1.0319, + "step": 9250 + }, + { + "epoch": 0.6440843013145997, + "grad_norm": 4.021367835182951, + "learning_rate": 9.604727052118678e-06, + "loss": 1.0074, + "step": 9260 + }, + { + "epoch": 0.6447798567155874, + "grad_norm": 14.06139461749761, + "learning_rate": 9.603148590409794e-06, + "loss": 1.006, + "step": 9270 + }, + { + "epoch": 0.6454754121165751, + "grad_norm": 2.5758801419637183, + "learning_rate": 9.601567113538008e-06, + "loss": 1.0253, + "step": 9280 + }, + { + "epoch": 0.6461709675175628, + "grad_norm": 2.0403594158378917, + "learning_rate": 9.599982622539225e-06, + "loss": 1.0304, + "step": 9290 + }, + { + "epoch": 0.6468665229185505, + "grad_norm": 2.2868358441566072, + "learning_rate": 9.59839511845132e-06, + "loss": 1.0133, + "step": 9300 + }, + { + "epoch": 0.6475620783195382, + "grad_norm": 2.628409990512463, + "learning_rate": 9.596804602314141e-06, + "loss": 0.998, + "step": 9310 + }, + { + "epoch": 0.6482576337205258, + "grad_norm": 2.5492475308308857, + "learning_rate": 9.595211075169515e-06, + "loss": 1.02, + "step": 9320 + }, + { + "epoch": 0.6489531891215136, + "grad_norm": 6.2706919120697435, + "learning_rate": 9.593614538061233e-06, + "loss": 1.0613, + "step": 9330 + }, + { + "epoch": 0.6496487445225012, + "grad_norm": 2.811791971358467, + "learning_rate": 9.592014992035065e-06, + "loss": 1.0621, + "step": 9340 + }, + { + "epoch": 0.6503442999234889, + "grad_norm": 2.6135530022916047, + "learning_rate": 9.590412438138746e-06, + "loss": 0.955, + "step": 9350 + }, + { + "epoch": 0.6510398553244766, + "grad_norm": 2.456978215648236, + "learning_rate": 9.588806877421986e-06, + "loss": 0.9985, + "step": 9360 + }, + { + "epoch": 0.6517354107254643, + "grad_norm": 2.3976065147825265, + "learning_rate": 9.58719831093646e-06, + "loss": 1.0326, + "step": 9370 + }, + { + "epoch": 0.652430966126452, + "grad_norm": 1.7983066385905468, + "learning_rate": 9.585586739735815e-06, + "loss": 0.9682, + "step": 9380 + }, + { + "epoch": 0.6531265215274397, + "grad_norm": 10.69675603658846, + "learning_rate": 9.583972164875668e-06, + "loss": 1.0322, + "step": 9390 + }, + { + "epoch": 0.6538220769284273, + "grad_norm": 3.9336248038980837, + "learning_rate": 9.582354587413596e-06, + "loss": 1.043, + "step": 9400 + }, + { + "epoch": 0.6545176323294151, + "grad_norm": 3.6467454641323047, + "learning_rate": 9.580734008409151e-06, + "loss": 1.0716, + "step": 9410 + }, + { + "epoch": 0.6552131877304027, + "grad_norm": 2.2691484095417533, + "learning_rate": 9.579110428923847e-06, + "loss": 1.0898, + "step": 9420 + }, + { + "epoch": 0.6559087431313905, + "grad_norm": 3.4990459626526897, + "learning_rate": 9.577483850021164e-06, + "loss": 1.0572, + "step": 9430 + }, + { + "epoch": 0.6566042985323781, + "grad_norm": 3.008236357596094, + "learning_rate": 9.575854272766547e-06, + "loss": 0.9934, + "step": 9440 + }, + { + "epoch": 0.6572998539333658, + "grad_norm": 2.692625989820183, + "learning_rate": 9.574221698227403e-06, + "loss": 1.0495, + "step": 9450 + }, + { + "epoch": 0.6579954093343535, + "grad_norm": 4.38734857012432, + "learning_rate": 9.572586127473106e-06, + "loss": 1.0565, + "step": 9460 + }, + { + "epoch": 0.6586909647353412, + "grad_norm": 2.5284170623782556, + "learning_rate": 9.57094756157499e-06, + "loss": 0.9338, + "step": 9470 + }, + { + "epoch": 0.6593865201363288, + "grad_norm": 3.307600589189606, + "learning_rate": 9.56930600160635e-06, + "loss": 1.0025, + "step": 9480 + }, + { + "epoch": 0.6600820755373166, + "grad_norm": 2.561257578184538, + "learning_rate": 9.567661448642447e-06, + "loss": 1.0701, + "step": 9490 + }, + { + "epoch": 0.6607776309383042, + "grad_norm": 2.130781036166668, + "learning_rate": 9.566013903760496e-06, + "loss": 1.0497, + "step": 9500 + }, + { + "epoch": 0.6607776309383042, + "eval_loss": 1.0318341255187988, + "eval_runtime": 1323.0071, + "eval_samples_per_second": 13.727, + "eval_steps_per_second": 2.288, + "step": 9500 + }, + { + "epoch": 0.661473186339292, + "grad_norm": 2.490698501121074, + "learning_rate": 9.564363368039675e-06, + "loss": 1.033, + "step": 9510 + }, + { + "epoch": 0.6621687417402796, + "grad_norm": 1.705058055859062, + "learning_rate": 9.562709842561124e-06, + "loss": 1.0308, + "step": 9520 + }, + { + "epoch": 0.6628642971412673, + "grad_norm": 1.8381863578595816, + "learning_rate": 9.561053328407934e-06, + "loss": 1.0337, + "step": 9530 + }, + { + "epoch": 0.663559852542255, + "grad_norm": 2.8387279634415576, + "learning_rate": 9.55939382666516e-06, + "loss": 1.0268, + "step": 9540 + }, + { + "epoch": 0.6642554079432427, + "grad_norm": 2.2949386214493135, + "learning_rate": 9.557731338419815e-06, + "loss": 1.0495, + "step": 9550 + }, + { + "epoch": 0.6649509633442303, + "grad_norm": 3.6010386727632158, + "learning_rate": 9.55606586476086e-06, + "loss": 1.0484, + "step": 9560 + }, + { + "epoch": 0.6656465187452181, + "grad_norm": 2.813797047340269, + "learning_rate": 9.554397406779219e-06, + "loss": 1.0785, + "step": 9570 + }, + { + "epoch": 0.6663420741462057, + "grad_norm": 2.7125021500506366, + "learning_rate": 9.552725965567769e-06, + "loss": 1.0178, + "step": 9580 + }, + { + "epoch": 0.6670376295471935, + "grad_norm": 3.0108244144253433, + "learning_rate": 9.55105154222134e-06, + "loss": 1.0622, + "step": 9590 + }, + { + "epoch": 0.6677331849481811, + "grad_norm": 2.65661635841504, + "learning_rate": 9.549374137836714e-06, + "loss": 1.0678, + "step": 9600 + }, + { + "epoch": 0.6684287403491688, + "grad_norm": 3.7426087980639506, + "learning_rate": 9.54769375351263e-06, + "loss": 1.0695, + "step": 9610 + }, + { + "epoch": 0.6691242957501565, + "grad_norm": 1.9773975400745272, + "learning_rate": 9.546010390349778e-06, + "loss": 1.0495, + "step": 9620 + }, + { + "epoch": 0.6698198511511442, + "grad_norm": 4.933966085203939, + "learning_rate": 9.544324049450791e-06, + "loss": 0.9701, + "step": 9630 + }, + { + "epoch": 0.6705154065521319, + "grad_norm": 2.4221213679575597, + "learning_rate": 9.542634731920266e-06, + "loss": 1.0294, + "step": 9640 + }, + { + "epoch": 0.6712109619531196, + "grad_norm": 3.289871965476106, + "learning_rate": 9.54094243886474e-06, + "loss": 1.0121, + "step": 9650 + }, + { + "epoch": 0.6719065173541072, + "grad_norm": 2.0994929259096025, + "learning_rate": 9.539247171392702e-06, + "loss": 1.0328, + "step": 9660 + }, + { + "epoch": 0.672602072755095, + "grad_norm": 4.150664555606297, + "learning_rate": 9.53754893061459e-06, + "loss": 1.0893, + "step": 9670 + }, + { + "epoch": 0.6732976281560826, + "grad_norm": 2.7792552296560524, + "learning_rate": 9.535847717642787e-06, + "loss": 1.02, + "step": 9680 + }, + { + "epoch": 0.6739931835570703, + "grad_norm": 2.7286021687297963, + "learning_rate": 9.534143533591627e-06, + "loss": 1.0279, + "step": 9690 + }, + { + "epoch": 0.674688738958058, + "grad_norm": 2.224039871887424, + "learning_rate": 9.532436379577387e-06, + "loss": 1.0137, + "step": 9700 + }, + { + "epoch": 0.6753842943590457, + "grad_norm": 5.566764334876452, + "learning_rate": 9.53072625671829e-06, + "loss": 0.993, + "step": 9710 + }, + { + "epoch": 0.6760798497600334, + "grad_norm": 2.5724727325861276, + "learning_rate": 9.529013166134505e-06, + "loss": 0.9653, + "step": 9720 + }, + { + "epoch": 0.6767754051610211, + "grad_norm": 2.87214334573511, + "learning_rate": 9.527297108948139e-06, + "loss": 1.0563, + "step": 9730 + }, + { + "epoch": 0.6774709605620087, + "grad_norm": 2.1241182938694974, + "learning_rate": 9.525578086283252e-06, + "loss": 1.0687, + "step": 9740 + }, + { + "epoch": 0.6781665159629965, + "grad_norm": 2.1104452397086164, + "learning_rate": 9.523856099265841e-06, + "loss": 1.0266, + "step": 9750 + }, + { + "epoch": 0.6788620713639841, + "grad_norm": 2.4092834319665504, + "learning_rate": 9.522131149023844e-06, + "loss": 0.9538, + "step": 9760 + }, + { + "epoch": 0.6795576267649718, + "grad_norm": 5.951761004831045, + "learning_rate": 9.52040323668714e-06, + "loss": 1.0209, + "step": 9770 + }, + { + "epoch": 0.6802531821659595, + "grad_norm": 4.332022897232842, + "learning_rate": 9.51867236338755e-06, + "loss": 0.9763, + "step": 9780 + }, + { + "epoch": 0.6809487375669472, + "grad_norm": 2.7677660952526595, + "learning_rate": 9.516938530258835e-06, + "loss": 1.0095, + "step": 9790 + }, + { + "epoch": 0.6816442929679349, + "grad_norm": 3.4989155864942965, + "learning_rate": 9.515201738436692e-06, + "loss": 1.0464, + "step": 9800 + }, + { + "epoch": 0.6823398483689226, + "grad_norm": 2.807837668406322, + "learning_rate": 9.51346198905876e-06, + "loss": 1.0045, + "step": 9810 + }, + { + "epoch": 0.6830354037699102, + "grad_norm": 2.3338358666832635, + "learning_rate": 9.51171928326461e-06, + "loss": 1.0109, + "step": 9820 + }, + { + "epoch": 0.683730959170898, + "grad_norm": 10.599042910960145, + "learning_rate": 9.509973622195754e-06, + "loss": 0.9701, + "step": 9830 + }, + { + "epoch": 0.6844265145718856, + "grad_norm": 2.4375187486558194, + "learning_rate": 9.508225006995638e-06, + "loss": 0.9983, + "step": 9840 + }, + { + "epoch": 0.6851220699728734, + "grad_norm": 2.7599796895511934, + "learning_rate": 9.506473438809642e-06, + "loss": 1.021, + "step": 9850 + }, + { + "epoch": 0.685817625373861, + "grad_norm": 2.3513850554386746, + "learning_rate": 9.504718918785084e-06, + "loss": 0.9633, + "step": 9860 + }, + { + "epoch": 0.6865131807748487, + "grad_norm": 2.074616865137769, + "learning_rate": 9.50296144807121e-06, + "loss": 0.9602, + "step": 9870 + }, + { + "epoch": 0.6872087361758364, + "grad_norm": 1.6456396175818244, + "learning_rate": 9.501201027819204e-06, + "loss": 0.965, + "step": 9880 + }, + { + "epoch": 0.6879042915768241, + "grad_norm": 3.380273811613186, + "learning_rate": 9.499437659182179e-06, + "loss": 1.0396, + "step": 9890 + }, + { + "epoch": 0.6885998469778117, + "grad_norm": 2.11989841786958, + "learning_rate": 9.497671343315177e-06, + "loss": 0.9943, + "step": 9900 + }, + { + "epoch": 0.6892954023787995, + "grad_norm": 2.0173000251341273, + "learning_rate": 9.49590208137518e-06, + "loss": 1.0796, + "step": 9910 + }, + { + "epoch": 0.6899909577797871, + "grad_norm": 2.496397529736902, + "learning_rate": 9.494129874521088e-06, + "loss": 1.0844, + "step": 9920 + }, + { + "epoch": 0.6906865131807749, + "grad_norm": 3.4954089922781404, + "learning_rate": 9.492354723913737e-06, + "loss": 0.9717, + "step": 9930 + }, + { + "epoch": 0.6913820685817625, + "grad_norm": 5.018795290750774, + "learning_rate": 9.490576630715889e-06, + "loss": 1.0044, + "step": 9940 + }, + { + "epoch": 0.6920776239827502, + "grad_norm": 2.6416611091806317, + "learning_rate": 9.488795596092233e-06, + "loss": 1.0823, + "step": 9950 + }, + { + "epoch": 0.692773179383738, + "grad_norm": 4.86353549731523, + "learning_rate": 9.487011621209387e-06, + "loss": 1.0131, + "step": 9960 + }, + { + "epoch": 0.6934687347847256, + "grad_norm": 2.7769227487112693, + "learning_rate": 9.485224707235895e-06, + "loss": 1.0497, + "step": 9970 + }, + { + "epoch": 0.6941642901857132, + "grad_norm": 2.0319303142455336, + "learning_rate": 9.48343485534222e-06, + "loss": 1.0234, + "step": 9980 + }, + { + "epoch": 0.694859845586701, + "grad_norm": 2.152874685025489, + "learning_rate": 9.481642066700759e-06, + "loss": 1.0242, + "step": 9990 + }, + { + "epoch": 0.6955554009876886, + "grad_norm": 2.289996198356572, + "learning_rate": 9.479846342485823e-06, + "loss": 0.9945, + "step": 10000 + }, + { + "epoch": 0.6955554009876886, + "eval_loss": 1.0294498205184937, + "eval_runtime": 1322.7015, + "eval_samples_per_second": 13.73, + "eval_steps_per_second": 2.288, + "step": 10000 + }, + { + "epoch": 0.6962509563886764, + "grad_norm": 2.5960848991039747, + "learning_rate": 9.478047683873656e-06, + "loss": 1.0322, + "step": 10010 + }, + { + "epoch": 0.696946511789664, + "grad_norm": 2.348821312009709, + "learning_rate": 9.476246092042413e-06, + "loss": 1.0681, + "step": 10020 + }, + { + "epoch": 0.6976420671906517, + "grad_norm": 2.335240932631854, + "learning_rate": 9.474441568172182e-06, + "loss": 0.9773, + "step": 10030 + }, + { + "epoch": 0.6983376225916395, + "grad_norm": 2.4367037752898466, + "learning_rate": 9.472634113444962e-06, + "loss": 1.0357, + "step": 10040 + }, + { + "epoch": 0.6990331779926271, + "grad_norm": 2.4970009284930983, + "learning_rate": 9.470823729044675e-06, + "loss": 1.0117, + "step": 10050 + }, + { + "epoch": 0.6997287333936149, + "grad_norm": 3.5606115320211633, + "learning_rate": 9.469010416157163e-06, + "loss": 1.0187, + "step": 10060 + }, + { + "epoch": 0.7004242887946025, + "grad_norm": 2.2660179751765384, + "learning_rate": 9.467194175970187e-06, + "loss": 1.0126, + "step": 10070 + }, + { + "epoch": 0.7011198441955901, + "grad_norm": 2.9907586143025413, + "learning_rate": 9.46537500967342e-06, + "loss": 1.0165, + "step": 10080 + }, + { + "epoch": 0.7018153995965779, + "grad_norm": 2.825006591655886, + "learning_rate": 9.463552918458463e-06, + "loss": 1.0482, + "step": 10090 + }, + { + "epoch": 0.7025109549975656, + "grad_norm": 3.308017266022384, + "learning_rate": 9.461727903518818e-06, + "loss": 1.0007, + "step": 10100 + }, + { + "epoch": 0.7032065103985532, + "grad_norm": 2.6975402982239576, + "learning_rate": 9.459899966049912e-06, + "loss": 0.9731, + "step": 10110 + }, + { + "epoch": 0.703902065799541, + "grad_norm": 1.8667765751261165, + "learning_rate": 9.458069107249086e-06, + "loss": 1.1018, + "step": 10120 + }, + { + "epoch": 0.7045976212005286, + "grad_norm": 2.2552795630403937, + "learning_rate": 9.456235328315591e-06, + "loss": 1.0715, + "step": 10130 + }, + { + "epoch": 0.7052931766015164, + "grad_norm": 2.5184726886281203, + "learning_rate": 9.454398630450592e-06, + "loss": 1.001, + "step": 10140 + }, + { + "epoch": 0.705988732002504, + "grad_norm": 5.538561070194269, + "learning_rate": 9.452559014857167e-06, + "loss": 1.0181, + "step": 10150 + }, + { + "epoch": 0.7066842874034917, + "grad_norm": 2.2833275678253835, + "learning_rate": 9.450716482740304e-06, + "loss": 1.0135, + "step": 10160 + }, + { + "epoch": 0.7073798428044794, + "grad_norm": 2.506824709500818, + "learning_rate": 9.4488710353069e-06, + "loss": 0.9874, + "step": 10170 + }, + { + "epoch": 0.708075398205467, + "grad_norm": 1.7390299774379567, + "learning_rate": 9.447022673765768e-06, + "loss": 1.049, + "step": 10180 + }, + { + "epoch": 0.7087709536064547, + "grad_norm": 3.6440476553741696, + "learning_rate": 9.445171399327621e-06, + "loss": 0.9983, + "step": 10190 + }, + { + "epoch": 0.7094665090074425, + "grad_norm": 2.3624602050888175, + "learning_rate": 9.443317213205086e-06, + "loss": 0.9829, + "step": 10200 + }, + { + "epoch": 0.7101620644084301, + "grad_norm": 3.240213859484688, + "learning_rate": 9.441460116612694e-06, + "loss": 1.0377, + "step": 10210 + }, + { + "epoch": 0.7108576198094179, + "grad_norm": 2.7363837209951383, + "learning_rate": 9.439600110766887e-06, + "loss": 1.0313, + "step": 10220 + }, + { + "epoch": 0.7115531752104055, + "grad_norm": 4.258263451443694, + "learning_rate": 9.437737196886006e-06, + "loss": 1.0611, + "step": 10230 + }, + { + "epoch": 0.7122487306113932, + "grad_norm": 1.9817513599037029, + "learning_rate": 9.435871376190301e-06, + "loss": 0.9718, + "step": 10240 + }, + { + "epoch": 0.7129442860123809, + "grad_norm": 5.471931997294798, + "learning_rate": 9.434002649901928e-06, + "loss": 0.9659, + "step": 10250 + }, + { + "epoch": 0.7136398414133686, + "grad_norm": 6.331871272953105, + "learning_rate": 9.43213101924494e-06, + "loss": 1.0968, + "step": 10260 + }, + { + "epoch": 0.7143353968143563, + "grad_norm": 3.441113422534496, + "learning_rate": 9.430256485445297e-06, + "loss": 1.0368, + "step": 10270 + }, + { + "epoch": 0.715030952215344, + "grad_norm": 2.071802261851675, + "learning_rate": 9.428379049730861e-06, + "loss": 1.0086, + "step": 10280 + }, + { + "epoch": 0.7157265076163316, + "grad_norm": 1.6718669409327498, + "learning_rate": 9.426498713331392e-06, + "loss": 0.9737, + "step": 10290 + }, + { + "epoch": 0.7164220630173194, + "grad_norm": 4.163168609168789, + "learning_rate": 9.424615477478553e-06, + "loss": 0.9631, + "step": 10300 + }, + { + "epoch": 0.717117618418307, + "grad_norm": 1.6430435810402986, + "learning_rate": 9.422729343405903e-06, + "loss": 1.022, + "step": 10310 + }, + { + "epoch": 0.7178131738192947, + "grad_norm": 1.9258969175095817, + "learning_rate": 9.4208403123489e-06, + "loss": 0.9897, + "step": 10320 + }, + { + "epoch": 0.7185087292202824, + "grad_norm": 6.554702977931786, + "learning_rate": 9.418948385544905e-06, + "loss": 1.1051, + "step": 10330 + }, + { + "epoch": 0.7192042846212701, + "grad_norm": 2.127886824382019, + "learning_rate": 9.417053564233168e-06, + "loss": 1.0187, + "step": 10340 + }, + { + "epoch": 0.7198998400222578, + "grad_norm": 2.4161656060877394, + "learning_rate": 9.415155849654837e-06, + "loss": 0.9395, + "step": 10350 + }, + { + "epoch": 0.7205953954232455, + "grad_norm": 2.1314201366638312, + "learning_rate": 9.41325524305296e-06, + "loss": 1.0391, + "step": 10360 + }, + { + "epoch": 0.7212909508242331, + "grad_norm": 3.100906891219446, + "learning_rate": 9.411351745672474e-06, + "loss": 1.0439, + "step": 10370 + }, + { + "epoch": 0.7219865062252209, + "grad_norm": 2.6531389570357558, + "learning_rate": 9.409445358760212e-06, + "loss": 0.9977, + "step": 10380 + }, + { + "epoch": 0.7226820616262085, + "grad_norm": 3.47874554994421, + "learning_rate": 9.407536083564897e-06, + "loss": 0.9905, + "step": 10390 + }, + { + "epoch": 0.7233776170271962, + "grad_norm": 3.267638927303419, + "learning_rate": 9.40562392133715e-06, + "loss": 1.0108, + "step": 10400 + }, + { + "epoch": 0.7240731724281839, + "grad_norm": 1.8299987687762398, + "learning_rate": 9.403708873329476e-06, + "loss": 1.015, + "step": 10410 + }, + { + "epoch": 0.7247687278291716, + "grad_norm": 2.5213030315843, + "learning_rate": 9.401790940796274e-06, + "loss": 1.0423, + "step": 10420 + }, + { + "epoch": 0.7254642832301593, + "grad_norm": 2.4869565326425334, + "learning_rate": 9.39987012499383e-06, + "loss": 1.0476, + "step": 10430 + }, + { + "epoch": 0.726159838631147, + "grad_norm": 1.9665883633721284, + "learning_rate": 9.397946427180326e-06, + "loss": 1.0339, + "step": 10440 + }, + { + "epoch": 0.7268553940321346, + "grad_norm": 1.8636005905845505, + "learning_rate": 9.39601984861582e-06, + "loss": 1.0314, + "step": 10450 + }, + { + "epoch": 0.7275509494331224, + "grad_norm": 2.455944134501207, + "learning_rate": 9.394090390562265e-06, + "loss": 0.9426, + "step": 10460 + }, + { + "epoch": 0.72824650483411, + "grad_norm": 3.4447295254233885, + "learning_rate": 9.392158054283497e-06, + "loss": 0.968, + "step": 10470 + }, + { + "epoch": 0.7289420602350978, + "grad_norm": 1.9599251908703275, + "learning_rate": 9.390222841045243e-06, + "loss": 1.0359, + "step": 10480 + }, + { + "epoch": 0.7296376156360854, + "grad_norm": 3.525996130939633, + "learning_rate": 9.388284752115105e-06, + "loss": 1.0009, + "step": 10490 + }, + { + "epoch": 0.7303331710370731, + "grad_norm": 2.2401902838635026, + "learning_rate": 9.386343788762576e-06, + "loss": 1.0362, + "step": 10500 + }, + { + "epoch": 0.7303331710370731, + "eval_loss": 1.0235521793365479, + "eval_runtime": 1324.4591, + "eval_samples_per_second": 13.712, + "eval_steps_per_second": 2.285, + "step": 10500 + }, + { + "epoch": 0.7310287264380608, + "grad_norm": 1.93693471755263, + "learning_rate": 9.384399952259029e-06, + "loss": 0.988, + "step": 10510 + }, + { + "epoch": 0.7317242818390485, + "grad_norm": 2.1974574836345195, + "learning_rate": 9.382453243877718e-06, + "loss": 0.9871, + "step": 10520 + }, + { + "epoch": 0.7324198372400361, + "grad_norm": 2.370707812038226, + "learning_rate": 9.380503664893783e-06, + "loss": 0.9658, + "step": 10530 + }, + { + "epoch": 0.7331153926410239, + "grad_norm": 2.581598744009052, + "learning_rate": 9.378551216584237e-06, + "loss": 0.989, + "step": 10540 + }, + { + "epoch": 0.7338109480420115, + "grad_norm": 2.852298850763678, + "learning_rate": 9.376595900227979e-06, + "loss": 1.0615, + "step": 10550 + }, + { + "epoch": 0.7345065034429993, + "grad_norm": 11.253185598231772, + "learning_rate": 9.37463771710578e-06, + "loss": 0.9438, + "step": 10560 + }, + { + "epoch": 0.7352020588439869, + "grad_norm": 4.024243912916086, + "learning_rate": 9.372676668500298e-06, + "loss": 1.0687, + "step": 10570 + }, + { + "epoch": 0.7358976142449746, + "grad_norm": 3.809632144570522, + "learning_rate": 9.370712755696061e-06, + "loss": 1.0354, + "step": 10580 + }, + { + "epoch": 0.7365931696459623, + "grad_norm": 1.7631103237715913, + "learning_rate": 9.368745979979471e-06, + "loss": 0.9628, + "step": 10590 + }, + { + "epoch": 0.73728872504695, + "grad_norm": 2.8139083790191064, + "learning_rate": 9.366776342638814e-06, + "loss": 1.0054, + "step": 10600 + }, + { + "epoch": 0.7379842804479376, + "grad_norm": 2.458995497763383, + "learning_rate": 9.364803844964246e-06, + "loss": 0.9521, + "step": 10610 + }, + { + "epoch": 0.7386798358489254, + "grad_norm": 1.636663246318918, + "learning_rate": 9.36282848824779e-06, + "loss": 0.9814, + "step": 10620 + }, + { + "epoch": 0.739375391249913, + "grad_norm": 1.9012664612522063, + "learning_rate": 9.360850273783353e-06, + "loss": 1.0494, + "step": 10630 + }, + { + "epoch": 0.7400709466509008, + "grad_norm": 2.555642206325541, + "learning_rate": 9.358869202866708e-06, + "loss": 1.0923, + "step": 10640 + }, + { + "epoch": 0.7407665020518884, + "grad_norm": 1.9236001013284194, + "learning_rate": 9.356885276795496e-06, + "loss": 1.1077, + "step": 10650 + }, + { + "epoch": 0.7414620574528761, + "grad_norm": 9.574974066821895, + "learning_rate": 9.354898496869238e-06, + "loss": 0.956, + "step": 10660 + }, + { + "epoch": 0.7421576128538638, + "grad_norm": 3.3261178358779833, + "learning_rate": 9.352908864389313e-06, + "loss": 1.0734, + "step": 10670 + }, + { + "epoch": 0.7428531682548515, + "grad_norm": 2.2708187312798573, + "learning_rate": 9.350916380658976e-06, + "loss": 1.0008, + "step": 10680 + }, + { + "epoch": 0.7435487236558392, + "grad_norm": 2.795697173059623, + "learning_rate": 9.348921046983348e-06, + "loss": 1.0487, + "step": 10690 + }, + { + "epoch": 0.7442442790568269, + "grad_norm": 2.079854069306723, + "learning_rate": 9.346922864669414e-06, + "loss": 0.9643, + "step": 10700 + }, + { + "epoch": 0.7449398344578145, + "grad_norm": 2.4824929973955783, + "learning_rate": 9.34492183502603e-06, + "loss": 1.0178, + "step": 10710 + }, + { + "epoch": 0.7456353898588023, + "grad_norm": 3.0843094192577523, + "learning_rate": 9.342917959363914e-06, + "loss": 1.0264, + "step": 10720 + }, + { + "epoch": 0.7463309452597899, + "grad_norm": 2.79234903200258, + "learning_rate": 9.340911238995644e-06, + "loss": 1.0297, + "step": 10730 + }, + { + "epoch": 0.7470265006607776, + "grad_norm": 1.881368400222725, + "learning_rate": 9.338901675235669e-06, + "loss": 1.0998, + "step": 10740 + }, + { + "epoch": 0.7477220560617653, + "grad_norm": 1.8584927425228714, + "learning_rate": 9.336889269400298e-06, + "loss": 1.0232, + "step": 10750 + }, + { + "epoch": 0.748417611462753, + "grad_norm": 4.963836853454613, + "learning_rate": 9.334874022807699e-06, + "loss": 1.0334, + "step": 10760 + }, + { + "epoch": 0.7491131668637407, + "grad_norm": 2.4182816749408804, + "learning_rate": 9.332855936777903e-06, + "loss": 1.0131, + "step": 10770 + }, + { + "epoch": 0.7498087222647284, + "grad_norm": 2.091123896256716, + "learning_rate": 9.330835012632801e-06, + "loss": 0.9642, + "step": 10780 + }, + { + "epoch": 0.750504277665716, + "grad_norm": 1.7914139250331695, + "learning_rate": 9.328811251696141e-06, + "loss": 1.0247, + "step": 10790 + }, + { + "epoch": 0.7511998330667038, + "grad_norm": 2.208305446848274, + "learning_rate": 9.326784655293533e-06, + "loss": 1.0063, + "step": 10800 + }, + { + "epoch": 0.7518953884676914, + "grad_norm": 1.8045449214518667, + "learning_rate": 9.32475522475244e-06, + "loss": 1.0085, + "step": 10810 + }, + { + "epoch": 0.7525909438686791, + "grad_norm": 2.519151103656969, + "learning_rate": 9.322722961402183e-06, + "loss": 1.0333, + "step": 10820 + }, + { + "epoch": 0.7532864992696668, + "grad_norm": 2.190774123981268, + "learning_rate": 9.320687866573941e-06, + "loss": 1.057, + "step": 10830 + }, + { + "epoch": 0.7539820546706545, + "grad_norm": 2.7061728696520233, + "learning_rate": 9.318649941600744e-06, + "loss": 1.0548, + "step": 10840 + }, + { + "epoch": 0.7546776100716422, + "grad_norm": 1.597847507576464, + "learning_rate": 9.316609187817479e-06, + "loss": 1.0209, + "step": 10850 + }, + { + "epoch": 0.7553731654726299, + "grad_norm": 2.6289992294178592, + "learning_rate": 9.31456560656088e-06, + "loss": 1.0272, + "step": 10860 + }, + { + "epoch": 0.7560687208736175, + "grad_norm": 6.424214537310558, + "learning_rate": 9.312519199169543e-06, + "loss": 0.9676, + "step": 10870 + }, + { + "epoch": 0.7567642762746053, + "grad_norm": 2.3169108722769756, + "learning_rate": 9.310469966983906e-06, + "loss": 0.9865, + "step": 10880 + }, + { + "epoch": 0.7574598316755929, + "grad_norm": 2.2735170948005083, + "learning_rate": 9.308417911346262e-06, + "loss": 1.0189, + "step": 10890 + }, + { + "epoch": 0.7581553870765807, + "grad_norm": 19.24638592686535, + "learning_rate": 9.306363033600753e-06, + "loss": 1.0243, + "step": 10900 + }, + { + "epoch": 0.7588509424775683, + "grad_norm": 2.42211729251852, + "learning_rate": 9.304305335093366e-06, + "loss": 0.9772, + "step": 10910 + }, + { + "epoch": 0.759546497878556, + "grad_norm": 5.265651680206077, + "learning_rate": 9.302244817171943e-06, + "loss": 1.0252, + "step": 10920 + }, + { + "epoch": 0.7602420532795438, + "grad_norm": 5.113158049146955, + "learning_rate": 9.300181481186164e-06, + "loss": 1.0156, + "step": 10930 + }, + { + "epoch": 0.7609376086805314, + "grad_norm": 2.253337540984761, + "learning_rate": 9.298115328487562e-06, + "loss": 1.0803, + "step": 10940 + }, + { + "epoch": 0.761633164081519, + "grad_norm": 5.108589756364146, + "learning_rate": 9.29604636042951e-06, + "loss": 1.0314, + "step": 10950 + }, + { + "epoch": 0.7623287194825068, + "grad_norm": 2.0236266404341605, + "learning_rate": 9.293974578367229e-06, + "loss": 1.006, + "step": 10960 + }, + { + "epoch": 0.7630242748834944, + "grad_norm": 2.1588283897287575, + "learning_rate": 9.29189998365778e-06, + "loss": 1.0048, + "step": 10970 + }, + { + "epoch": 0.7637198302844822, + "grad_norm": 4.250106597016893, + "learning_rate": 9.28982257766007e-06, + "loss": 1.0234, + "step": 10980 + }, + { + "epoch": 0.7644153856854699, + "grad_norm": 2.480869533059029, + "learning_rate": 9.287742361734843e-06, + "loss": 1.0071, + "step": 10990 + }, + { + "epoch": 0.7651109410864575, + "grad_norm": 4.121142924224638, + "learning_rate": 9.285659337244688e-06, + "loss": 0.9852, + "step": 11000 + }, + { + "epoch": 0.7651109410864575, + "eval_loss": 1.0177974700927734, + "eval_runtime": 1321.7961, + "eval_samples_per_second": 13.74, + "eval_steps_per_second": 2.29, + "step": 11000 + }, + { + "epoch": 0.7658064964874453, + "grad_norm": 1.899941070870275, + "learning_rate": 9.283573505554028e-06, + "loss": 0.9669, + "step": 11010 + }, + { + "epoch": 0.7665020518884329, + "grad_norm": 2.3424348759391274, + "learning_rate": 9.281484868029134e-06, + "loss": 0.9554, + "step": 11020 + }, + { + "epoch": 0.7671976072894207, + "grad_norm": 3.0772276204428874, + "learning_rate": 9.279393426038103e-06, + "loss": 1.0499, + "step": 11030 + }, + { + "epoch": 0.7678931626904083, + "grad_norm": 2.2852279642353563, + "learning_rate": 9.27729918095088e-06, + "loss": 1.0643, + "step": 11040 + }, + { + "epoch": 0.768588718091396, + "grad_norm": 1.7768931768736989, + "learning_rate": 9.275202134139239e-06, + "loss": 1.0435, + "step": 11050 + }, + { + "epoch": 0.7692842734923837, + "grad_norm": 13.937080749170958, + "learning_rate": 9.273102286976792e-06, + "loss": 0.968, + "step": 11060 + }, + { + "epoch": 0.7699798288933714, + "grad_norm": 2.781881490067669, + "learning_rate": 9.270999640838984e-06, + "loss": 0.9872, + "step": 11070 + }, + { + "epoch": 0.770675384294359, + "grad_norm": 3.3642312193053465, + "learning_rate": 9.268894197103095e-06, + "loss": 1.0114, + "step": 11080 + }, + { + "epoch": 0.7713709396953468, + "grad_norm": 10.209879402833154, + "learning_rate": 9.266785957148238e-06, + "loss": 1.0281, + "step": 11090 + }, + { + "epoch": 0.7720664950963344, + "grad_norm": 2.0601753248635495, + "learning_rate": 9.264674922355354e-06, + "loss": 0.9939, + "step": 11100 + }, + { + "epoch": 0.7727620504973222, + "grad_norm": 1.9462558775920222, + "learning_rate": 9.262561094107217e-06, + "loss": 1.0176, + "step": 11110 + }, + { + "epoch": 0.7734576058983098, + "grad_norm": 1.8118156894874784, + "learning_rate": 9.260444473788432e-06, + "loss": 0.9786, + "step": 11120 + }, + { + "epoch": 0.7741531612992975, + "grad_norm": 2.642118604692425, + "learning_rate": 9.258325062785432e-06, + "loss": 1.05, + "step": 11130 + }, + { + "epoch": 0.7748487167002852, + "grad_norm": 2.377256541607759, + "learning_rate": 9.256202862486474e-06, + "loss": 0.9999, + "step": 11140 + }, + { + "epoch": 0.7755442721012729, + "grad_norm": 1.722807670731929, + "learning_rate": 9.254077874281649e-06, + "loss": 1.0098, + "step": 11150 + }, + { + "epoch": 0.7762398275022605, + "grad_norm": 2.326573722208682, + "learning_rate": 9.25195009956287e-06, + "loss": 0.9796, + "step": 11160 + }, + { + "epoch": 0.7769353829032483, + "grad_norm": 1.7150373406740804, + "learning_rate": 9.249819539723876e-06, + "loss": 1.0173, + "step": 11170 + }, + { + "epoch": 0.7776309383042359, + "grad_norm": 5.503193650927291, + "learning_rate": 9.24768619616023e-06, + "loss": 1.0295, + "step": 11180 + }, + { + "epoch": 0.7783264937052237, + "grad_norm": 3.4907220133072294, + "learning_rate": 9.245550070269318e-06, + "loss": 1.0604, + "step": 11190 + }, + { + "epoch": 0.7790220491062113, + "grad_norm": 2.235155759937919, + "learning_rate": 9.243411163450349e-06, + "loss": 0.9888, + "step": 11200 + }, + { + "epoch": 0.779717604507199, + "grad_norm": 3.2834861301253278, + "learning_rate": 9.241269477104356e-06, + "loss": 0.9803, + "step": 11210 + }, + { + "epoch": 0.7804131599081867, + "grad_norm": 2.901035212818355, + "learning_rate": 9.239125012634187e-06, + "loss": 1.0575, + "step": 11220 + }, + { + "epoch": 0.7811087153091744, + "grad_norm": 3.885007357952634, + "learning_rate": 9.236977771444515e-06, + "loss": 0.9921, + "step": 11230 + }, + { + "epoch": 0.7818042707101621, + "grad_norm": 1.8287330384948512, + "learning_rate": 9.23482775494183e-06, + "loss": 1.0082, + "step": 11240 + }, + { + "epoch": 0.7824998261111498, + "grad_norm": 2.575016629058713, + "learning_rate": 9.23267496453444e-06, + "loss": 1.0348, + "step": 11250 + }, + { + "epoch": 0.7831953815121374, + "grad_norm": 3.570514424555113, + "learning_rate": 9.230519401632467e-06, + "loss": 1.0136, + "step": 11260 + }, + { + "epoch": 0.7838909369131252, + "grad_norm": 2.45706079132827, + "learning_rate": 9.228361067647857e-06, + "loss": 0.9929, + "step": 11270 + }, + { + "epoch": 0.7845864923141128, + "grad_norm": 2.1615482833961366, + "learning_rate": 9.226199963994362e-06, + "loss": 1.0303, + "step": 11280 + }, + { + "epoch": 0.7852820477151005, + "grad_norm": 1.9574562862090623, + "learning_rate": 9.224036092087552e-06, + "loss": 1.0044, + "step": 11290 + }, + { + "epoch": 0.7859776031160882, + "grad_norm": 2.0891097854404, + "learning_rate": 9.22186945334481e-06, + "loss": 1.0042, + "step": 11300 + }, + { + "epoch": 0.7866731585170759, + "grad_norm": 3.697050319033273, + "learning_rate": 9.219700049185337e-06, + "loss": 1.0174, + "step": 11310 + }, + { + "epoch": 0.7873687139180636, + "grad_norm": 2.337165363380176, + "learning_rate": 9.217527881030134e-06, + "loss": 0.9187, + "step": 11320 + }, + { + "epoch": 0.7880642693190513, + "grad_norm": 1.936370737924563, + "learning_rate": 9.215352950302022e-06, + "loss": 1.0298, + "step": 11330 + }, + { + "epoch": 0.7887598247200389, + "grad_norm": 3.23716164010239, + "learning_rate": 9.213175258425626e-06, + "loss": 1.0359, + "step": 11340 + }, + { + "epoch": 0.7894553801210267, + "grad_norm": 2.264274205737049, + "learning_rate": 9.210994806827384e-06, + "loss": 1.0415, + "step": 11350 + }, + { + "epoch": 0.7901509355220143, + "grad_norm": 1.8930079064741607, + "learning_rate": 9.208811596935537e-06, + "loss": 1.0203, + "step": 11360 + }, + { + "epoch": 0.790846490923002, + "grad_norm": 4.74238876493143, + "learning_rate": 9.206625630180137e-06, + "loss": 1.063, + "step": 11370 + }, + { + "epoch": 0.7915420463239897, + "grad_norm": 2.534549632232441, + "learning_rate": 9.204436907993039e-06, + "loss": 0.9773, + "step": 11380 + }, + { + "epoch": 0.7922376017249774, + "grad_norm": 1.991164366354699, + "learning_rate": 9.202245431807904e-06, + "loss": 0.948, + "step": 11390 + }, + { + "epoch": 0.7929331571259651, + "grad_norm": 2.7777668802105917, + "learning_rate": 9.200051203060196e-06, + "loss": 0.9671, + "step": 11400 + }, + { + "epoch": 0.7936287125269528, + "grad_norm": 3.0627997801368454, + "learning_rate": 9.197854223187186e-06, + "loss": 0.9808, + "step": 11410 + }, + { + "epoch": 0.7943242679279404, + "grad_norm": 2.19612211092547, + "learning_rate": 9.195654493627942e-06, + "loss": 1.0203, + "step": 11420 + }, + { + "epoch": 0.7950198233289282, + "grad_norm": 2.146868728350374, + "learning_rate": 9.193452015823332e-06, + "loss": 1.0292, + "step": 11430 + }, + { + "epoch": 0.7957153787299158, + "grad_norm": 1.8031484451888324, + "learning_rate": 9.191246791216031e-06, + "loss": 1.0351, + "step": 11440 + }, + { + "epoch": 0.7964109341309036, + "grad_norm": 1.6704404463570304, + "learning_rate": 9.189038821250506e-06, + "loss": 0.9753, + "step": 11450 + }, + { + "epoch": 0.7971064895318912, + "grad_norm": 4.295094631924187, + "learning_rate": 9.186828107373029e-06, + "loss": 1.0492, + "step": 11460 + }, + { + "epoch": 0.7978020449328789, + "grad_norm": 2.068821317266629, + "learning_rate": 9.184614651031665e-06, + "loss": 1.0499, + "step": 11470 + }, + { + "epoch": 0.7984976003338666, + "grad_norm": 1.9451407158575293, + "learning_rate": 9.182398453676276e-06, + "loss": 1.0797, + "step": 11480 + }, + { + "epoch": 0.7991931557348543, + "grad_norm": 1.7950244907995547, + "learning_rate": 9.180179516758518e-06, + "loss": 1.0422, + "step": 11490 + }, + { + "epoch": 0.7998887111358419, + "grad_norm": 2.8979140368016116, + "learning_rate": 9.177957841731844e-06, + "loss": 0.9551, + "step": 11500 + }, + { + "epoch": 0.7998887111358419, + "eval_loss": 1.0153679847717285, + "eval_runtime": 1324.0971, + "eval_samples_per_second": 13.716, + "eval_steps_per_second": 2.286, + "step": 11500 + }, + { + "epoch": 0.8005842665368297, + "grad_norm": 2.9834222175567953, + "learning_rate": 9.175733430051502e-06, + "loss": 1.0076, + "step": 11510 + }, + { + "epoch": 0.8012798219378173, + "grad_norm": 1.9040201363521523, + "learning_rate": 9.173506283174526e-06, + "loss": 0.9809, + "step": 11520 + }, + { + "epoch": 0.8019753773388051, + "grad_norm": 2.149194818655018, + "learning_rate": 9.17127640255975e-06, + "loss": 1.012, + "step": 11530 + }, + { + "epoch": 0.8026709327397927, + "grad_norm": 5.223495870287097, + "learning_rate": 9.169043789667792e-06, + "loss": 1.0334, + "step": 11540 + }, + { + "epoch": 0.8033664881407804, + "grad_norm": 2.2881377894594075, + "learning_rate": 9.166808445961065e-06, + "loss": 0.9506, + "step": 11550 + }, + { + "epoch": 0.8040620435417681, + "grad_norm": 1.908591671023987, + "learning_rate": 9.164570372903763e-06, + "loss": 0.9862, + "step": 11560 + }, + { + "epoch": 0.8047575989427558, + "grad_norm": 3.363707242201898, + "learning_rate": 9.162329571961877e-06, + "loss": 1.0981, + "step": 11570 + }, + { + "epoch": 0.8054531543437434, + "grad_norm": 2.732504775266076, + "learning_rate": 9.16008604460318e-06, + "loss": 1.001, + "step": 11580 + }, + { + "epoch": 0.8061487097447312, + "grad_norm": 2.0959059140840415, + "learning_rate": 9.15783979229723e-06, + "loss": 1.047, + "step": 11590 + }, + { + "epoch": 0.8068442651457188, + "grad_norm": 6.482942351351786, + "learning_rate": 9.155590816515372e-06, + "loss": 0.9945, + "step": 11600 + }, + { + "epoch": 0.8075398205467066, + "grad_norm": 2.0701187542587367, + "learning_rate": 9.153339118730735e-06, + "loss": 1.0559, + "step": 11610 + }, + { + "epoch": 0.8082353759476942, + "grad_norm": 2.8550470383822875, + "learning_rate": 9.15108470041823e-06, + "loss": 1.0379, + "step": 11620 + }, + { + "epoch": 0.8089309313486819, + "grad_norm": 2.621318122641278, + "learning_rate": 9.148827563054547e-06, + "loss": 1.0489, + "step": 11630 + }, + { + "epoch": 0.8096264867496696, + "grad_norm": 6.795014253402755, + "learning_rate": 9.146567708118166e-06, + "loss": 0.9635, + "step": 11640 + }, + { + "epoch": 0.8103220421506573, + "grad_norm": 3.416228313668176, + "learning_rate": 9.144305137089338e-06, + "loss": 1.0315, + "step": 11650 + }, + { + "epoch": 0.811017597551645, + "grad_norm": 1.764473138083247, + "learning_rate": 9.142039851450097e-06, + "loss": 0.9679, + "step": 11660 + }, + { + "epoch": 0.8117131529526327, + "grad_norm": 2.733768148192296, + "learning_rate": 9.139771852684254e-06, + "loss": 1.0398, + "step": 11670 + }, + { + "epoch": 0.8124087083536203, + "grad_norm": 2.1598984282474802, + "learning_rate": 9.137501142277398e-06, + "loss": 1.0474, + "step": 11680 + }, + { + "epoch": 0.8131042637546081, + "grad_norm": 1.9688400555676275, + "learning_rate": 9.135227721716895e-06, + "loss": 1.0379, + "step": 11690 + }, + { + "epoch": 0.8137998191555957, + "grad_norm": 1.9582477105890257, + "learning_rate": 9.132951592491886e-06, + "loss": 1.0615, + "step": 11700 + }, + { + "epoch": 0.8144953745565834, + "grad_norm": 2.278120081182661, + "learning_rate": 9.13067275609328e-06, + "loss": 1.0177, + "step": 11710 + }, + { + "epoch": 0.8151909299575711, + "grad_norm": 2.328494150646161, + "learning_rate": 9.12839121401377e-06, + "loss": 1.0016, + "step": 11720 + }, + { + "epoch": 0.8158864853585588, + "grad_norm": 3.0702794428606546, + "learning_rate": 9.126106967747814e-06, + "loss": 1.0273, + "step": 11730 + }, + { + "epoch": 0.8165820407595465, + "grad_norm": 6.687230109577623, + "learning_rate": 9.123820018791645e-06, + "loss": 0.9868, + "step": 11740 + }, + { + "epoch": 0.8172775961605342, + "grad_norm": 2.3032729282942346, + "learning_rate": 9.121530368643263e-06, + "loss": 1.0649, + "step": 11750 + }, + { + "epoch": 0.8179731515615218, + "grad_norm": 3.1056577703224955, + "learning_rate": 9.119238018802437e-06, + "loss": 0.9682, + "step": 11760 + }, + { + "epoch": 0.8186687069625096, + "grad_norm": 1.8725957499329278, + "learning_rate": 9.116942970770709e-06, + "loss": 0.9797, + "step": 11770 + }, + { + "epoch": 0.8193642623634972, + "grad_norm": 2.336334433959823, + "learning_rate": 9.114645226051385e-06, + "loss": 1.0145, + "step": 11780 + }, + { + "epoch": 0.8200598177644849, + "grad_norm": 1.8806558183543345, + "learning_rate": 9.112344786149536e-06, + "loss": 1.004, + "step": 11790 + }, + { + "epoch": 0.8207553731654726, + "grad_norm": 3.1371659792292306, + "learning_rate": 9.110041652572006e-06, + "loss": 1.015, + "step": 11800 + }, + { + "epoch": 0.8214509285664603, + "grad_norm": 2.64960281013485, + "learning_rate": 9.107735826827391e-06, + "loss": 0.9943, + "step": 11810 + }, + { + "epoch": 0.822146483967448, + "grad_norm": 2.268427236218466, + "learning_rate": 9.10542731042606e-06, + "loss": 1.0241, + "step": 11820 + }, + { + "epoch": 0.8228420393684357, + "grad_norm": 2.1381701602518675, + "learning_rate": 9.103116104880143e-06, + "loss": 0.9833, + "step": 11830 + }, + { + "epoch": 0.8235375947694233, + "grad_norm": 2.0880483333847333, + "learning_rate": 9.100802211703528e-06, + "loss": 0.9717, + "step": 11840 + }, + { + "epoch": 0.8242331501704111, + "grad_norm": 1.437036129467993, + "learning_rate": 9.098485632411868e-06, + "loss": 0.998, + "step": 11850 + }, + { + "epoch": 0.8249287055713987, + "grad_norm": 9.220622621551007, + "learning_rate": 9.096166368522571e-06, + "loss": 1.0151, + "step": 11860 + }, + { + "epoch": 0.8256242609723865, + "grad_norm": 2.13896488329584, + "learning_rate": 9.093844421554804e-06, + "loss": 0.9906, + "step": 11870 + }, + { + "epoch": 0.8263198163733741, + "grad_norm": 3.712827995139085, + "learning_rate": 9.091519793029499e-06, + "loss": 1.0055, + "step": 11880 + }, + { + "epoch": 0.8270153717743618, + "grad_norm": 16.473929937748913, + "learning_rate": 9.089192484469333e-06, + "loss": 0.9404, + "step": 11890 + }, + { + "epoch": 0.8277109271753496, + "grad_norm": 4.410062749305998, + "learning_rate": 9.086862497398745e-06, + "loss": 1.0285, + "step": 11900 + }, + { + "epoch": 0.8284064825763372, + "grad_norm": 1.536212212258463, + "learning_rate": 9.08452983334393e-06, + "loss": 1.0116, + "step": 11910 + }, + { + "epoch": 0.8291020379773248, + "grad_norm": 2.5832325301297328, + "learning_rate": 9.082194493832829e-06, + "loss": 1.0485, + "step": 11920 + }, + { + "epoch": 0.8297975933783126, + "grad_norm": 1.9598748693625885, + "learning_rate": 9.079856480395143e-06, + "loss": 0.9787, + "step": 11930 + }, + { + "epoch": 0.8304931487793002, + "grad_norm": 2.0955930042910116, + "learning_rate": 9.077515794562326e-06, + "loss": 0.9729, + "step": 11940 + }, + { + "epoch": 0.831188704180288, + "grad_norm": 2.1253675619392043, + "learning_rate": 9.075172437867572e-06, + "loss": 1.0046, + "step": 11950 + }, + { + "epoch": 0.8318842595812757, + "grad_norm": 1.924909914024533, + "learning_rate": 9.072826411845834e-06, + "loss": 1.0364, + "step": 11960 + }, + { + "epoch": 0.8325798149822633, + "grad_norm": 3.849404012333158, + "learning_rate": 9.07047771803381e-06, + "loss": 0.9487, + "step": 11970 + }, + { + "epoch": 0.8332753703832511, + "grad_norm": 4.029899887301721, + "learning_rate": 9.068126357969944e-06, + "loss": 0.9578, + "step": 11980 + }, + { + "epoch": 0.8339709257842387, + "grad_norm": 3.7847407049759005, + "learning_rate": 9.065772333194432e-06, + "loss": 1.0051, + "step": 11990 + }, + { + "epoch": 0.8346664811852263, + "grad_norm": 4.008013591784392, + "learning_rate": 9.063415645249207e-06, + "loss": 1.027, + "step": 12000 + }, + { + "epoch": 0.8346664811852263, + "eval_loss": 1.0145593881607056, + "eval_runtime": 1320.2266, + "eval_samples_per_second": 13.756, + "eval_steps_per_second": 2.293, + "step": 12000 + }, + { + "epoch": 0.8353620365862141, + "grad_norm": 4.474385294568397, + "learning_rate": 9.061056295677955e-06, + "loss": 1.0134, + "step": 12010 + }, + { + "epoch": 0.8360575919872018, + "grad_norm": 2.374772050267209, + "learning_rate": 9.0586942860261e-06, + "loss": 0.9728, + "step": 12020 + }, + { + "epoch": 0.8367531473881895, + "grad_norm": 3.422489192857499, + "learning_rate": 9.056329617840808e-06, + "loss": 0.9794, + "step": 12030 + }, + { + "epoch": 0.8374487027891772, + "grad_norm": 2.4165940671622557, + "learning_rate": 9.053962292670992e-06, + "loss": 1.0505, + "step": 12040 + }, + { + "epoch": 0.8381442581901648, + "grad_norm": 3.119092707202833, + "learning_rate": 9.051592312067302e-06, + "loss": 1.0295, + "step": 12050 + }, + { + "epoch": 0.8388398135911526, + "grad_norm": 2.423511951797814, + "learning_rate": 9.049219677582122e-06, + "loss": 1.0057, + "step": 12060 + }, + { + "epoch": 0.8395353689921402, + "grad_norm": 2.223963958557793, + "learning_rate": 9.046844390769582e-06, + "loss": 1.0063, + "step": 12070 + }, + { + "epoch": 0.840230924393128, + "grad_norm": 2.381988047866609, + "learning_rate": 9.044466453185549e-06, + "loss": 0.9986, + "step": 12080 + }, + { + "epoch": 0.8409264797941156, + "grad_norm": 2.7044069014003913, + "learning_rate": 9.042085866387621e-06, + "loss": 0.9429, + "step": 12090 + }, + { + "epoch": 0.8416220351951033, + "grad_norm": 2.805150609116645, + "learning_rate": 9.039702631935137e-06, + "loss": 0.9377, + "step": 12100 + }, + { + "epoch": 0.842317590596091, + "grad_norm": 1.7967899637980058, + "learning_rate": 9.037316751389164e-06, + "loss": 1.0641, + "step": 12110 + }, + { + "epoch": 0.8430131459970787, + "grad_norm": 2.6900617133058757, + "learning_rate": 9.034928226312511e-06, + "loss": 1.0298, + "step": 12120 + }, + { + "epoch": 0.8437087013980663, + "grad_norm": 5.5608863200385015, + "learning_rate": 9.03253705826971e-06, + "loss": 1.0117, + "step": 12130 + }, + { + "epoch": 0.8444042567990541, + "grad_norm": 1.6482231694915508, + "learning_rate": 9.03014324882703e-06, + "loss": 1.0215, + "step": 12140 + }, + { + "epoch": 0.8450998122000417, + "grad_norm": 2.8682256135978923, + "learning_rate": 9.027746799552469e-06, + "loss": 1.0114, + "step": 12150 + }, + { + "epoch": 0.8457953676010295, + "grad_norm": 2.1224227533508526, + "learning_rate": 9.025347712015752e-06, + "loss": 1.0053, + "step": 12160 + }, + { + "epoch": 0.8464909230020171, + "grad_norm": 3.182006250714755, + "learning_rate": 9.022945987788332e-06, + "loss": 0.9827, + "step": 12170 + }, + { + "epoch": 0.8471864784030048, + "grad_norm": 2.6991689630239866, + "learning_rate": 9.020541628443395e-06, + "loss": 1.0367, + "step": 12180 + }, + { + "epoch": 0.8478820338039925, + "grad_norm": 4.0609138743622575, + "learning_rate": 9.018134635555848e-06, + "loss": 0.9109, + "step": 12190 + }, + { + "epoch": 0.8485775892049802, + "grad_norm": 4.015578622282884, + "learning_rate": 9.015725010702321e-06, + "loss": 1.0231, + "step": 12200 + }, + { + "epoch": 0.8492731446059678, + "grad_norm": 3.371318433264835, + "learning_rate": 9.013312755461176e-06, + "loss": 0.9677, + "step": 12210 + }, + { + "epoch": 0.8499687000069556, + "grad_norm": 2.40442602986492, + "learning_rate": 9.010897871412487e-06, + "loss": 1.0159, + "step": 12220 + }, + { + "epoch": 0.8506642554079432, + "grad_norm": 22.53254771437563, + "learning_rate": 9.00848036013806e-06, + "loss": 1.0493, + "step": 12230 + }, + { + "epoch": 0.851359810808931, + "grad_norm": 2.3799347523885066, + "learning_rate": 9.006060223221417e-06, + "loss": 0.9947, + "step": 12240 + }, + { + "epoch": 0.8520553662099186, + "grad_norm": 3.0850284302852797, + "learning_rate": 9.003637462247801e-06, + "loss": 1.0293, + "step": 12250 + }, + { + "epoch": 0.8527509216109063, + "grad_norm": 2.166551728369461, + "learning_rate": 9.001212078804172e-06, + "loss": 0.9471, + "step": 12260 + }, + { + "epoch": 0.853446477011894, + "grad_norm": 2.2090427961546206, + "learning_rate": 8.99878407447921e-06, + "loss": 0.9922, + "step": 12270 + }, + { + "epoch": 0.8541420324128817, + "grad_norm": 2.0334165805263855, + "learning_rate": 8.996353450863307e-06, + "loss": 1.0437, + "step": 12280 + }, + { + "epoch": 0.8548375878138694, + "grad_norm": 3.8319681172556357, + "learning_rate": 8.99392020954858e-06, + "loss": 1.0332, + "step": 12290 + }, + { + "epoch": 0.8555331432148571, + "grad_norm": 2.98039051501507, + "learning_rate": 8.991484352128853e-06, + "loss": 0.9903, + "step": 12300 + }, + { + "epoch": 0.8562286986158447, + "grad_norm": 2.146332711162586, + "learning_rate": 8.989045880199669e-06, + "loss": 0.9672, + "step": 12310 + }, + { + "epoch": 0.8569242540168325, + "grad_norm": 3.878742763338814, + "learning_rate": 8.986604795358275e-06, + "loss": 1.0439, + "step": 12320 + }, + { + "epoch": 0.8576198094178201, + "grad_norm": 2.90217600571786, + "learning_rate": 8.984161099203636e-06, + "loss": 1.0219, + "step": 12330 + }, + { + "epoch": 0.8583153648188078, + "grad_norm": 3.4167573935400464, + "learning_rate": 8.98171479333643e-06, + "loss": 1.0699, + "step": 12340 + }, + { + "epoch": 0.8590109202197955, + "grad_norm": 2.729094360469992, + "learning_rate": 8.979265879359038e-06, + "loss": 1.0096, + "step": 12350 + }, + { + "epoch": 0.8597064756207832, + "grad_norm": 3.7681128148806144, + "learning_rate": 8.976814358875553e-06, + "loss": 1.0427, + "step": 12360 + }, + { + "epoch": 0.8604020310217709, + "grad_norm": 8.977559621437095, + "learning_rate": 8.974360233491773e-06, + "loss": 1.0317, + "step": 12370 + }, + { + "epoch": 0.8610975864227586, + "grad_norm": 4.23185120850223, + "learning_rate": 8.971903504815205e-06, + "loss": 0.9955, + "step": 12380 + }, + { + "epoch": 0.8617931418237462, + "grad_norm": 2.2797477998025073, + "learning_rate": 8.969444174455061e-06, + "loss": 0.9996, + "step": 12390 + }, + { + "epoch": 0.862488697224734, + "grad_norm": 2.1907282676316333, + "learning_rate": 8.966982244022254e-06, + "loss": 1.0181, + "step": 12400 + }, + { + "epoch": 0.8631842526257216, + "grad_norm": 2.1998305822100224, + "learning_rate": 8.964517715129404e-06, + "loss": 0.9891, + "step": 12410 + }, + { + "epoch": 0.8638798080267093, + "grad_norm": 3.024987425635011, + "learning_rate": 8.962050589390829e-06, + "loss": 1.0054, + "step": 12420 + }, + { + "epoch": 0.864575363427697, + "grad_norm": 3.707279036709514, + "learning_rate": 8.959580868422554e-06, + "loss": 0.9535, + "step": 12430 + }, + { + "epoch": 0.8652709188286847, + "grad_norm": 3.70944277319506, + "learning_rate": 8.957108553842296e-06, + "loss": 1.0291, + "step": 12440 + }, + { + "epoch": 0.8659664742296724, + "grad_norm": 5.633938173385902, + "learning_rate": 8.954633647269479e-06, + "loss": 1.0063, + "step": 12450 + }, + { + "epoch": 0.8666620296306601, + "grad_norm": 2.2334332372404413, + "learning_rate": 8.952156150325217e-06, + "loss": 0.9729, + "step": 12460 + }, + { + "epoch": 0.8673575850316477, + "grad_norm": 4.840851335253614, + "learning_rate": 8.949676064632327e-06, + "loss": 0.9717, + "step": 12470 + }, + { + "epoch": 0.8680531404326355, + "grad_norm": 2.178662586541157, + "learning_rate": 8.947193391815319e-06, + "loss": 1.0016, + "step": 12480 + }, + { + "epoch": 0.8687486958336231, + "grad_norm": 2.2248919317929507, + "learning_rate": 8.944708133500398e-06, + "loss": 0.9822, + "step": 12490 + }, + { + "epoch": 0.8694442512346109, + "grad_norm": 2.186781345140813, + "learning_rate": 8.942220291315463e-06, + "loss": 1.0638, + "step": 12500 + }, + { + "epoch": 0.8694442512346109, + "eval_loss": 1.0081130266189575, + "eval_runtime": 1323.3283, + "eval_samples_per_second": 13.724, + "eval_steps_per_second": 2.287, + "step": 12500 + }, + { + "epoch": 0.8701398066355985, + "grad_norm": 6.824702768650966, + "learning_rate": 8.939729866890103e-06, + "loss": 1.0211, + "step": 12510 + }, + { + "epoch": 0.8708353620365862, + "grad_norm": 2.1393831095518263, + "learning_rate": 8.937236861855602e-06, + "loss": 1.0506, + "step": 12520 + }, + { + "epoch": 0.8715309174375739, + "grad_norm": 1.7726935977907552, + "learning_rate": 8.934741277844933e-06, + "loss": 1.0027, + "step": 12530 + }, + { + "epoch": 0.8722264728385616, + "grad_norm": 1.9548647229776859, + "learning_rate": 8.932243116492756e-06, + "loss": 1.0041, + "step": 12540 + }, + { + "epoch": 0.8729220282395492, + "grad_norm": 2.922961891607269, + "learning_rate": 8.929742379435424e-06, + "loss": 0.999, + "step": 12550 + }, + { + "epoch": 0.873617583640537, + "grad_norm": 1.9980837252530805, + "learning_rate": 8.927239068310973e-06, + "loss": 0.9767, + "step": 12560 + }, + { + "epoch": 0.8743131390415246, + "grad_norm": 2.9432704408008976, + "learning_rate": 8.924733184759127e-06, + "loss": 1.0048, + "step": 12570 + }, + { + "epoch": 0.8750086944425124, + "grad_norm": 1.8128878555871557, + "learning_rate": 8.922224730421294e-06, + "loss": 1.0086, + "step": 12580 + }, + { + "epoch": 0.8757042498435, + "grad_norm": 1.969614517416086, + "learning_rate": 8.919713706940566e-06, + "loss": 0.9961, + "step": 12590 + }, + { + "epoch": 0.8763998052444877, + "grad_norm": 2.1604437961560357, + "learning_rate": 8.917200115961719e-06, + "loss": 1.0025, + "step": 12600 + }, + { + "epoch": 0.8770953606454754, + "grad_norm": 2.8379004332325155, + "learning_rate": 8.91468395913121e-06, + "loss": 1.018, + "step": 12610 + }, + { + "epoch": 0.8777909160464631, + "grad_norm": 1.5538605303169957, + "learning_rate": 8.912165238097177e-06, + "loss": 1.0344, + "step": 12620 + }, + { + "epoch": 0.8784864714474507, + "grad_norm": 2.282711973938636, + "learning_rate": 8.909643954509435e-06, + "loss": 0.9737, + "step": 12630 + }, + { + "epoch": 0.8791820268484385, + "grad_norm": 4.028559212713911, + "learning_rate": 8.907120110019483e-06, + "loss": 1.0382, + "step": 12640 + }, + { + "epoch": 0.8798775822494261, + "grad_norm": 2.509509798700072, + "learning_rate": 8.904593706280493e-06, + "loss": 0.9431, + "step": 12650 + }, + { + "epoch": 0.8805731376504139, + "grad_norm": 2.503896209306404, + "learning_rate": 8.902064744947314e-06, + "loss": 0.9578, + "step": 12660 + }, + { + "epoch": 0.8812686930514015, + "grad_norm": 2.1984977269118624, + "learning_rate": 8.899533227676471e-06, + "loss": 0.9899, + "step": 12670 + }, + { + "epoch": 0.8819642484523892, + "grad_norm": 3.2523450571807375, + "learning_rate": 8.896999156126165e-06, + "loss": 0.972, + "step": 12680 + }, + { + "epoch": 0.882659803853377, + "grad_norm": 8.241808378924285, + "learning_rate": 8.894462531956266e-06, + "loss": 1.0291, + "step": 12690 + }, + { + "epoch": 0.8833553592543646, + "grad_norm": 3.885295290224538, + "learning_rate": 8.89192335682832e-06, + "loss": 1.0225, + "step": 12700 + }, + { + "epoch": 0.8840509146553523, + "grad_norm": 2.5804563941387753, + "learning_rate": 8.88938163240554e-06, + "loss": 0.9592, + "step": 12710 + }, + { + "epoch": 0.88474647005634, + "grad_norm": 2.9571812692730424, + "learning_rate": 8.886837360352814e-06, + "loss": 1.0365, + "step": 12720 + }, + { + "epoch": 0.8854420254573276, + "grad_norm": 2.035856128063919, + "learning_rate": 8.884290542336692e-06, + "loss": 0.9579, + "step": 12730 + }, + { + "epoch": 0.8861375808583154, + "grad_norm": 2.181855273320585, + "learning_rate": 8.881741180025398e-06, + "loss": 1.03, + "step": 12740 + }, + { + "epoch": 0.886833136259303, + "grad_norm": 2.814926977352609, + "learning_rate": 8.87918927508882e-06, + "loss": 1.0122, + "step": 12750 + }, + { + "epoch": 0.8875286916602907, + "grad_norm": 2.941899452263326, + "learning_rate": 8.876634829198511e-06, + "loss": 0.978, + "step": 12760 + }, + { + "epoch": 0.8882242470612784, + "grad_norm": 2.3268609679754304, + "learning_rate": 8.87407784402769e-06, + "loss": 1.0376, + "step": 12770 + }, + { + "epoch": 0.8889198024622661, + "grad_norm": 2.2606334318561783, + "learning_rate": 8.871518321251235e-06, + "loss": 0.9738, + "step": 12780 + }, + { + "epoch": 0.8896153578632539, + "grad_norm": 2.388568836385881, + "learning_rate": 8.868956262545694e-06, + "loss": 1.0347, + "step": 12790 + }, + { + "epoch": 0.8903109132642415, + "grad_norm": 1.9258552192266813, + "learning_rate": 8.866391669589268e-06, + "loss": 1.0018, + "step": 12800 + }, + { + "epoch": 0.8910064686652291, + "grad_norm": 3.397386944948726, + "learning_rate": 8.86382454406182e-06, + "loss": 0.9761, + "step": 12810 + }, + { + "epoch": 0.8917020240662169, + "grad_norm": 2.4306346924994764, + "learning_rate": 8.861254887644877e-06, + "loss": 1.0488, + "step": 12820 + }, + { + "epoch": 0.8923975794672045, + "grad_norm": 2.4489014098360866, + "learning_rate": 8.85868270202162e-06, + "loss": 1.0011, + "step": 12830 + }, + { + "epoch": 0.8930931348681923, + "grad_norm": 4.030900420326565, + "learning_rate": 8.856107988876884e-06, + "loss": 0.9388, + "step": 12840 + }, + { + "epoch": 0.89378869026918, + "grad_norm": 3.268182828975952, + "learning_rate": 8.853530749897163e-06, + "loss": 0.9725, + "step": 12850 + }, + { + "epoch": 0.8944842456701676, + "grad_norm": 2.139747650163956, + "learning_rate": 8.850950986770607e-06, + "loss": 0.9804, + "step": 12860 + }, + { + "epoch": 0.8951798010711554, + "grad_norm": 2.231508533093627, + "learning_rate": 8.848368701187015e-06, + "loss": 0.9931, + "step": 12870 + }, + { + "epoch": 0.895875356472143, + "grad_norm": 2.6508428495515433, + "learning_rate": 8.845783894837843e-06, + "loss": 1.0054, + "step": 12880 + }, + { + "epoch": 0.8965709118731306, + "grad_norm": 3.6191165799972853, + "learning_rate": 8.843196569416192e-06, + "loss": 1.035, + "step": 12890 + }, + { + "epoch": 0.8972664672741184, + "grad_norm": 2.672034168033511, + "learning_rate": 8.84060672661682e-06, + "loss": 1.0423, + "step": 12900 + }, + { + "epoch": 0.897962022675106, + "grad_norm": 2.9546721668007727, + "learning_rate": 8.83801436813613e-06, + "loss": 1.0169, + "step": 12910 + }, + { + "epoch": 0.8986575780760938, + "grad_norm": 2.405679911415528, + "learning_rate": 8.83541949567217e-06, + "loss": 1.0316, + "step": 12920 + }, + { + "epoch": 0.8993531334770815, + "grad_norm": 2.4470543644310925, + "learning_rate": 8.832822110924644e-06, + "loss": 0.9862, + "step": 12930 + }, + { + "epoch": 0.9000486888780691, + "grad_norm": 3.2577591311262712, + "learning_rate": 8.83022221559489e-06, + "loss": 0.9843, + "step": 12940 + }, + { + "epoch": 0.9007442442790569, + "grad_norm": 1.868589151700602, + "learning_rate": 8.827619811385901e-06, + "loss": 0.9248, + "step": 12950 + }, + { + "epoch": 0.9014397996800445, + "grad_norm": 3.3864970088126647, + "learning_rate": 8.825014900002306e-06, + "loss": 0.9727, + "step": 12960 + }, + { + "epoch": 0.9021353550810322, + "grad_norm": 2.1957281515148424, + "learning_rate": 8.82240748315038e-06, + "loss": 0.9649, + "step": 12970 + }, + { + "epoch": 0.9028309104820199, + "grad_norm": 2.6118791836576714, + "learning_rate": 8.819797562538035e-06, + "loss": 1.0082, + "step": 12980 + }, + { + "epoch": 0.9035264658830076, + "grad_norm": 3.0882766709501697, + "learning_rate": 8.817185139874828e-06, + "loss": 1.0034, + "step": 12990 + }, + { + "epoch": 0.9042220212839953, + "grad_norm": 2.551833287888204, + "learning_rate": 8.814570216871958e-06, + "loss": 1.0415, + "step": 13000 + }, + { + "epoch": 0.9042220212839953, + "eval_loss": 1.0044872760772705, + "eval_runtime": 1320.7579, + "eval_samples_per_second": 13.75, + "eval_steps_per_second": 2.292, + "step": 13000 + }, + { + "epoch": 0.904917576684983, + "grad_norm": 2.2429558153480564, + "learning_rate": 8.811952795242248e-06, + "loss": 1.0379, + "step": 13010 + }, + { + "epoch": 0.9056131320859706, + "grad_norm": 2.292915819660424, + "learning_rate": 8.809332876700173e-06, + "loss": 0.9858, + "step": 13020 + }, + { + "epoch": 0.9063086874869584, + "grad_norm": 2.5022716933087192, + "learning_rate": 8.806710462961831e-06, + "loss": 1.0293, + "step": 13030 + }, + { + "epoch": 0.907004242887946, + "grad_norm": 3.110792265066373, + "learning_rate": 8.804085555744966e-06, + "loss": 1.0236, + "step": 13040 + }, + { + "epoch": 0.9076997982889338, + "grad_norm": 1.603688021056249, + "learning_rate": 8.801458156768945e-06, + "loss": 0.9889, + "step": 13050 + }, + { + "epoch": 0.9083953536899214, + "grad_norm": 1.774418634143913, + "learning_rate": 8.798828267754775e-06, + "loss": 0.98, + "step": 13060 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 3.044333175960114, + "learning_rate": 8.796195890425092e-06, + "loss": 1.0017, + "step": 13070 + }, + { + "epoch": 0.9097864644918968, + "grad_norm": 3.6597331540537477, + "learning_rate": 8.793561026504156e-06, + "loss": 0.997, + "step": 13080 + }, + { + "epoch": 0.9104820198928845, + "grad_norm": 2.0874441382689897, + "learning_rate": 8.790923677717861e-06, + "loss": 0.9354, + "step": 13090 + }, + { + "epoch": 0.9111775752938721, + "grad_norm": 2.8541096392642995, + "learning_rate": 8.788283845793733e-06, + "loss": 0.9804, + "step": 13100 + }, + { + "epoch": 0.9118731306948599, + "grad_norm": 2.391836131756992, + "learning_rate": 8.785641532460916e-06, + "loss": 0.9983, + "step": 13110 + }, + { + "epoch": 0.9125686860958475, + "grad_norm": 1.883203968802448, + "learning_rate": 8.782996739450182e-06, + "loss": 0.999, + "step": 13120 + }, + { + "epoch": 0.9132642414968353, + "grad_norm": 1.9093163546974494, + "learning_rate": 8.78034946849393e-06, + "loss": 0.9853, + "step": 13130 + }, + { + "epoch": 0.9139597968978229, + "grad_norm": 2.2860200509331423, + "learning_rate": 8.777699721326181e-06, + "loss": 0.9638, + "step": 13140 + }, + { + "epoch": 0.9146553522988106, + "grad_norm": 2.3071620003312137, + "learning_rate": 8.775047499682576e-06, + "loss": 1.0183, + "step": 13150 + }, + { + "epoch": 0.9153509076997983, + "grad_norm": 2.6169935147807983, + "learning_rate": 8.772392805300377e-06, + "loss": 0.9694, + "step": 13160 + }, + { + "epoch": 0.916046463100786, + "grad_norm": 2.951374167509544, + "learning_rate": 8.769735639918468e-06, + "loss": 1.0462, + "step": 13170 + }, + { + "epoch": 0.9167420185017736, + "grad_norm": 1.95654006918851, + "learning_rate": 8.767076005277351e-06, + "loss": 1.0087, + "step": 13180 + }, + { + "epoch": 0.9174375739027614, + "grad_norm": 2.533976129134612, + "learning_rate": 8.764413903119147e-06, + "loss": 1.0636, + "step": 13190 + }, + { + "epoch": 0.918133129303749, + "grad_norm": 2.785258652454596, + "learning_rate": 8.761749335187583e-06, + "loss": 1.0301, + "step": 13200 + }, + { + "epoch": 0.9188286847047368, + "grad_norm": 5.210926277253133, + "learning_rate": 8.75908230322802e-06, + "loss": 1.0, + "step": 13210 + }, + { + "epoch": 0.9195242401057244, + "grad_norm": 2.2200475795448473, + "learning_rate": 8.756412808987413e-06, + "loss": 0.9463, + "step": 13220 + }, + { + "epoch": 0.9202197955067121, + "grad_norm": 2.3628592119649654, + "learning_rate": 8.753740854214345e-06, + "loss": 0.988, + "step": 13230 + }, + { + "epoch": 0.9209153509076998, + "grad_norm": 1.6351076256075918, + "learning_rate": 8.751066440659001e-06, + "loss": 0.9808, + "step": 13240 + }, + { + "epoch": 0.9216109063086875, + "grad_norm": 2.358107009944824, + "learning_rate": 8.748389570073183e-06, + "loss": 1.0426, + "step": 13250 + }, + { + "epoch": 0.9223064617096752, + "grad_norm": 3.953761850336169, + "learning_rate": 8.745710244210299e-06, + "loss": 0.9424, + "step": 13260 + }, + { + "epoch": 0.9230020171106629, + "grad_norm": 2.2451376228721167, + "learning_rate": 8.743028464825365e-06, + "loss": 0.9797, + "step": 13270 + }, + { + "epoch": 0.9236975725116505, + "grad_norm": 4.115946187230681, + "learning_rate": 8.740344233675006e-06, + "loss": 1.0055, + "step": 13280 + }, + { + "epoch": 0.9243931279126383, + "grad_norm": 2.2769735731629632, + "learning_rate": 8.737657552517452e-06, + "loss": 0.9766, + "step": 13290 + }, + { + "epoch": 0.9250886833136259, + "grad_norm": 2.2708841010208856, + "learning_rate": 8.734968423112538e-06, + "loss": 1.014, + "step": 13300 + }, + { + "epoch": 0.9257842387146136, + "grad_norm": 3.7653522209835852, + "learning_rate": 8.7322768472217e-06, + "loss": 0.969, + "step": 13310 + }, + { + "epoch": 0.9264797941156013, + "grad_norm": 3.5734585941969295, + "learning_rate": 8.729582826607984e-06, + "loss": 0.9977, + "step": 13320 + }, + { + "epoch": 0.927175349516589, + "grad_norm": 3.8648606664141805, + "learning_rate": 8.726886363036029e-06, + "loss": 0.9867, + "step": 13330 + }, + { + "epoch": 0.9278709049175767, + "grad_norm": 3.028179109232542, + "learning_rate": 8.724187458272075e-06, + "loss": 0.963, + "step": 13340 + }, + { + "epoch": 0.9285664603185644, + "grad_norm": 3.6259597712708773, + "learning_rate": 8.72148611408397e-06, + "loss": 0.9908, + "step": 13350 + }, + { + "epoch": 0.929262015719552, + "grad_norm": 2.3648610370167535, + "learning_rate": 8.71878233224115e-06, + "loss": 0.9571, + "step": 13360 + }, + { + "epoch": 0.9299575711205398, + "grad_norm": 2.979364271850091, + "learning_rate": 8.716076114514649e-06, + "loss": 1.0278, + "step": 13370 + }, + { + "epoch": 0.9306531265215274, + "grad_norm": 2.4291022075391595, + "learning_rate": 8.713367462677102e-06, + "loss": 0.9401, + "step": 13380 + }, + { + "epoch": 0.9313486819225151, + "grad_norm": 1.9330506157232135, + "learning_rate": 8.710656378502735e-06, + "loss": 0.9841, + "step": 13390 + }, + { + "epoch": 0.9320442373235028, + "grad_norm": 1.780205456458321, + "learning_rate": 8.707942863767367e-06, + "loss": 1.0441, + "step": 13400 + }, + { + "epoch": 0.9327397927244905, + "grad_norm": 4.773522639390308, + "learning_rate": 8.705226920248409e-06, + "loss": 1.0206, + "step": 13410 + }, + { + "epoch": 0.9334353481254782, + "grad_norm": 4.045354710923843, + "learning_rate": 8.702508549724863e-06, + "loss": 1.013, + "step": 13420 + }, + { + "epoch": 0.9341309035264659, + "grad_norm": 3.091454451901605, + "learning_rate": 8.699787753977319e-06, + "loss": 0.9838, + "step": 13430 + }, + { + "epoch": 0.9348264589274535, + "grad_norm": 1.9634414571361625, + "learning_rate": 8.697064534787963e-06, + "loss": 0.9594, + "step": 13440 + }, + { + "epoch": 0.9355220143284413, + "grad_norm": 1.6457753262202575, + "learning_rate": 8.69433889394056e-06, + "loss": 1.0199, + "step": 13450 + }, + { + "epoch": 0.9362175697294289, + "grad_norm": 2.8645234738838483, + "learning_rate": 8.691610833220463e-06, + "loss": 0.9875, + "step": 13460 + }, + { + "epoch": 0.9369131251304167, + "grad_norm": 2.0532715667032364, + "learning_rate": 8.688880354414612e-06, + "loss": 1.0352, + "step": 13470 + }, + { + "epoch": 0.9376086805314043, + "grad_norm": 3.1793052012003287, + "learning_rate": 8.686147459311534e-06, + "loss": 1.0302, + "step": 13480 + }, + { + "epoch": 0.938304235932392, + "grad_norm": 5.400769102149822, + "learning_rate": 8.68341214970133e-06, + "loss": 1.0308, + "step": 13490 + }, + { + "epoch": 0.9389997913333797, + "grad_norm": 2.69206574303917, + "learning_rate": 8.68067442737569e-06, + "loss": 1.0229, + "step": 13500 + }, + { + "epoch": 0.9389997913333797, + "eval_loss": 0.9994012117385864, + "eval_runtime": 1321.922, + "eval_samples_per_second": 13.738, + "eval_steps_per_second": 2.29, + "step": 13500 + }, + { + "epoch": 0.9396953467343674, + "grad_norm": 2.633229412819453, + "learning_rate": 8.677934294127883e-06, + "loss": 0.9972, + "step": 13510 + }, + { + "epoch": 0.940390902135355, + "grad_norm": 2.4271467405371294, + "learning_rate": 8.675191751752752e-06, + "loss": 0.9804, + "step": 13520 + }, + { + "epoch": 0.9410864575363428, + "grad_norm": 1.7563505975278282, + "learning_rate": 8.672446802046729e-06, + "loss": 1.0373, + "step": 13530 + }, + { + "epoch": 0.9417820129373304, + "grad_norm": 2.66822837012369, + "learning_rate": 8.66969944680781e-06, + "loss": 0.9303, + "step": 13540 + }, + { + "epoch": 0.9424775683383182, + "grad_norm": 2.639464079999039, + "learning_rate": 8.666949687835574e-06, + "loss": 1.0351, + "step": 13550 + }, + { + "epoch": 0.9431731237393058, + "grad_norm": 2.356381947580666, + "learning_rate": 8.664197526931173e-06, + "loss": 0.9987, + "step": 13560 + }, + { + "epoch": 0.9438686791402935, + "grad_norm": 3.310839057536699, + "learning_rate": 8.661442965897335e-06, + "loss": 0.9855, + "step": 13570 + }, + { + "epoch": 0.9445642345412812, + "grad_norm": 2.6772319993329208, + "learning_rate": 8.658686006538356e-06, + "loss": 1.0394, + "step": 13580 + }, + { + "epoch": 0.9452597899422689, + "grad_norm": 1.8931051449058238, + "learning_rate": 8.655926650660104e-06, + "loss": 0.9826, + "step": 13590 + }, + { + "epoch": 0.9459553453432565, + "grad_norm": 3.6251188033904382, + "learning_rate": 8.653164900070019e-06, + "loss": 0.9763, + "step": 13600 + }, + { + "epoch": 0.9466509007442443, + "grad_norm": 1.9679646599154572, + "learning_rate": 8.650400756577107e-06, + "loss": 0.9815, + "step": 13610 + }, + { + "epoch": 0.9473464561452319, + "grad_norm": 1.6869567065192415, + "learning_rate": 8.647634221991941e-06, + "loss": 0.9701, + "step": 13620 + }, + { + "epoch": 0.9480420115462197, + "grad_norm": 3.066180410119811, + "learning_rate": 8.644865298126663e-06, + "loss": 0.9445, + "step": 13630 + }, + { + "epoch": 0.9487375669472073, + "grad_norm": 2.116760623311233, + "learning_rate": 8.642093986794982e-06, + "loss": 1.04, + "step": 13640 + }, + { + "epoch": 0.949433122348195, + "grad_norm": 2.014241145367716, + "learning_rate": 8.63932028981216e-06, + "loss": 1.0095, + "step": 13650 + }, + { + "epoch": 0.9501286777491827, + "grad_norm": 4.467315586060441, + "learning_rate": 8.636544208995036e-06, + "loss": 0.955, + "step": 13660 + }, + { + "epoch": 0.9508242331501704, + "grad_norm": 2.5273106234019527, + "learning_rate": 8.633765746161999e-06, + "loss": 1.0681, + "step": 13670 + }, + { + "epoch": 0.9515197885511582, + "grad_norm": 3.4986072589522657, + "learning_rate": 8.630984903133005e-06, + "loss": 1.0633, + "step": 13680 + }, + { + "epoch": 0.9522153439521458, + "grad_norm": 3.091018997668463, + "learning_rate": 8.62820168172957e-06, + "loss": 0.9686, + "step": 13690 + }, + { + "epoch": 0.9529108993531334, + "grad_norm": 2.96372980668552, + "learning_rate": 8.625416083774758e-06, + "loss": 0.9582, + "step": 13700 + }, + { + "epoch": 0.9536064547541212, + "grad_norm": 3.6680611844737547, + "learning_rate": 8.6226281110932e-06, + "loss": 0.9949, + "step": 13710 + }, + { + "epoch": 0.9543020101551088, + "grad_norm": 6.394073982694562, + "learning_rate": 8.61983776551108e-06, + "loss": 1.0863, + "step": 13720 + }, + { + "epoch": 0.9549975655560965, + "grad_norm": 1.671104074835555, + "learning_rate": 8.617045048856134e-06, + "loss": 1.0106, + "step": 13730 + }, + { + "epoch": 0.9556931209570843, + "grad_norm": 8.174869785501627, + "learning_rate": 8.614249962957653e-06, + "loss": 1.0871, + "step": 13740 + }, + { + "epoch": 0.9563886763580719, + "grad_norm": 2.9439308634200136, + "learning_rate": 8.61145250964648e-06, + "loss": 0.9586, + "step": 13750 + }, + { + "epoch": 0.9570842317590597, + "grad_norm": 2.064644409230074, + "learning_rate": 8.608652690755006e-06, + "loss": 0.9775, + "step": 13760 + }, + { + "epoch": 0.9577797871600473, + "grad_norm": 3.1344510713168865, + "learning_rate": 8.605850508117176e-06, + "loss": 1.0043, + "step": 13770 + }, + { + "epoch": 0.958475342561035, + "grad_norm": 1.9383213755916588, + "learning_rate": 8.603045963568477e-06, + "loss": 1.0595, + "step": 13780 + }, + { + "epoch": 0.9591708979620227, + "grad_norm": 2.762141824010194, + "learning_rate": 8.600239058945952e-06, + "loss": 0.9605, + "step": 13790 + }, + { + "epoch": 0.9598664533630104, + "grad_norm": 3.3898073483121904, + "learning_rate": 8.597429796088182e-06, + "loss": 0.9396, + "step": 13800 + }, + { + "epoch": 0.960562008763998, + "grad_norm": 5.29428723765544, + "learning_rate": 8.594618176835294e-06, + "loss": 0.9522, + "step": 13810 + }, + { + "epoch": 0.9612575641649858, + "grad_norm": 2.8574677310491743, + "learning_rate": 8.591804203028963e-06, + "loss": 1.0106, + "step": 13820 + }, + { + "epoch": 0.9619531195659734, + "grad_norm": 3.6426559015863686, + "learning_rate": 8.588987876512402e-06, + "loss": 1.0158, + "step": 13830 + }, + { + "epoch": 0.9626486749669612, + "grad_norm": 2.808584390275072, + "learning_rate": 8.586169199130368e-06, + "loss": 1.0104, + "step": 13840 + }, + { + "epoch": 0.9633442303679488, + "grad_norm": 2.295604659707233, + "learning_rate": 8.583348172729153e-06, + "loss": 1.0031, + "step": 13850 + }, + { + "epoch": 0.9640397857689365, + "grad_norm": 1.9695581600343546, + "learning_rate": 8.580524799156596e-06, + "loss": 1.0247, + "step": 13860 + }, + { + "epoch": 0.9647353411699242, + "grad_norm": 2.4354990626640354, + "learning_rate": 8.577699080262062e-06, + "loss": 1.0329, + "step": 13870 + }, + { + "epoch": 0.9654308965709119, + "grad_norm": 1.5526237692710336, + "learning_rate": 8.574871017896463e-06, + "loss": 0.9748, + "step": 13880 + }, + { + "epoch": 0.9661264519718996, + "grad_norm": 1.9391890908857037, + "learning_rate": 8.572040613912241e-06, + "loss": 0.9766, + "step": 13890 + }, + { + "epoch": 0.9668220073728873, + "grad_norm": 2.9567705383090206, + "learning_rate": 8.569207870163372e-06, + "loss": 1.0219, + "step": 13900 + }, + { + "epoch": 0.9675175627738749, + "grad_norm": 2.9357302059324653, + "learning_rate": 8.566372788505364e-06, + "loss": 1.0433, + "step": 13910 + }, + { + "epoch": 0.9682131181748627, + "grad_norm": 1.961616171885582, + "learning_rate": 8.56353537079526e-06, + "loss": 1.0236, + "step": 13920 + }, + { + "epoch": 0.9689086735758503, + "grad_norm": 2.7569448635808023, + "learning_rate": 8.560695618891627e-06, + "loss": 1.0457, + "step": 13930 + }, + { + "epoch": 0.969604228976838, + "grad_norm": 3.2870728427806224, + "learning_rate": 8.557853534654568e-06, + "loss": 1.0255, + "step": 13940 + }, + { + "epoch": 0.9702997843778257, + "grad_norm": 3.4924455573760027, + "learning_rate": 8.555009119945708e-06, + "loss": 1.02, + "step": 13950 + }, + { + "epoch": 0.9709953397788134, + "grad_norm": 2.3529398327652506, + "learning_rate": 8.552162376628203e-06, + "loss": 0.9622, + "step": 13960 + }, + { + "epoch": 0.9716908951798011, + "grad_norm": 2.896659880771685, + "learning_rate": 8.54931330656673e-06, + "loss": 0.994, + "step": 13970 + }, + { + "epoch": 0.9723864505807888, + "grad_norm": 3.9942645678167756, + "learning_rate": 8.54646191162749e-06, + "loss": 0.9665, + "step": 13980 + }, + { + "epoch": 0.9730820059817764, + "grad_norm": 2.3751030183986237, + "learning_rate": 8.543608193678216e-06, + "loss": 0.9436, + "step": 13990 + }, + { + "epoch": 0.9737775613827642, + "grad_norm": 1.8831932448017135, + "learning_rate": 8.54075215458815e-06, + "loss": 0.8967, + "step": 14000 + }, + { + "epoch": 0.9737775613827642, + "eval_loss": 0.9970532059669495, + "eval_runtime": 1321.144, + "eval_samples_per_second": 13.746, + "eval_steps_per_second": 2.291, + "step": 14000 + }, + { + "epoch": 0.9744731167837518, + "grad_norm": 2.0272995866239927, + "learning_rate": 8.537893796228061e-06, + "loss": 0.99, + "step": 14010 + }, + { + "epoch": 0.9751686721847395, + "grad_norm": 2.625414829609773, + "learning_rate": 8.535033120470237e-06, + "loss": 0.9764, + "step": 14020 + }, + { + "epoch": 0.9758642275857272, + "grad_norm": 3.7908450055209175, + "learning_rate": 8.532170129188482e-06, + "loss": 0.9886, + "step": 14030 + }, + { + "epoch": 0.9765597829867149, + "grad_norm": 2.1534060866392366, + "learning_rate": 8.52930482425812e-06, + "loss": 0.9715, + "step": 14040 + }, + { + "epoch": 0.9772553383877026, + "grad_norm": 5.856864674832504, + "learning_rate": 8.526437207555986e-06, + "loss": 1.0234, + "step": 14050 + }, + { + "epoch": 0.9779508937886903, + "grad_norm": 1.8371742719834165, + "learning_rate": 8.523567280960433e-06, + "loss": 0.9689, + "step": 14060 + }, + { + "epoch": 0.9786464491896779, + "grad_norm": 2.4425559244601813, + "learning_rate": 8.520695046351324e-06, + "loss": 0.9669, + "step": 14070 + }, + { + "epoch": 0.9793420045906657, + "grad_norm": 2.8311475236237684, + "learning_rate": 8.517820505610038e-06, + "loss": 1.049, + "step": 14080 + }, + { + "epoch": 0.9800375599916533, + "grad_norm": 2.140858383632347, + "learning_rate": 8.514943660619459e-06, + "loss": 1.0368, + "step": 14090 + }, + { + "epoch": 0.9807331153926411, + "grad_norm": 1.6290292798047696, + "learning_rate": 8.512064513263986e-06, + "loss": 0.9878, + "step": 14100 + }, + { + "epoch": 0.9814286707936287, + "grad_norm": 2.330818557361859, + "learning_rate": 8.509183065429522e-06, + "loss": 0.9759, + "step": 14110 + }, + { + "epoch": 0.9821242261946164, + "grad_norm": 2.1377902738154426, + "learning_rate": 8.50629931900348e-06, + "loss": 1.0119, + "step": 14120 + }, + { + "epoch": 0.9828197815956041, + "grad_norm": 2.4030153241438943, + "learning_rate": 8.503413275874773e-06, + "loss": 1.0292, + "step": 14130 + }, + { + "epoch": 0.9835153369965918, + "grad_norm": 2.278307116774152, + "learning_rate": 8.500524937933826e-06, + "loss": 0.9587, + "step": 14140 + }, + { + "epoch": 0.9842108923975794, + "grad_norm": 2.3255680870882216, + "learning_rate": 8.497634307072562e-06, + "loss": 0.9715, + "step": 14150 + }, + { + "epoch": 0.9849064477985672, + "grad_norm": 2.601229268642035, + "learning_rate": 8.494741385184408e-06, + "loss": 0.9893, + "step": 14160 + }, + { + "epoch": 0.9856020031995548, + "grad_norm": 2.2110375214714724, + "learning_rate": 8.49184617416429e-06, + "loss": 0.9867, + "step": 14170 + }, + { + "epoch": 0.9862975586005426, + "grad_norm": 2.2523961860387045, + "learning_rate": 8.488948675908637e-06, + "loss": 1.0081, + "step": 14180 + }, + { + "epoch": 0.9869931140015302, + "grad_norm": 3.4923874928319814, + "learning_rate": 8.486048892315369e-06, + "loss": 0.8994, + "step": 14190 + }, + { + "epoch": 0.9876886694025179, + "grad_norm": 2.406451314686743, + "learning_rate": 8.483146825283912e-06, + "loss": 1.0449, + "step": 14200 + }, + { + "epoch": 0.9883842248035056, + "grad_norm": 2.3789411798257745, + "learning_rate": 8.480242476715181e-06, + "loss": 0.9679, + "step": 14210 + }, + { + "epoch": 0.9890797802044933, + "grad_norm": 2.001311533743165, + "learning_rate": 8.477335848511589e-06, + "loss": 1.0119, + "step": 14220 + }, + { + "epoch": 0.9897753356054809, + "grad_norm": 1.6358036081204348, + "learning_rate": 8.474426942577041e-06, + "loss": 0.9233, + "step": 14230 + }, + { + "epoch": 0.9904708910064687, + "grad_norm": 1.7046474666501565, + "learning_rate": 8.471515760816932e-06, + "loss": 0.9587, + "step": 14240 + }, + { + "epoch": 0.9911664464074563, + "grad_norm": 1.63650950133544, + "learning_rate": 8.468602305138154e-06, + "loss": 0.9907, + "step": 14250 + }, + { + "epoch": 0.9918620018084441, + "grad_norm": 1.6433086739441587, + "learning_rate": 8.46568657744908e-06, + "loss": 1.0237, + "step": 14260 + }, + { + "epoch": 0.9925575572094317, + "grad_norm": 3.4708835275927497, + "learning_rate": 8.462768579659575e-06, + "loss": 1.0069, + "step": 14270 + }, + { + "epoch": 0.9932531126104194, + "grad_norm": 3.4318046189127265, + "learning_rate": 8.459848313680994e-06, + "loss": 1.0069, + "step": 14280 + }, + { + "epoch": 0.9939486680114071, + "grad_norm": 2.199737264269533, + "learning_rate": 8.456925781426173e-06, + "loss": 1.0046, + "step": 14290 + }, + { + "epoch": 0.9946442234123948, + "grad_norm": 2.1906596397136617, + "learning_rate": 8.454000984809437e-06, + "loss": 0.9648, + "step": 14300 + }, + { + "epoch": 0.9953397788133825, + "grad_norm": 2.6577023455653115, + "learning_rate": 8.451073925746586e-06, + "loss": 0.9958, + "step": 14310 + }, + { + "epoch": 0.9960353342143702, + "grad_norm": 4.7153119665517735, + "learning_rate": 8.448144606154917e-06, + "loss": 0.9895, + "step": 14320 + }, + { + "epoch": 0.9967308896153578, + "grad_norm": 2.197637095685147, + "learning_rate": 8.445213027953189e-06, + "loss": 0.9484, + "step": 14330 + }, + { + "epoch": 0.9974264450163456, + "grad_norm": 3.046755092748883, + "learning_rate": 8.442279193061656e-06, + "loss": 0.952, + "step": 14340 + }, + { + "epoch": 0.9981220004173332, + "grad_norm": 2.0326093167338883, + "learning_rate": 8.439343103402042e-06, + "loss": 1.0489, + "step": 14350 + }, + { + "epoch": 0.9988175558183209, + "grad_norm": 13.289683919047459, + "learning_rate": 8.436404760897549e-06, + "loss": 0.9841, + "step": 14360 + }, + { + "epoch": 0.9995131112193086, + "grad_norm": 2.1247983986300865, + "learning_rate": 8.433464167472855e-06, + "loss": 1.0364, + "step": 14370 + }, + { + "epoch": 1.0002086666202963, + "grad_norm": 1.7693073086593194, + "learning_rate": 8.430521325054115e-06, + "loss": 0.9109, + "step": 14380 + }, + { + "epoch": 1.000904222021284, + "grad_norm": 2.0754158177494686, + "learning_rate": 8.427576235568954e-06, + "loss": 0.8415, + "step": 14390 + }, + { + "epoch": 1.0015997774222716, + "grad_norm": 1.9310696816262571, + "learning_rate": 8.424628900946473e-06, + "loss": 0.7498, + "step": 14400 + }, + { + "epoch": 1.0022953328232593, + "grad_norm": 4.8379977329234265, + "learning_rate": 8.421679323117233e-06, + "loss": 0.7854, + "step": 14410 + }, + { + "epoch": 1.002990888224247, + "grad_norm": 1.491258382509482, + "learning_rate": 8.418727504013279e-06, + "loss": 0.7813, + "step": 14420 + }, + { + "epoch": 1.0036864436252348, + "grad_norm": 2.328811569651274, + "learning_rate": 8.415773445568117e-06, + "loss": 0.7934, + "step": 14430 + }, + { + "epoch": 1.0043819990262224, + "grad_norm": 2.360845489684663, + "learning_rate": 8.412817149716714e-06, + "loss": 0.7652, + "step": 14440 + }, + { + "epoch": 1.0050775544272101, + "grad_norm": 19.22458778098758, + "learning_rate": 8.409858618395513e-06, + "loss": 0.8189, + "step": 14450 + }, + { + "epoch": 1.005773109828198, + "grad_norm": 17.39795489836203, + "learning_rate": 8.406897853542415e-06, + "loss": 0.7817, + "step": 14460 + }, + { + "epoch": 1.0064686652291854, + "grad_norm": 1.8214936615185988, + "learning_rate": 8.403934857096787e-06, + "loss": 0.8139, + "step": 14470 + }, + { + "epoch": 1.0071642206301732, + "grad_norm": 2.619471066390921, + "learning_rate": 8.400969630999454e-06, + "loss": 0.7685, + "step": 14480 + }, + { + "epoch": 1.007859776031161, + "grad_norm": 2.7545789317270586, + "learning_rate": 8.398002177192706e-06, + "loss": 0.7984, + "step": 14490 + }, + { + "epoch": 1.0085553314321485, + "grad_norm": 3.542957302918095, + "learning_rate": 8.395032497620292e-06, + "loss": 0.7758, + "step": 14500 + }, + { + "epoch": 1.0085553314321485, + "eval_loss": 1.012967586517334, + "eval_runtime": 1324.3531, + "eval_samples_per_second": 13.713, + "eval_steps_per_second": 2.286, + "step": 14500 + }, + { + "epoch": 1.0092508868331362, + "grad_norm": 3.3043307815589134, + "learning_rate": 8.392060594227412e-06, + "loss": 0.8198, + "step": 14510 + }, + { + "epoch": 1.009946442234124, + "grad_norm": 1.8943773650854228, + "learning_rate": 8.38908646896073e-06, + "loss": 0.7975, + "step": 14520 + }, + { + "epoch": 1.0106419976351115, + "grad_norm": 2.003472495567239, + "learning_rate": 8.386110123768364e-06, + "loss": 0.8334, + "step": 14530 + }, + { + "epoch": 1.0113375530360993, + "grad_norm": 2.283748168581594, + "learning_rate": 8.383131560599887e-06, + "loss": 0.7718, + "step": 14540 + }, + { + "epoch": 1.012033108437087, + "grad_norm": 2.9043770110451526, + "learning_rate": 8.380150781406317e-06, + "loss": 0.7694, + "step": 14550 + }, + { + "epoch": 1.0127286638380748, + "grad_norm": 8.281330948583483, + "learning_rate": 8.377167788140132e-06, + "loss": 0.8004, + "step": 14560 + }, + { + "epoch": 1.0134242192390623, + "grad_norm": 3.485727085574552, + "learning_rate": 8.374182582755262e-06, + "loss": 0.8268, + "step": 14570 + }, + { + "epoch": 1.01411977464005, + "grad_norm": 3.6996752270617144, + "learning_rate": 8.371195167207075e-06, + "loss": 0.7697, + "step": 14580 + }, + { + "epoch": 1.0148153300410379, + "grad_norm": 7.722756781672308, + "learning_rate": 8.3682055434524e-06, + "loss": 0.7559, + "step": 14590 + }, + { + "epoch": 1.0155108854420254, + "grad_norm": 2.2768787419272463, + "learning_rate": 8.3652137134495e-06, + "loss": 0.755, + "step": 14600 + }, + { + "epoch": 1.0162064408430131, + "grad_norm": 4.072398048825866, + "learning_rate": 8.362219679158093e-06, + "loss": 0.7682, + "step": 14610 + }, + { + "epoch": 1.016901996244001, + "grad_norm": 2.021520026299211, + "learning_rate": 8.359223442539335e-06, + "loss": 0.7427, + "step": 14620 + }, + { + "epoch": 1.0175975516449884, + "grad_norm": 4.6671218569364, + "learning_rate": 8.356225005555828e-06, + "loss": 0.7641, + "step": 14630 + }, + { + "epoch": 1.0182931070459762, + "grad_norm": 2.5965022075321285, + "learning_rate": 8.353224370171611e-06, + "loss": 0.7772, + "step": 14640 + }, + { + "epoch": 1.018988662446964, + "grad_norm": 2.1985603017824698, + "learning_rate": 8.35022153835217e-06, + "loss": 0.7953, + "step": 14650 + }, + { + "epoch": 1.0196842178479515, + "grad_norm": 2.834751713534921, + "learning_rate": 8.347216512064421e-06, + "loss": 0.797, + "step": 14660 + }, + { + "epoch": 1.0203797732489392, + "grad_norm": 5.075140993381732, + "learning_rate": 8.344209293276723e-06, + "loss": 0.8195, + "step": 14670 + }, + { + "epoch": 1.021075328649927, + "grad_norm": 2.364461427522112, + "learning_rate": 8.341199883958874e-06, + "loss": 0.82, + "step": 14680 + }, + { + "epoch": 1.0217708840509148, + "grad_norm": 3.160558670861848, + "learning_rate": 8.338188286082098e-06, + "loss": 0.7772, + "step": 14690 + }, + { + "epoch": 1.0224664394519023, + "grad_norm": 4.932989568217441, + "learning_rate": 8.335174501619059e-06, + "loss": 0.7859, + "step": 14700 + }, + { + "epoch": 1.02316199485289, + "grad_norm": 2.3030095014617378, + "learning_rate": 8.33215853254385e-06, + "loss": 0.8209, + "step": 14710 + }, + { + "epoch": 1.0238575502538778, + "grad_norm": 2.400749443507866, + "learning_rate": 8.329140380832002e-06, + "loss": 0.7969, + "step": 14720 + }, + { + "epoch": 1.0245531056548653, + "grad_norm": 1.8947341399183362, + "learning_rate": 8.326120048460464e-06, + "loss": 0.7243, + "step": 14730 + }, + { + "epoch": 1.025248661055853, + "grad_norm": 3.3333068544790336, + "learning_rate": 8.323097537407623e-06, + "loss": 0.7994, + "step": 14740 + }, + { + "epoch": 1.0259442164568409, + "grad_norm": 3.0678492673167943, + "learning_rate": 8.32007284965329e-06, + "loss": 0.7874, + "step": 14750 + }, + { + "epoch": 1.0266397718578284, + "grad_norm": 2.4164694151885113, + "learning_rate": 8.3170459871787e-06, + "loss": 0.7943, + "step": 14760 + }, + { + "epoch": 1.0273353272588162, + "grad_norm": 3.093063643421655, + "learning_rate": 8.314016951966515e-06, + "loss": 0.7311, + "step": 14770 + }, + { + "epoch": 1.028030882659804, + "grad_norm": 1.800163957501535, + "learning_rate": 8.310985746000819e-06, + "loss": 0.8, + "step": 14780 + }, + { + "epoch": 1.0287264380607914, + "grad_norm": 1.9197069325774583, + "learning_rate": 8.307952371267119e-06, + "loss": 0.7937, + "step": 14790 + }, + { + "epoch": 1.0294219934617792, + "grad_norm": 3.5125961113837003, + "learning_rate": 8.304916829752338e-06, + "loss": 0.727, + "step": 14800 + }, + { + "epoch": 1.030117548862767, + "grad_norm": 7.929782348906098, + "learning_rate": 8.301879123444827e-06, + "loss": 0.7278, + "step": 14810 + }, + { + "epoch": 1.0308131042637547, + "grad_norm": 2.031963264385994, + "learning_rate": 8.298839254334349e-06, + "loss": 0.7845, + "step": 14820 + }, + { + "epoch": 1.0315086596647423, + "grad_norm": 1.9522142653183456, + "learning_rate": 8.295797224412082e-06, + "loss": 0.8358, + "step": 14830 + }, + { + "epoch": 1.03220421506573, + "grad_norm": 1.7008469101704944, + "learning_rate": 8.292753035670624e-06, + "loss": 0.8175, + "step": 14840 + }, + { + "epoch": 1.0328997704667178, + "grad_norm": 2.852970124657936, + "learning_rate": 8.289706690103986e-06, + "loss": 0.7977, + "step": 14850 + }, + { + "epoch": 1.0335953258677053, + "grad_norm": 1.7861946163699018, + "learning_rate": 8.286658189707589e-06, + "loss": 0.7711, + "step": 14860 + }, + { + "epoch": 1.034290881268693, + "grad_norm": 2.351441489690811, + "learning_rate": 8.283607536478267e-06, + "loss": 0.8002, + "step": 14870 + }, + { + "epoch": 1.0349864366696808, + "grad_norm": 2.6776791310547985, + "learning_rate": 8.280554732414266e-06, + "loss": 0.7984, + "step": 14880 + }, + { + "epoch": 1.0356819920706684, + "grad_norm": 2.1533558681643155, + "learning_rate": 8.27749977951524e-06, + "loss": 0.7694, + "step": 14890 + }, + { + "epoch": 1.0363775474716561, + "grad_norm": 1.855429539595412, + "learning_rate": 8.274442679782248e-06, + "loss": 0.7537, + "step": 14900 + }, + { + "epoch": 1.0370731028726439, + "grad_norm": 1.712510773574159, + "learning_rate": 8.27138343521776e-06, + "loss": 0.7618, + "step": 14910 + }, + { + "epoch": 1.0377686582736314, + "grad_norm": 2.19574733948517, + "learning_rate": 8.268322047825644e-06, + "loss": 0.8029, + "step": 14920 + }, + { + "epoch": 1.0384642136746192, + "grad_norm": 2.991841306654689, + "learning_rate": 8.265258519611176e-06, + "loss": 0.7923, + "step": 14930 + }, + { + "epoch": 1.039159769075607, + "grad_norm": 2.7214145313092244, + "learning_rate": 8.262192852581039e-06, + "loss": 0.7654, + "step": 14940 + }, + { + "epoch": 1.0398553244765945, + "grad_norm": 1.4963585728944129, + "learning_rate": 8.259125048743306e-06, + "loss": 0.7701, + "step": 14950 + }, + { + "epoch": 1.0405508798775822, + "grad_norm": 1.9277432880803742, + "learning_rate": 8.25605511010746e-06, + "loss": 0.7852, + "step": 14960 + }, + { + "epoch": 1.04124643527857, + "grad_norm": 2.5809005496678226, + "learning_rate": 8.252983038684375e-06, + "loss": 0.7855, + "step": 14970 + }, + { + "epoch": 1.0419419906795577, + "grad_norm": 3.84914432788333, + "learning_rate": 8.249908836486324e-06, + "loss": 0.8878, + "step": 14980 + }, + { + "epoch": 1.0426375460805453, + "grad_norm": 1.805414626385251, + "learning_rate": 8.24683250552698e-06, + "loss": 0.7967, + "step": 14990 + }, + { + "epoch": 1.043333101481533, + "grad_norm": 4.364236396760784, + "learning_rate": 8.243754047821406e-06, + "loss": 0.8182, + "step": 15000 + }, + { + "epoch": 1.043333101481533, + "eval_loss": 1.008154273033142, + "eval_runtime": 1322.4532, + "eval_samples_per_second": 13.733, + "eval_steps_per_second": 2.289, + "step": 15000 + }, + { + "epoch": 1.0440286568825208, + "grad_norm": 2.041301978452966, + "learning_rate": 8.240673465386058e-06, + "loss": 0.7608, + "step": 15010 + }, + { + "epoch": 1.0447242122835083, + "grad_norm": 4.078813803965721, + "learning_rate": 8.237590760238784e-06, + "loss": 0.7724, + "step": 15020 + }, + { + "epoch": 1.045419767684496, + "grad_norm": 6.257191351007447, + "learning_rate": 8.234505934398827e-06, + "loss": 0.7209, + "step": 15030 + }, + { + "epoch": 1.0461153230854838, + "grad_norm": 2.368829876215369, + "learning_rate": 8.23141898988681e-06, + "loss": 0.8592, + "step": 15040 + }, + { + "epoch": 1.0468108784864714, + "grad_norm": 2.6086230635508203, + "learning_rate": 8.228329928724753e-06, + "loss": 0.7802, + "step": 15050 + }, + { + "epoch": 1.0475064338874591, + "grad_norm": 2.3601531418864745, + "learning_rate": 8.225238752936058e-06, + "loss": 0.7734, + "step": 15060 + }, + { + "epoch": 1.0482019892884469, + "grad_norm": 2.228775965862419, + "learning_rate": 8.222145464545511e-06, + "loss": 0.7954, + "step": 15070 + }, + { + "epoch": 1.0488975446894344, + "grad_norm": 2.399773334702507, + "learning_rate": 8.219050065579285e-06, + "loss": 0.7979, + "step": 15080 + }, + { + "epoch": 1.0495931000904222, + "grad_norm": 2.6835894315381745, + "learning_rate": 8.215952558064934e-06, + "loss": 0.7833, + "step": 15090 + }, + { + "epoch": 1.05028865549141, + "grad_norm": 2.250310586888638, + "learning_rate": 8.212852944031394e-06, + "loss": 0.7779, + "step": 15100 + }, + { + "epoch": 1.0509842108923977, + "grad_norm": 1.8929539757913427, + "learning_rate": 8.209751225508975e-06, + "loss": 0.7827, + "step": 15110 + }, + { + "epoch": 1.0516797662933852, + "grad_norm": 1.746505007983024, + "learning_rate": 8.206647404529375e-06, + "loss": 0.8505, + "step": 15120 + }, + { + "epoch": 1.052375321694373, + "grad_norm": 1.913636829464887, + "learning_rate": 8.203541483125666e-06, + "loss": 0.7496, + "step": 15130 + }, + { + "epoch": 1.0530708770953607, + "grad_norm": 2.366205256778208, + "learning_rate": 8.20043346333229e-06, + "loss": 0.7834, + "step": 15140 + }, + { + "epoch": 1.0537664324963483, + "grad_norm": 3.7050401312042363, + "learning_rate": 8.19732334718507e-06, + "loss": 0.8077, + "step": 15150 + }, + { + "epoch": 1.054461987897336, + "grad_norm": 2.3980651154709736, + "learning_rate": 8.1942111367212e-06, + "loss": 0.7883, + "step": 15160 + }, + { + "epoch": 1.0551575432983238, + "grad_norm": 1.8920859624071478, + "learning_rate": 8.191096833979246e-06, + "loss": 0.8032, + "step": 15170 + }, + { + "epoch": 1.0558530986993113, + "grad_norm": 2.9466144305343613, + "learning_rate": 8.187980440999144e-06, + "loss": 0.7737, + "step": 15180 + }, + { + "epoch": 1.056548654100299, + "grad_norm": 2.042308194406671, + "learning_rate": 8.1848619598222e-06, + "loss": 0.7691, + "step": 15190 + }, + { + "epoch": 1.0572442095012868, + "grad_norm": 1.7676245777283346, + "learning_rate": 8.181741392491084e-06, + "loss": 0.7672, + "step": 15200 + }, + { + "epoch": 1.0579397649022744, + "grad_norm": 1.9877167825995652, + "learning_rate": 8.178618741049841e-06, + "loss": 0.7329, + "step": 15210 + }, + { + "epoch": 1.0586353203032621, + "grad_norm": 5.314487082042738, + "learning_rate": 8.175494007543872e-06, + "loss": 0.8402, + "step": 15220 + }, + { + "epoch": 1.0593308757042499, + "grad_norm": 2.2982647709548853, + "learning_rate": 8.172367194019949e-06, + "loss": 0.8005, + "step": 15230 + }, + { + "epoch": 1.0600264311052374, + "grad_norm": 2.941986023752014, + "learning_rate": 8.169238302526201e-06, + "loss": 0.7386, + "step": 15240 + }, + { + "epoch": 1.0607219865062252, + "grad_norm": 2.512538787512095, + "learning_rate": 8.16610733511212e-06, + "loss": 0.8054, + "step": 15250 + }, + { + "epoch": 1.061417541907213, + "grad_norm": 1.9077070839719104, + "learning_rate": 8.162974293828559e-06, + "loss": 0.7319, + "step": 15260 + }, + { + "epoch": 1.0621130973082007, + "grad_norm": 2.1063303724498827, + "learning_rate": 8.159839180727726e-06, + "loss": 0.7659, + "step": 15270 + }, + { + "epoch": 1.0628086527091882, + "grad_norm": 1.9468001628244593, + "learning_rate": 8.156701997863195e-06, + "loss": 0.7678, + "step": 15280 + }, + { + "epoch": 1.063504208110176, + "grad_norm": 4.651298869363503, + "learning_rate": 8.153562747289883e-06, + "loss": 0.7387, + "step": 15290 + }, + { + "epoch": 1.0641997635111637, + "grad_norm": 1.9136179825717037, + "learning_rate": 8.150421431064069e-06, + "loss": 0.7722, + "step": 15300 + }, + { + "epoch": 1.0648953189121513, + "grad_norm": 2.1834212694756268, + "learning_rate": 8.147278051243386e-06, + "loss": 0.7029, + "step": 15310 + }, + { + "epoch": 1.065590874313139, + "grad_norm": 2.231297507895056, + "learning_rate": 8.144132609886815e-06, + "loss": 0.7606, + "step": 15320 + }, + { + "epoch": 1.0662864297141268, + "grad_norm": 2.6988878180575013, + "learning_rate": 8.140985109054688e-06, + "loss": 0.746, + "step": 15330 + }, + { + "epoch": 1.0669819851151143, + "grad_norm": 1.9455203613082785, + "learning_rate": 8.137835550808688e-06, + "loss": 0.7494, + "step": 15340 + }, + { + "epoch": 1.067677540516102, + "grad_norm": 2.3044690415063145, + "learning_rate": 8.134683937211846e-06, + "loss": 0.7994, + "step": 15350 + }, + { + "epoch": 1.0683730959170898, + "grad_norm": 2.6904848418564855, + "learning_rate": 8.131530270328534e-06, + "loss": 0.7576, + "step": 15360 + }, + { + "epoch": 1.0690686513180774, + "grad_norm": 3.7351589780558427, + "learning_rate": 8.128374552224474e-06, + "loss": 0.774, + "step": 15370 + }, + { + "epoch": 1.0697642067190651, + "grad_norm": 6.187234305678746, + "learning_rate": 8.125216784966734e-06, + "loss": 0.8471, + "step": 15380 + }, + { + "epoch": 1.070459762120053, + "grad_norm": 2.6920152372879067, + "learning_rate": 8.122056970623714e-06, + "loss": 0.7444, + "step": 15390 + }, + { + "epoch": 1.0711553175210407, + "grad_norm": 2.2292804173604823, + "learning_rate": 8.118895111265166e-06, + "loss": 0.8016, + "step": 15400 + }, + { + "epoch": 1.0718508729220282, + "grad_norm": 1.9051653452466875, + "learning_rate": 8.115731208962177e-06, + "loss": 0.7859, + "step": 15410 + }, + { + "epoch": 1.072546428323016, + "grad_norm": 2.41435822188529, + "learning_rate": 8.11256526578717e-06, + "loss": 0.8348, + "step": 15420 + }, + { + "epoch": 1.0732419837240037, + "grad_norm": 3.0514765988863313, + "learning_rate": 8.109397283813909e-06, + "loss": 0.7689, + "step": 15430 + }, + { + "epoch": 1.0739375391249912, + "grad_norm": 2.0143480127529902, + "learning_rate": 8.106227265117488e-06, + "loss": 0.7926, + "step": 15440 + }, + { + "epoch": 1.074633094525979, + "grad_norm": 8.8364272110794, + "learning_rate": 8.103055211774343e-06, + "loss": 0.7825, + "step": 15450 + }, + { + "epoch": 1.0753286499269668, + "grad_norm": 3.4629692541956323, + "learning_rate": 8.099881125862237e-06, + "loss": 0.7788, + "step": 15460 + }, + { + "epoch": 1.0760242053279543, + "grad_norm": 4.289156116126039, + "learning_rate": 8.096705009460262e-06, + "loss": 0.8063, + "step": 15470 + }, + { + "epoch": 1.076719760728942, + "grad_norm": 2.378026542083582, + "learning_rate": 8.093526864648848e-06, + "loss": 0.7386, + "step": 15480 + }, + { + "epoch": 1.0774153161299298, + "grad_norm": 2.1801125378242827, + "learning_rate": 8.090346693509749e-06, + "loss": 0.7601, + "step": 15490 + }, + { + "epoch": 1.0781108715309173, + "grad_norm": 2.161701413360215, + "learning_rate": 8.087164498126044e-06, + "loss": 0.7682, + "step": 15500 + }, + { + "epoch": 1.0781108715309173, + "eval_loss": 1.005501627922058, + "eval_runtime": 1323.7105, + "eval_samples_per_second": 13.72, + "eval_steps_per_second": 2.287, + "step": 15500 + }, + { + "epoch": 1.078806426931905, + "grad_norm": 1.7862820305751332, + "learning_rate": 8.083980280582142e-06, + "loss": 0.7402, + "step": 15510 + }, + { + "epoch": 1.0795019823328929, + "grad_norm": 2.283393506088444, + "learning_rate": 8.080794042963774e-06, + "loss": 0.75, + "step": 15520 + }, + { + "epoch": 1.0801975377338806, + "grad_norm": 2.164405287134192, + "learning_rate": 8.077605787357996e-06, + "loss": 0.819, + "step": 15530 + }, + { + "epoch": 1.0808930931348681, + "grad_norm": 2.293995947918096, + "learning_rate": 8.074415515853186e-06, + "loss": 0.6934, + "step": 15540 + }, + { + "epoch": 1.081588648535856, + "grad_norm": 1.8041944256631577, + "learning_rate": 8.07122323053904e-06, + "loss": 0.8165, + "step": 15550 + }, + { + "epoch": 1.0822842039368437, + "grad_norm": 1.63238850564565, + "learning_rate": 8.068028933506576e-06, + "loss": 0.7685, + "step": 15560 + }, + { + "epoch": 1.0829797593378312, + "grad_norm": 1.9455017298778592, + "learning_rate": 8.064832626848127e-06, + "loss": 0.7092, + "step": 15570 + }, + { + "epoch": 1.083675314738819, + "grad_norm": 3.060932693763485, + "learning_rate": 8.061634312657344e-06, + "loss": 0.7462, + "step": 15580 + }, + { + "epoch": 1.0843708701398067, + "grad_norm": 1.8525824696808604, + "learning_rate": 8.058433993029195e-06, + "loss": 0.744, + "step": 15590 + }, + { + "epoch": 1.0850664255407942, + "grad_norm": 2.3380694135478626, + "learning_rate": 8.055231670059958e-06, + "loss": 0.7953, + "step": 15600 + }, + { + "epoch": 1.085761980941782, + "grad_norm": 4.6254969091503115, + "learning_rate": 8.05202734584722e-06, + "loss": 0.7399, + "step": 15610 + }, + { + "epoch": 1.0864575363427698, + "grad_norm": 2.070747974007497, + "learning_rate": 8.04882102248989e-06, + "loss": 0.7852, + "step": 15620 + }, + { + "epoch": 1.0871530917437573, + "grad_norm": 2.114263889909383, + "learning_rate": 8.045612702088177e-06, + "loss": 0.7837, + "step": 15630 + }, + { + "epoch": 1.087848647144745, + "grad_norm": 2.499223381285713, + "learning_rate": 8.0424023867436e-06, + "loss": 0.7975, + "step": 15640 + }, + { + "epoch": 1.0885442025457328, + "grad_norm": 1.913244350131982, + "learning_rate": 8.039190078558987e-06, + "loss": 0.675, + "step": 15650 + }, + { + "epoch": 1.0892397579467206, + "grad_norm": 2.1299775878770193, + "learning_rate": 8.03597577963847e-06, + "loss": 0.7709, + "step": 15660 + }, + { + "epoch": 1.089935313347708, + "grad_norm": 1.644570254768367, + "learning_rate": 8.032759492087485e-06, + "loss": 0.7415, + "step": 15670 + }, + { + "epoch": 1.0906308687486959, + "grad_norm": 2.447268320833942, + "learning_rate": 8.029541218012771e-06, + "loss": 0.7363, + "step": 15680 + }, + { + "epoch": 1.0913264241496836, + "grad_norm": 2.1486357580137487, + "learning_rate": 8.026320959522366e-06, + "loss": 0.7677, + "step": 15690 + }, + { + "epoch": 1.0920219795506712, + "grad_norm": 3.0566815644302627, + "learning_rate": 8.023098718725613e-06, + "loss": 0.8013, + "step": 15700 + }, + { + "epoch": 1.092717534951659, + "grad_norm": 2.764752755667172, + "learning_rate": 8.019874497733144e-06, + "loss": 0.7698, + "step": 15710 + }, + { + "epoch": 1.0934130903526467, + "grad_norm": 2.574210978060382, + "learning_rate": 8.016648298656902e-06, + "loss": 0.8336, + "step": 15720 + }, + { + "epoch": 1.0941086457536342, + "grad_norm": 2.23939620627452, + "learning_rate": 8.013420123610112e-06, + "loss": 0.7512, + "step": 15730 + }, + { + "epoch": 1.094804201154622, + "grad_norm": 2.1139974215003954, + "learning_rate": 8.010189974707302e-06, + "loss": 0.7684, + "step": 15740 + }, + { + "epoch": 1.0954997565556097, + "grad_norm": 1.7705745440110903, + "learning_rate": 8.00695785406429e-06, + "loss": 0.7639, + "step": 15750 + }, + { + "epoch": 1.0961953119565973, + "grad_norm": 2.919045763991456, + "learning_rate": 8.003723763798185e-06, + "loss": 0.8016, + "step": 15760 + }, + { + "epoch": 1.096890867357585, + "grad_norm": 2.57131755764745, + "learning_rate": 8.000487706027386e-06, + "loss": 0.8116, + "step": 15770 + }, + { + "epoch": 1.0975864227585728, + "grad_norm": 2.793062621774531, + "learning_rate": 7.997249682871583e-06, + "loss": 0.7256, + "step": 15780 + }, + { + "epoch": 1.0982819781595605, + "grad_norm": 3.3035373429998085, + "learning_rate": 7.994009696451753e-06, + "loss": 0.778, + "step": 15790 + }, + { + "epoch": 1.098977533560548, + "grad_norm": 2.045621223775346, + "learning_rate": 7.990767748890153e-06, + "loss": 0.8084, + "step": 15800 + }, + { + "epoch": 1.0996730889615358, + "grad_norm": 2.209722247695391, + "learning_rate": 7.987523842310334e-06, + "loss": 0.7747, + "step": 15810 + }, + { + "epoch": 1.1003686443625236, + "grad_norm": 2.2014377966765575, + "learning_rate": 7.984277978837125e-06, + "loss": 0.781, + "step": 15820 + }, + { + "epoch": 1.101064199763511, + "grad_norm": 1.9097567605078565, + "learning_rate": 7.981030160596636e-06, + "loss": 0.7533, + "step": 15830 + }, + { + "epoch": 1.1017597551644989, + "grad_norm": 1.773610722230463, + "learning_rate": 7.97778038971626e-06, + "loss": 0.7749, + "step": 15840 + }, + { + "epoch": 1.1024553105654866, + "grad_norm": 3.269601953629404, + "learning_rate": 7.974528668324668e-06, + "loss": 0.7821, + "step": 15850 + }, + { + "epoch": 1.1031508659664742, + "grad_norm": 2.6686277068679667, + "learning_rate": 7.971274998551808e-06, + "loss": 0.8321, + "step": 15860 + }, + { + "epoch": 1.103846421367462, + "grad_norm": 2.13898689590959, + "learning_rate": 7.968019382528904e-06, + "loss": 0.7654, + "step": 15870 + }, + { + "epoch": 1.1045419767684497, + "grad_norm": 5.014813392173544, + "learning_rate": 7.964761822388458e-06, + "loss": 0.745, + "step": 15880 + }, + { + "epoch": 1.1052375321694372, + "grad_norm": 2.4983173601286106, + "learning_rate": 7.961502320264242e-06, + "loss": 0.7726, + "step": 15890 + }, + { + "epoch": 1.105933087570425, + "grad_norm": 2.2180833407145126, + "learning_rate": 7.9582408782913e-06, + "loss": 0.728, + "step": 15900 + }, + { + "epoch": 1.1066286429714127, + "grad_norm": 1.9701857010396273, + "learning_rate": 7.954977498605949e-06, + "loss": 0.779, + "step": 15910 + }, + { + "epoch": 1.1073241983724003, + "grad_norm": 2.6466970141188626, + "learning_rate": 7.951712183345774e-06, + "loss": 0.7272, + "step": 15920 + }, + { + "epoch": 1.108019753773388, + "grad_norm": 2.329857534545829, + "learning_rate": 7.948444934649626e-06, + "loss": 0.7228, + "step": 15930 + }, + { + "epoch": 1.1087153091743758, + "grad_norm": 2.875987845398697, + "learning_rate": 7.945175754657628e-06, + "loss": 0.758, + "step": 15940 + }, + { + "epoch": 1.1094108645753635, + "grad_norm": 1.9358336770584013, + "learning_rate": 7.94190464551116e-06, + "loss": 0.7987, + "step": 15950 + }, + { + "epoch": 1.110106419976351, + "grad_norm": 2.9979622646017274, + "learning_rate": 7.938631609352873e-06, + "loss": 0.8294, + "step": 15960 + }, + { + "epoch": 1.1108019753773388, + "grad_norm": 2.1327647547502164, + "learning_rate": 7.935356648326675e-06, + "loss": 0.7336, + "step": 15970 + }, + { + "epoch": 1.1114975307783266, + "grad_norm": 1.820935941968983, + "learning_rate": 7.93207976457774e-06, + "loss": 0.7703, + "step": 15980 + }, + { + "epoch": 1.1121930861793141, + "grad_norm": 2.0485294999301185, + "learning_rate": 7.928800960252497e-06, + "loss": 0.81, + "step": 15990 + }, + { + "epoch": 1.1128886415803019, + "grad_norm": 2.0459279398778207, + "learning_rate": 7.925520237498632e-06, + "loss": 0.8002, + "step": 16000 + }, + { + "epoch": 1.1128886415803019, + "eval_loss": 1.0015792846679688, + "eval_runtime": 1320.3024, + "eval_samples_per_second": 13.755, + "eval_steps_per_second": 2.293, + "step": 16000 + }, + { + "epoch": 1.1135841969812896, + "grad_norm": 1.8597462940016958, + "learning_rate": 7.922237598465093e-06, + "loss": 0.7907, + "step": 16010 + }, + { + "epoch": 1.1142797523822772, + "grad_norm": 5.035839395901807, + "learning_rate": 7.91895304530208e-06, + "loss": 0.8029, + "step": 16020 + }, + { + "epoch": 1.114975307783265, + "grad_norm": 1.6329590302083103, + "learning_rate": 7.915666580161046e-06, + "loss": 0.7768, + "step": 16030 + }, + { + "epoch": 1.1156708631842527, + "grad_norm": 2.611632091459868, + "learning_rate": 7.912378205194698e-06, + "loss": 0.736, + "step": 16040 + }, + { + "epoch": 1.1163664185852402, + "grad_norm": 1.7531262613855327, + "learning_rate": 7.909087922556993e-06, + "loss": 0.7904, + "step": 16050 + }, + { + "epoch": 1.117061973986228, + "grad_norm": 2.3432896944054025, + "learning_rate": 7.90579573440314e-06, + "loss": 0.7894, + "step": 16060 + }, + { + "epoch": 1.1177575293872157, + "grad_norm": 2.105688269321465, + "learning_rate": 7.902501642889593e-06, + "loss": 0.7665, + "step": 16070 + }, + { + "epoch": 1.1184530847882033, + "grad_norm": 3.6542509060811073, + "learning_rate": 7.899205650174051e-06, + "loss": 0.8254, + "step": 16080 + }, + { + "epoch": 1.119148640189191, + "grad_norm": 1.8023462420786895, + "learning_rate": 7.895907758415467e-06, + "loss": 0.8006, + "step": 16090 + }, + { + "epoch": 1.1198441955901788, + "grad_norm": 1.6413771780085031, + "learning_rate": 7.892607969774027e-06, + "loss": 0.7521, + "step": 16100 + }, + { + "epoch": 1.1205397509911665, + "grad_norm": 2.1910718380889778, + "learning_rate": 7.889306286411168e-06, + "loss": 0.7892, + "step": 16110 + }, + { + "epoch": 1.121235306392154, + "grad_norm": 2.2183188612460762, + "learning_rate": 7.886002710489562e-06, + "loss": 0.7646, + "step": 16120 + }, + { + "epoch": 1.1219308617931418, + "grad_norm": 6.055963120949321, + "learning_rate": 7.882697244173126e-06, + "loss": 0.7898, + "step": 16130 + }, + { + "epoch": 1.1226264171941296, + "grad_norm": 2.2487429054193586, + "learning_rate": 7.879389889627011e-06, + "loss": 0.7839, + "step": 16140 + }, + { + "epoch": 1.1233219725951171, + "grad_norm": 2.3520155996304366, + "learning_rate": 7.876080649017608e-06, + "loss": 0.7627, + "step": 16150 + }, + { + "epoch": 1.1240175279961049, + "grad_norm": 4.443927747563754, + "learning_rate": 7.872769524512539e-06, + "loss": 0.726, + "step": 16160 + }, + { + "epoch": 1.1247130833970926, + "grad_norm": 1.7615235652491243, + "learning_rate": 7.869456518280666e-06, + "loss": 0.7971, + "step": 16170 + }, + { + "epoch": 1.1254086387980802, + "grad_norm": 2.59257400831061, + "learning_rate": 7.86614163249208e-06, + "loss": 0.7944, + "step": 16180 + }, + { + "epoch": 1.126104194199068, + "grad_norm": 2.9034173968190387, + "learning_rate": 7.8628248693181e-06, + "loss": 0.7543, + "step": 16190 + }, + { + "epoch": 1.1267997496000557, + "grad_norm": 2.1430596133294597, + "learning_rate": 7.859506230931285e-06, + "loss": 0.82, + "step": 16200 + }, + { + "epoch": 1.1274953050010432, + "grad_norm": 3.590522175593638, + "learning_rate": 7.85618571950541e-06, + "loss": 0.757, + "step": 16210 + }, + { + "epoch": 1.128190860402031, + "grad_norm": 2.4688195649031277, + "learning_rate": 7.852863337215483e-06, + "loss": 0.7893, + "step": 16220 + }, + { + "epoch": 1.1288864158030187, + "grad_norm": 3.159369507053288, + "learning_rate": 7.849539086237739e-06, + "loss": 0.7823, + "step": 16230 + }, + { + "epoch": 1.1295819712040065, + "grad_norm": 2.6911651685682325, + "learning_rate": 7.846212968749636e-06, + "loss": 0.8026, + "step": 16240 + }, + { + "epoch": 1.130277526604994, + "grad_norm": 2.5521876406179995, + "learning_rate": 7.842884986929851e-06, + "loss": 0.7328, + "step": 16250 + }, + { + "epoch": 1.1309730820059818, + "grad_norm": 2.593859323827276, + "learning_rate": 7.839555142958284e-06, + "loss": 0.7786, + "step": 16260 + }, + { + "epoch": 1.1316686374069695, + "grad_norm": 1.693628505189045, + "learning_rate": 7.836223439016061e-06, + "loss": 0.8247, + "step": 16270 + }, + { + "epoch": 1.132364192807957, + "grad_norm": 2.2643232246144938, + "learning_rate": 7.832889877285516e-06, + "loss": 0.8578, + "step": 16280 + }, + { + "epoch": 1.1330597482089448, + "grad_norm": 1.7225387560943632, + "learning_rate": 7.829554459950205e-06, + "loss": 0.7756, + "step": 16290 + }, + { + "epoch": 1.1337553036099326, + "grad_norm": 1.5371949401006386, + "learning_rate": 7.8262171891949e-06, + "loss": 0.7869, + "step": 16300 + }, + { + "epoch": 1.1344508590109201, + "grad_norm": 3.728516051305851, + "learning_rate": 7.822878067205589e-06, + "loss": 0.7644, + "step": 16310 + }, + { + "epoch": 1.135146414411908, + "grad_norm": 2.020288462552607, + "learning_rate": 7.819537096169464e-06, + "loss": 0.7765, + "step": 16320 + }, + { + "epoch": 1.1358419698128956, + "grad_norm": 1.6619153035731904, + "learning_rate": 7.81619427827494e-06, + "loss": 0.7052, + "step": 16330 + }, + { + "epoch": 1.1365375252138832, + "grad_norm": 1.939097817630278, + "learning_rate": 7.812849615711632e-06, + "loss": 0.765, + "step": 16340 + }, + { + "epoch": 1.137233080614871, + "grad_norm": 1.8201110321996512, + "learning_rate": 7.809503110670369e-06, + "loss": 0.7084, + "step": 16350 + }, + { + "epoch": 1.1379286360158587, + "grad_norm": 2.11034817840111, + "learning_rate": 7.806154765343183e-06, + "loss": 0.7422, + "step": 16360 + }, + { + "epoch": 1.1386241914168465, + "grad_norm": 1.9526986873069585, + "learning_rate": 7.802804581923316e-06, + "loss": 0.7906, + "step": 16370 + }, + { + "epoch": 1.139319746817834, + "grad_norm": 4.795414744527113, + "learning_rate": 7.79945256260521e-06, + "loss": 0.7902, + "step": 16380 + }, + { + "epoch": 1.1400153022188217, + "grad_norm": 2.0530991407646995, + "learning_rate": 7.796098709584509e-06, + "loss": 0.8178, + "step": 16390 + }, + { + "epoch": 1.1407108576198095, + "grad_norm": 1.8332121293322194, + "learning_rate": 7.792743025058062e-06, + "loss": 0.8273, + "step": 16400 + }, + { + "epoch": 1.141406413020797, + "grad_norm": 2.0661068333252164, + "learning_rate": 7.789385511223917e-06, + "loss": 0.7614, + "step": 16410 + }, + { + "epoch": 1.1421019684217848, + "grad_norm": 2.1590520232308226, + "learning_rate": 7.786026170281316e-06, + "loss": 0.7699, + "step": 16420 + }, + { + "epoch": 1.1427975238227726, + "grad_norm": 1.8982436791864918, + "learning_rate": 7.782665004430702e-06, + "loss": 0.7525, + "step": 16430 + }, + { + "epoch": 1.14349307922376, + "grad_norm": 1.8136063529100597, + "learning_rate": 7.779302015873712e-06, + "loss": 0.8093, + "step": 16440 + }, + { + "epoch": 1.1441886346247478, + "grad_norm": 2.3617075082995, + "learning_rate": 7.775937206813178e-06, + "loss": 0.7218, + "step": 16450 + }, + { + "epoch": 1.1448841900257356, + "grad_norm": 2.2238614653442927, + "learning_rate": 7.772570579453122e-06, + "loss": 0.8011, + "step": 16460 + }, + { + "epoch": 1.1455797454267231, + "grad_norm": 2.0551508472216105, + "learning_rate": 7.769202135998758e-06, + "loss": 0.7709, + "step": 16470 + }, + { + "epoch": 1.146275300827711, + "grad_norm": 1.7079938958760612, + "learning_rate": 7.765831878656491e-06, + "loss": 0.753, + "step": 16480 + }, + { + "epoch": 1.1469708562286987, + "grad_norm": 2.02693774785408, + "learning_rate": 7.762459809633915e-06, + "loss": 0.7473, + "step": 16490 + }, + { + "epoch": 1.1476664116296864, + "grad_norm": 1.821177744425402, + "learning_rate": 7.759085931139808e-06, + "loss": 0.7807, + "step": 16500 + }, + { + "epoch": 1.1476664116296864, + "eval_loss": 1.0020612478256226, + "eval_runtime": 1322.529, + "eval_samples_per_second": 13.732, + "eval_steps_per_second": 2.289, + "step": 16500 + }, + { + "epoch": 1.148361967030674, + "grad_norm": 3.7316951910599268, + "learning_rate": 7.75571024538413e-06, + "loss": 0.7416, + "step": 16510 + }, + { + "epoch": 1.1490575224316617, + "grad_norm": 1.6114087831740995, + "learning_rate": 7.752332754578035e-06, + "loss": 0.7708, + "step": 16520 + }, + { + "epoch": 1.1497530778326495, + "grad_norm": 1.9668807006863875, + "learning_rate": 7.748953460933849e-06, + "loss": 0.7818, + "step": 16530 + }, + { + "epoch": 1.150448633233637, + "grad_norm": 2.2029826372567265, + "learning_rate": 7.745572366665085e-06, + "loss": 0.7458, + "step": 16540 + }, + { + "epoch": 1.1511441886346248, + "grad_norm": 4.124112139621779, + "learning_rate": 7.742189473986434e-06, + "loss": 0.7935, + "step": 16550 + }, + { + "epoch": 1.1518397440356125, + "grad_norm": 1.911038666537639, + "learning_rate": 7.738804785113762e-06, + "loss": 0.7438, + "step": 16560 + }, + { + "epoch": 1.1525352994366, + "grad_norm": 2.5588095739046373, + "learning_rate": 7.735418302264119e-06, + "loss": 0.8093, + "step": 16570 + }, + { + "epoch": 1.1532308548375878, + "grad_norm": 2.659014640360898, + "learning_rate": 7.732030027655719e-06, + "loss": 0.7901, + "step": 16580 + }, + { + "epoch": 1.1539264102385756, + "grad_norm": 2.1190558850926173, + "learning_rate": 7.728639963507962e-06, + "loss": 0.8226, + "step": 16590 + }, + { + "epoch": 1.154621965639563, + "grad_norm": 2.1558248725933864, + "learning_rate": 7.72524811204141e-06, + "loss": 0.7609, + "step": 16600 + }, + { + "epoch": 1.1553175210405509, + "grad_norm": 2.7010357874250817, + "learning_rate": 7.721854475477802e-06, + "loss": 0.7656, + "step": 16610 + }, + { + "epoch": 1.1560130764415386, + "grad_norm": 3.206004042369307, + "learning_rate": 7.718459056040042e-06, + "loss": 0.7928, + "step": 16620 + }, + { + "epoch": 1.1567086318425264, + "grad_norm": 2.7612633815513363, + "learning_rate": 7.715061855952206e-06, + "loss": 0.7527, + "step": 16630 + }, + { + "epoch": 1.157404187243514, + "grad_norm": 1.7121261300214738, + "learning_rate": 7.711662877439531e-06, + "loss": 0.7526, + "step": 16640 + }, + { + "epoch": 1.1580997426445017, + "grad_norm": 1.7661411892741479, + "learning_rate": 7.708262122728426e-06, + "loss": 0.7418, + "step": 16650 + }, + { + "epoch": 1.1587952980454894, + "grad_norm": 1.6874942185599806, + "learning_rate": 7.704859594046462e-06, + "loss": 0.7957, + "step": 16660 + }, + { + "epoch": 1.159490853446477, + "grad_norm": 2.78554388762539, + "learning_rate": 7.701455293622361e-06, + "loss": 0.8409, + "step": 16670 + }, + { + "epoch": 1.1601864088474647, + "grad_norm": 3.035761805378373, + "learning_rate": 7.698049223686021e-06, + "loss": 0.7465, + "step": 16680 + }, + { + "epoch": 1.1608819642484525, + "grad_norm": 6.704926617795548, + "learning_rate": 7.69464138646849e-06, + "loss": 0.7496, + "step": 16690 + }, + { + "epoch": 1.16157751964944, + "grad_norm": 3.186939871528251, + "learning_rate": 7.691231784201976e-06, + "loss": 0.7988, + "step": 16700 + }, + { + "epoch": 1.1622730750504278, + "grad_norm": 2.7247943197353086, + "learning_rate": 7.687820419119843e-06, + "loss": 0.7726, + "step": 16710 + }, + { + "epoch": 1.1629686304514155, + "grad_norm": 1.6616678517717085, + "learning_rate": 7.68440729345661e-06, + "loss": 0.7462, + "step": 16720 + }, + { + "epoch": 1.163664185852403, + "grad_norm": 4.46227365370575, + "learning_rate": 7.680992409447949e-06, + "loss": 0.7745, + "step": 16730 + }, + { + "epoch": 1.1643597412533908, + "grad_norm": 5.109408245278419, + "learning_rate": 7.677575769330682e-06, + "loss": 0.7604, + "step": 16740 + }, + { + "epoch": 1.1650552966543786, + "grad_norm": 1.8181251635946558, + "learning_rate": 7.674157375342785e-06, + "loss": 0.7621, + "step": 16750 + }, + { + "epoch": 1.1657508520553663, + "grad_norm": 2.6193321550903663, + "learning_rate": 7.670737229723381e-06, + "loss": 0.7728, + "step": 16760 + }, + { + "epoch": 1.1664464074563539, + "grad_norm": 1.8475949565388012, + "learning_rate": 7.667315334712738e-06, + "loss": 0.7663, + "step": 16770 + }, + { + "epoch": 1.1671419628573416, + "grad_norm": 1.5623834765488078, + "learning_rate": 7.663891692552273e-06, + "loss": 0.7779, + "step": 16780 + }, + { + "epoch": 1.1678375182583292, + "grad_norm": 1.8013388393097125, + "learning_rate": 7.660466305484546e-06, + "loss": 0.8074, + "step": 16790 + }, + { + "epoch": 1.168533073659317, + "grad_norm": 2.2811021483902567, + "learning_rate": 7.65703917575326e-06, + "loss": 0.7752, + "step": 16800 + }, + { + "epoch": 1.1692286290603047, + "grad_norm": 3.126614122983318, + "learning_rate": 7.653610305603263e-06, + "loss": 0.7583, + "step": 16810 + }, + { + "epoch": 1.1699241844612924, + "grad_norm": 1.869469074462047, + "learning_rate": 7.650179697280537e-06, + "loss": 0.7954, + "step": 16820 + }, + { + "epoch": 1.17061973986228, + "grad_norm": 2.309588090813393, + "learning_rate": 7.646747353032205e-06, + "loss": 0.7656, + "step": 16830 + }, + { + "epoch": 1.1713152952632677, + "grad_norm": 1.8869017332606937, + "learning_rate": 7.643313275106529e-06, + "loss": 0.7823, + "step": 16840 + }, + { + "epoch": 1.1720108506642555, + "grad_norm": 2.531350285300656, + "learning_rate": 7.639877465752902e-06, + "loss": 0.785, + "step": 16850 + }, + { + "epoch": 1.172706406065243, + "grad_norm": 2.5437710334375985, + "learning_rate": 7.63643992722186e-06, + "loss": 0.7617, + "step": 16860 + }, + { + "epoch": 1.1734019614662308, + "grad_norm": 3.99981318630921, + "learning_rate": 7.63300066176506e-06, + "loss": 0.7898, + "step": 16870 + }, + { + "epoch": 1.1740975168672185, + "grad_norm": 2.1562880763485563, + "learning_rate": 7.629559671635302e-06, + "loss": 0.7845, + "step": 16880 + }, + { + "epoch": 1.1747930722682063, + "grad_norm": 1.766004061604684, + "learning_rate": 7.626116959086502e-06, + "loss": 0.7077, + "step": 16890 + }, + { + "epoch": 1.1754886276691938, + "grad_norm": 1.9324428508219686, + "learning_rate": 7.62267252637372e-06, + "loss": 0.8203, + "step": 16900 + }, + { + "epoch": 1.1761841830701816, + "grad_norm": 1.8750943925505448, + "learning_rate": 7.61922637575313e-06, + "loss": 0.7732, + "step": 16910 + }, + { + "epoch": 1.1768797384711691, + "grad_norm": 1.9814132011212648, + "learning_rate": 7.6157785094820345e-06, + "loss": 0.7723, + "step": 16920 + }, + { + "epoch": 1.1775752938721569, + "grad_norm": 2.2142512374067507, + "learning_rate": 7.612328929818866e-06, + "loss": 0.7707, + "step": 16930 + }, + { + "epoch": 1.1782708492731446, + "grad_norm": 2.2813346447179557, + "learning_rate": 7.6088776390231714e-06, + "loss": 0.7827, + "step": 16940 + }, + { + "epoch": 1.1789664046741324, + "grad_norm": 2.506501245481797, + "learning_rate": 7.605424639355623e-06, + "loss": 0.7557, + "step": 16950 + }, + { + "epoch": 1.17966196007512, + "grad_norm": 4.470886479085731, + "learning_rate": 7.601969933078009e-06, + "loss": 0.7395, + "step": 16960 + }, + { + "epoch": 1.1803575154761077, + "grad_norm": 5.287905528897376, + "learning_rate": 7.598513522453239e-06, + "loss": 0.8064, + "step": 16970 + }, + { + "epoch": 1.1810530708770954, + "grad_norm": 1.777506513624609, + "learning_rate": 7.595055409745339e-06, + "loss": 0.7899, + "step": 16980 + }, + { + "epoch": 1.181748626278083, + "grad_norm": 2.5169539292534613, + "learning_rate": 7.5915955972194445e-06, + "loss": 0.8041, + "step": 16990 + }, + { + "epoch": 1.1824441816790707, + "grad_norm": 2.137582103670167, + "learning_rate": 7.588134087141812e-06, + "loss": 0.8314, + "step": 17000 + }, + { + "epoch": 1.1824441816790707, + "eval_loss": 0.9981618523597717, + "eval_runtime": 1320.119, + "eval_samples_per_second": 13.757, + "eval_steps_per_second": 2.293, + "step": 17000 + }, + { + "epoch": 1.1831397370800585, + "grad_norm": 1.8581009099993406, + "learning_rate": 7.584670881779803e-06, + "loss": 0.7741, + "step": 17010 + }, + { + "epoch": 1.183835292481046, + "grad_norm": 3.2859835536860884, + "learning_rate": 7.581205983401896e-06, + "loss": 0.7177, + "step": 17020 + }, + { + "epoch": 1.1845308478820338, + "grad_norm": 2.0126732767542035, + "learning_rate": 7.5777393942776725e-06, + "loss": 0.7567, + "step": 17030 + }, + { + "epoch": 1.1852264032830215, + "grad_norm": 1.7042819946911698, + "learning_rate": 7.574271116677826e-06, + "loss": 0.7552, + "step": 17040 + }, + { + "epoch": 1.185921958684009, + "grad_norm": 3.4081260054515994, + "learning_rate": 7.570801152874153e-06, + "loss": 0.8006, + "step": 17050 + }, + { + "epoch": 1.1866175140849968, + "grad_norm": 1.665906394997139, + "learning_rate": 7.567329505139556e-06, + "loss": 0.7359, + "step": 17060 + }, + { + "epoch": 1.1873130694859846, + "grad_norm": 2.9102543399913787, + "learning_rate": 7.563856175748039e-06, + "loss": 0.8052, + "step": 17070 + }, + { + "epoch": 1.1880086248869723, + "grad_norm": 2.7835954387225876, + "learning_rate": 7.560381166974711e-06, + "loss": 0.7222, + "step": 17080 + }, + { + "epoch": 1.1887041802879599, + "grad_norm": 2.116206748667092, + "learning_rate": 7.556904481095777e-06, + "loss": 0.6945, + "step": 17090 + }, + { + "epoch": 1.1893997356889476, + "grad_norm": 2.6474974919029473, + "learning_rate": 7.553426120388542e-06, + "loss": 0.7904, + "step": 17100 + }, + { + "epoch": 1.1900952910899354, + "grad_norm": 1.6542420582961852, + "learning_rate": 7.549946087131408e-06, + "loss": 0.7669, + "step": 17110 + }, + { + "epoch": 1.190790846490923, + "grad_norm": 2.634540799302254, + "learning_rate": 7.546464383603875e-06, + "loss": 0.8292, + "step": 17120 + }, + { + "epoch": 1.1914864018919107, + "grad_norm": 3.1541944680302656, + "learning_rate": 7.542981012086532e-06, + "loss": 0.7286, + "step": 17130 + }, + { + "epoch": 1.1921819572928984, + "grad_norm": 4.691167177727303, + "learning_rate": 7.539495974861066e-06, + "loss": 0.8254, + "step": 17140 + }, + { + "epoch": 1.192877512693886, + "grad_norm": 2.1324009285148833, + "learning_rate": 7.536009274210251e-06, + "loss": 0.7517, + "step": 17150 + }, + { + "epoch": 1.1935730680948737, + "grad_norm": 1.9660583603569404, + "learning_rate": 7.532520912417953e-06, + "loss": 0.7958, + "step": 17160 + }, + { + "epoch": 1.1942686234958615, + "grad_norm": 2.6490876830011185, + "learning_rate": 7.529030891769124e-06, + "loss": 0.7713, + "step": 17170 + }, + { + "epoch": 1.194964178896849, + "grad_norm": 2.0406926536855168, + "learning_rate": 7.525539214549805e-06, + "loss": 0.7688, + "step": 17180 + }, + { + "epoch": 1.1956597342978368, + "grad_norm": 2.504823337437613, + "learning_rate": 7.52204588304712e-06, + "loss": 0.7974, + "step": 17190 + }, + { + "epoch": 1.1963552896988245, + "grad_norm": 2.6977730906411708, + "learning_rate": 7.51855089954928e-06, + "loss": 0.7419, + "step": 17200 + }, + { + "epoch": 1.1970508450998123, + "grad_norm": 2.1711788901148563, + "learning_rate": 7.515054266345571e-06, + "loss": 0.7374, + "step": 17210 + }, + { + "epoch": 1.1977464005007998, + "grad_norm": 2.6803844660676055, + "learning_rate": 7.5115559857263664e-06, + "loss": 0.7648, + "step": 17220 + }, + { + "epoch": 1.1984419559017876, + "grad_norm": 1.9158994093160746, + "learning_rate": 7.508056059983119e-06, + "loss": 0.8007, + "step": 17230 + }, + { + "epoch": 1.1991375113027753, + "grad_norm": 1.6913106870155072, + "learning_rate": 7.5045544914083515e-06, + "loss": 0.742, + "step": 17240 + }, + { + "epoch": 1.1998330667037629, + "grad_norm": 2.0014719293894903, + "learning_rate": 7.5010512822956706e-06, + "loss": 0.8042, + "step": 17250 + }, + { + "epoch": 1.2005286221047506, + "grad_norm": 2.0154154566255893, + "learning_rate": 7.497546434939756e-06, + "loss": 0.7672, + "step": 17260 + }, + { + "epoch": 1.2012241775057384, + "grad_norm": 7.132444295126552, + "learning_rate": 7.494039951636359e-06, + "loss": 0.7652, + "step": 17270 + }, + { + "epoch": 1.201919732906726, + "grad_norm": 2.8583448537320297, + "learning_rate": 7.4905318346823e-06, + "loss": 0.7071, + "step": 17280 + }, + { + "epoch": 1.2026152883077137, + "grad_norm": 3.098218350044356, + "learning_rate": 7.487022086375474e-06, + "loss": 0.7969, + "step": 17290 + }, + { + "epoch": 1.2033108437087014, + "grad_norm": 2.634061655640909, + "learning_rate": 7.483510709014845e-06, + "loss": 0.8593, + "step": 17300 + }, + { + "epoch": 1.204006399109689, + "grad_norm": 2.1327622335840664, + "learning_rate": 7.479997704900437e-06, + "loss": 0.7554, + "step": 17310 + }, + { + "epoch": 1.2047019545106767, + "grad_norm": 2.112842366957866, + "learning_rate": 7.4764830763333485e-06, + "loss": 0.8079, + "step": 17320 + }, + { + "epoch": 1.2053975099116645, + "grad_norm": 2.690091961060749, + "learning_rate": 7.472966825615738e-06, + "loss": 0.776, + "step": 17330 + }, + { + "epoch": 1.2060930653126523, + "grad_norm": 2.64165494511701, + "learning_rate": 7.4694489550508235e-06, + "loss": 0.7604, + "step": 17340 + }, + { + "epoch": 1.2067886207136398, + "grad_norm": 1.8571724926250404, + "learning_rate": 7.465929466942888e-06, + "loss": 0.7764, + "step": 17350 + }, + { + "epoch": 1.2074841761146275, + "grad_norm": 2.3995099400124045, + "learning_rate": 7.462408363597276e-06, + "loss": 0.7815, + "step": 17360 + }, + { + "epoch": 1.2081797315156153, + "grad_norm": 1.4599555758629184, + "learning_rate": 7.458885647320384e-06, + "loss": 0.7959, + "step": 17370 + }, + { + "epoch": 1.2088752869166028, + "grad_norm": 2.104288077046386, + "learning_rate": 7.455361320419669e-06, + "loss": 0.7623, + "step": 17380 + }, + { + "epoch": 1.2095708423175906, + "grad_norm": 2.2058004708086254, + "learning_rate": 7.451835385203644e-06, + "loss": 0.7295, + "step": 17390 + }, + { + "epoch": 1.2102663977185784, + "grad_norm": 2.1203310669338986, + "learning_rate": 7.448307843981871e-06, + "loss": 0.8321, + "step": 17400 + }, + { + "epoch": 1.210961953119566, + "grad_norm": 1.947081312004404, + "learning_rate": 7.444778699064968e-06, + "loss": 0.8128, + "step": 17410 + }, + { + "epoch": 1.2116575085205536, + "grad_norm": 1.8706075485238056, + "learning_rate": 7.441247952764601e-06, + "loss": 0.7341, + "step": 17420 + }, + { + "epoch": 1.2123530639215414, + "grad_norm": 2.871691791420935, + "learning_rate": 7.437715607393486e-06, + "loss": 0.7915, + "step": 17430 + }, + { + "epoch": 1.213048619322529, + "grad_norm": 1.9432419603533932, + "learning_rate": 7.434181665265388e-06, + "loss": 0.7267, + "step": 17440 + }, + { + "epoch": 1.2137441747235167, + "grad_norm": 1.7873752103865772, + "learning_rate": 7.4306461286951135e-06, + "loss": 0.6983, + "step": 17450 + }, + { + "epoch": 1.2144397301245045, + "grad_norm": 2.293627657435503, + "learning_rate": 7.42710899999852e-06, + "loss": 0.8153, + "step": 17460 + }, + { + "epoch": 1.2151352855254922, + "grad_norm": 9.548900430322604, + "learning_rate": 7.4235702814925e-06, + "loss": 0.7968, + "step": 17470 + }, + { + "epoch": 1.2158308409264798, + "grad_norm": 1.9121674105185644, + "learning_rate": 7.420029975494996e-06, + "loss": 0.7438, + "step": 17480 + }, + { + "epoch": 1.2165263963274675, + "grad_norm": 2.8005803278386074, + "learning_rate": 7.416488084324981e-06, + "loss": 0.7567, + "step": 17490 + }, + { + "epoch": 1.2172219517284553, + "grad_norm": 3.8964106049943292, + "learning_rate": 7.4129446103024725e-06, + "loss": 0.7788, + "step": 17500 + }, + { + "epoch": 1.2172219517284553, + "eval_loss": 0.99642413854599, + "eval_runtime": 1322.861, + "eval_samples_per_second": 13.729, + "eval_steps_per_second": 2.288, + "step": 17500 + }, + { + "epoch": 1.2179175071294428, + "grad_norm": 1.7600581897839367, + "learning_rate": 7.409399555748526e-06, + "loss": 0.7781, + "step": 17510 + }, + { + "epoch": 1.2186130625304306, + "grad_norm": 3.468186895754951, + "learning_rate": 7.405852922985228e-06, + "loss": 0.7968, + "step": 17520 + }, + { + "epoch": 1.2193086179314183, + "grad_norm": 2.1756592407195505, + "learning_rate": 7.4023047143357e-06, + "loss": 0.7599, + "step": 17530 + }, + { + "epoch": 1.2200041733324059, + "grad_norm": 2.484210848175499, + "learning_rate": 7.398754932124096e-06, + "loss": 0.7931, + "step": 17540 + }, + { + "epoch": 1.2206997287333936, + "grad_norm": 2.630797889894326, + "learning_rate": 7.395203578675603e-06, + "loss": 0.7704, + "step": 17550 + }, + { + "epoch": 1.2213952841343814, + "grad_norm": 2.3025978203296735, + "learning_rate": 7.3916506563164325e-06, + "loss": 0.7761, + "step": 17560 + }, + { + "epoch": 1.222090839535369, + "grad_norm": 2.4864366090319026, + "learning_rate": 7.388096167373826e-06, + "loss": 0.8142, + "step": 17570 + }, + { + "epoch": 1.2227863949363567, + "grad_norm": 2.553756553710882, + "learning_rate": 7.384540114176056e-06, + "loss": 0.7927, + "step": 17580 + }, + { + "epoch": 1.2234819503373444, + "grad_norm": 1.696254865008682, + "learning_rate": 7.38098249905241e-06, + "loss": 0.7394, + "step": 17590 + }, + { + "epoch": 1.2241775057383322, + "grad_norm": 1.9946838771753401, + "learning_rate": 7.3774233243332035e-06, + "loss": 0.7901, + "step": 17600 + }, + { + "epoch": 1.2248730611393197, + "grad_norm": 2.3137955859814716, + "learning_rate": 7.3738625923497785e-06, + "loss": 0.81, + "step": 17610 + }, + { + "epoch": 1.2255686165403075, + "grad_norm": 1.4260663806350895, + "learning_rate": 7.370300305434489e-06, + "loss": 0.7443, + "step": 17620 + }, + { + "epoch": 1.226264171941295, + "grad_norm": 2.2202700145913488, + "learning_rate": 7.366736465920709e-06, + "loss": 0.8013, + "step": 17630 + }, + { + "epoch": 1.2269597273422828, + "grad_norm": 2.288812053923346, + "learning_rate": 7.363171076142836e-06, + "loss": 0.7908, + "step": 17640 + }, + { + "epoch": 1.2276552827432705, + "grad_norm": 2.019784106434881, + "learning_rate": 7.359604138436274e-06, + "loss": 0.7508, + "step": 17650 + }, + { + "epoch": 1.2283508381442583, + "grad_norm": 2.358898806003495, + "learning_rate": 7.356035655137447e-06, + "loss": 0.7713, + "step": 17660 + }, + { + "epoch": 1.2290463935452458, + "grad_norm": 2.0631534017696707, + "learning_rate": 7.352465628583789e-06, + "loss": 0.7664, + "step": 17670 + }, + { + "epoch": 1.2297419489462336, + "grad_norm": 2.5205947854857818, + "learning_rate": 7.348894061113747e-06, + "loss": 0.7532, + "step": 17680 + }, + { + "epoch": 1.2304375043472213, + "grad_norm": 1.9750342696652357, + "learning_rate": 7.345320955066773e-06, + "loss": 0.8295, + "step": 17690 + }, + { + "epoch": 1.2311330597482089, + "grad_norm": 2.2048763334487664, + "learning_rate": 7.341746312783331e-06, + "loss": 0.7371, + "step": 17700 + }, + { + "epoch": 1.2318286151491966, + "grad_norm": 3.2951901128294385, + "learning_rate": 7.33817013660489e-06, + "loss": 0.804, + "step": 17710 + }, + { + "epoch": 1.2325241705501844, + "grad_norm": 2.926005910079128, + "learning_rate": 7.334592428873924e-06, + "loss": 0.786, + "step": 17720 + }, + { + "epoch": 1.2332197259511721, + "grad_norm": 2.625437336154398, + "learning_rate": 7.331013191933908e-06, + "loss": 0.8053, + "step": 17730 + }, + { + "epoch": 1.2339152813521597, + "grad_norm": 1.6201415002000674, + "learning_rate": 7.327432428129322e-06, + "loss": 0.772, + "step": 17740 + }, + { + "epoch": 1.2346108367531474, + "grad_norm": 1.9745247793644694, + "learning_rate": 7.323850139805643e-06, + "loss": 0.7843, + "step": 17750 + }, + { + "epoch": 1.235306392154135, + "grad_norm": 1.746127804171034, + "learning_rate": 7.320266329309349e-06, + "loss": 0.767, + "step": 17760 + }, + { + "epoch": 1.2360019475551227, + "grad_norm": 2.4480333820660043, + "learning_rate": 7.316680998987915e-06, + "loss": 0.7719, + "step": 17770 + }, + { + "epoch": 1.2366975029561105, + "grad_norm": 2.4373626952516307, + "learning_rate": 7.31309415118981e-06, + "loss": 0.7516, + "step": 17780 + }, + { + "epoch": 1.2373930583570982, + "grad_norm": 2.0377527937111877, + "learning_rate": 7.309505788264496e-06, + "loss": 0.7406, + "step": 17790 + }, + { + "epoch": 1.2380886137580858, + "grad_norm": 1.8785236734680293, + "learning_rate": 7.305915912562432e-06, + "loss": 0.7841, + "step": 17800 + }, + { + "epoch": 1.2387841691590735, + "grad_norm": 2.749265819000473, + "learning_rate": 7.302324526435064e-06, + "loss": 0.7838, + "step": 17810 + }, + { + "epoch": 1.2394797245600613, + "grad_norm": 1.7926100175034565, + "learning_rate": 7.298731632234827e-06, + "loss": 0.7823, + "step": 17820 + }, + { + "epoch": 1.2401752799610488, + "grad_norm": 2.0104651459919167, + "learning_rate": 7.295137232315148e-06, + "loss": 0.7776, + "step": 17830 + }, + { + "epoch": 1.2408708353620366, + "grad_norm": 2.178683453123003, + "learning_rate": 7.291541329030434e-06, + "loss": 0.6994, + "step": 17840 + }, + { + "epoch": 1.2415663907630243, + "grad_norm": 2.08449485339008, + "learning_rate": 7.287943924736082e-06, + "loss": 0.7603, + "step": 17850 + }, + { + "epoch": 1.242261946164012, + "grad_norm": 1.9265850940321556, + "learning_rate": 7.28434502178847e-06, + "loss": 0.7835, + "step": 17860 + }, + { + "epoch": 1.2429575015649996, + "grad_norm": 2.276843545342071, + "learning_rate": 7.2807446225449606e-06, + "loss": 0.792, + "step": 17870 + }, + { + "epoch": 1.2436530569659874, + "grad_norm": 3.0103680226083203, + "learning_rate": 7.277142729363891e-06, + "loss": 0.7984, + "step": 17880 + }, + { + "epoch": 1.244348612366975, + "grad_norm": 1.9331450804336252, + "learning_rate": 7.27353934460458e-06, + "loss": 0.7769, + "step": 17890 + }, + { + "epoch": 1.2450441677679627, + "grad_norm": 1.4840990611068878, + "learning_rate": 7.269934470627325e-06, + "loss": 0.725, + "step": 17900 + }, + { + "epoch": 1.2457397231689504, + "grad_norm": 2.16274334427615, + "learning_rate": 7.266328109793396e-06, + "loss": 0.7992, + "step": 17910 + }, + { + "epoch": 1.2464352785699382, + "grad_norm": 2.907712210104194, + "learning_rate": 7.262720264465038e-06, + "loss": 0.7905, + "step": 17920 + }, + { + "epoch": 1.2471308339709257, + "grad_norm": 2.2622338767549492, + "learning_rate": 7.259110937005468e-06, + "loss": 0.8024, + "step": 17930 + }, + { + "epoch": 1.2478263893719135, + "grad_norm": 2.368235219174843, + "learning_rate": 7.2555001297788775e-06, + "loss": 0.7563, + "step": 17940 + }, + { + "epoch": 1.2485219447729012, + "grad_norm": 2.055736145778879, + "learning_rate": 7.25188784515042e-06, + "loss": 0.8231, + "step": 17950 + }, + { + "epoch": 1.2492175001738888, + "grad_norm": 1.889564819023402, + "learning_rate": 7.2482740854862245e-06, + "loss": 0.7792, + "step": 17960 + }, + { + "epoch": 1.2499130555748765, + "grad_norm": 1.9513141716659577, + "learning_rate": 7.244658853153379e-06, + "loss": 0.7736, + "step": 17970 + }, + { + "epoch": 1.2506086109758643, + "grad_norm": 1.9730209003881736, + "learning_rate": 7.241042150519943e-06, + "loss": 0.8083, + "step": 17980 + }, + { + "epoch": 1.251304166376852, + "grad_norm": 1.977397802809894, + "learning_rate": 7.237423979954934e-06, + "loss": 0.7686, + "step": 17990 + }, + { + "epoch": 1.2519997217778396, + "grad_norm": 3.711469448173577, + "learning_rate": 7.2338043438283324e-06, + "loss": 0.7805, + "step": 18000 + }, + { + "epoch": 1.2519997217778396, + "eval_loss": 0.9935358166694641, + "eval_runtime": 1321.2202, + "eval_samples_per_second": 13.746, + "eval_steps_per_second": 2.291, + "step": 18000 + }, + { + "epoch": 1.2526952771788273, + "grad_norm": 1.7765323568083324, + "learning_rate": 7.230183244511081e-06, + "loss": 0.774, + "step": 18010 + }, + { + "epoch": 1.2533908325798149, + "grad_norm": 2.4662277146996994, + "learning_rate": 7.226560684375077e-06, + "loss": 0.7924, + "step": 18020 + }, + { + "epoch": 1.2540863879808026, + "grad_norm": 2.3404738050226634, + "learning_rate": 7.2229366657931755e-06, + "loss": 0.7717, + "step": 18030 + }, + { + "epoch": 1.2547819433817904, + "grad_norm": 2.258003356635941, + "learning_rate": 7.219311191139191e-06, + "loss": 0.76, + "step": 18040 + }, + { + "epoch": 1.2554774987827781, + "grad_norm": 2.1895245852612644, + "learning_rate": 7.2156842627878856e-06, + "loss": 0.7653, + "step": 18050 + }, + { + "epoch": 1.2561730541837657, + "grad_norm": 1.78288199853681, + "learning_rate": 7.212055883114979e-06, + "loss": 0.7565, + "step": 18060 + }, + { + "epoch": 1.2568686095847534, + "grad_norm": 1.969957058363858, + "learning_rate": 7.208426054497137e-06, + "loss": 0.7734, + "step": 18070 + }, + { + "epoch": 1.2575641649857412, + "grad_norm": 2.3784319735373334, + "learning_rate": 7.204794779311979e-06, + "loss": 0.7763, + "step": 18080 + }, + { + "epoch": 1.2582597203867287, + "grad_norm": 3.3514320955370893, + "learning_rate": 7.201162059938068e-06, + "loss": 0.7496, + "step": 18090 + }, + { + "epoch": 1.2589552757877165, + "grad_norm": 2.791298659509157, + "learning_rate": 7.197527898754915e-06, + "loss": 0.8803, + "step": 18100 + }, + { + "epoch": 1.2596508311887042, + "grad_norm": 2.220546932059371, + "learning_rate": 7.193892298142974e-06, + "loss": 0.7823, + "step": 18110 + }, + { + "epoch": 1.260346386589692, + "grad_norm": 5.163079198171669, + "learning_rate": 7.190255260483645e-06, + "loss": 0.7749, + "step": 18120 + }, + { + "epoch": 1.2610419419906795, + "grad_norm": 1.96051765363023, + "learning_rate": 7.186616788159265e-06, + "loss": 0.7402, + "step": 18130 + }, + { + "epoch": 1.2617374973916673, + "grad_norm": 2.6994257622081794, + "learning_rate": 7.182976883553113e-06, + "loss": 0.7599, + "step": 18140 + }, + { + "epoch": 1.2624330527926548, + "grad_norm": 2.8188543727656907, + "learning_rate": 7.179335549049408e-06, + "loss": 0.788, + "step": 18150 + }, + { + "epoch": 1.2631286081936426, + "grad_norm": 2.152460789132226, + "learning_rate": 7.175692787033304e-06, + "loss": 0.7177, + "step": 18160 + }, + { + "epoch": 1.2638241635946303, + "grad_norm": 2.380918477415356, + "learning_rate": 7.172048599890886e-06, + "loss": 0.7563, + "step": 18170 + }, + { + "epoch": 1.264519718995618, + "grad_norm": 1.6268958768397501, + "learning_rate": 7.1684029900091775e-06, + "loss": 0.835, + "step": 18180 + }, + { + "epoch": 1.2652152743966056, + "grad_norm": 1.5196345750597593, + "learning_rate": 7.164755959776135e-06, + "loss": 0.7681, + "step": 18190 + }, + { + "epoch": 1.2659108297975934, + "grad_norm": 2.0448226306860406, + "learning_rate": 7.161107511580642e-06, + "loss": 0.8255, + "step": 18200 + }, + { + "epoch": 1.266606385198581, + "grad_norm": 2.756944561565143, + "learning_rate": 7.1574576478125094e-06, + "loss": 0.8059, + "step": 18210 + }, + { + "epoch": 1.2673019405995687, + "grad_norm": 4.138966678138996, + "learning_rate": 7.153806370862482e-06, + "loss": 0.7658, + "step": 18220 + }, + { + "epoch": 1.2679974960005564, + "grad_norm": 2.241417520175397, + "learning_rate": 7.1501536831222205e-06, + "loss": 0.7707, + "step": 18230 + }, + { + "epoch": 1.2686930514015442, + "grad_norm": 1.7294452519088037, + "learning_rate": 7.146499586984319e-06, + "loss": 0.7372, + "step": 18240 + }, + { + "epoch": 1.269388606802532, + "grad_norm": 2.4692056237394158, + "learning_rate": 7.142844084842291e-06, + "loss": 0.7275, + "step": 18250 + }, + { + "epoch": 1.2700841622035195, + "grad_norm": 2.382961823414427, + "learning_rate": 7.1391871790905685e-06, + "loss": 0.8221, + "step": 18260 + }, + { + "epoch": 1.2707797176045073, + "grad_norm": 2.898443758783943, + "learning_rate": 7.135528872124504e-06, + "loss": 0.7723, + "step": 18270 + }, + { + "epoch": 1.2714752730054948, + "grad_norm": 2.0533438376105897, + "learning_rate": 7.13186916634037e-06, + "loss": 0.7722, + "step": 18280 + }, + { + "epoch": 1.2721708284064825, + "grad_norm": 2.2593298908080937, + "learning_rate": 7.128208064135353e-06, + "loss": 0.759, + "step": 18290 + }, + { + "epoch": 1.2728663838074703, + "grad_norm": 2.00418320289675, + "learning_rate": 7.124545567907555e-06, + "loss": 0.7829, + "step": 18300 + }, + { + "epoch": 1.273561939208458, + "grad_norm": 1.4864286912380473, + "learning_rate": 7.120881680055991e-06, + "loss": 0.7625, + "step": 18310 + }, + { + "epoch": 1.2742574946094456, + "grad_norm": 2.442933280930573, + "learning_rate": 7.117216402980588e-06, + "loss": 0.7527, + "step": 18320 + }, + { + "epoch": 1.2749530500104334, + "grad_norm": 3.9591876696628, + "learning_rate": 7.113549739082183e-06, + "loss": 0.7651, + "step": 18330 + }, + { + "epoch": 1.2756486054114209, + "grad_norm": 2.0169684725728083, + "learning_rate": 7.109881690762519e-06, + "loss": 0.8067, + "step": 18340 + }, + { + "epoch": 1.2763441608124086, + "grad_norm": 2.190997463422378, + "learning_rate": 7.10621226042425e-06, + "loss": 0.7858, + "step": 18350 + }, + { + "epoch": 1.2770397162133964, + "grad_norm": 1.5138525929791102, + "learning_rate": 7.102541450470932e-06, + "loss": 0.7165, + "step": 18360 + }, + { + "epoch": 1.2777352716143842, + "grad_norm": 2.0097637540495277, + "learning_rate": 7.0988692633070265e-06, + "loss": 0.7639, + "step": 18370 + }, + { + "epoch": 1.2784308270153717, + "grad_norm": 2.4367948749325015, + "learning_rate": 7.095195701337895e-06, + "loss": 0.7645, + "step": 18380 + }, + { + "epoch": 1.2791263824163595, + "grad_norm": 2.457755467194514, + "learning_rate": 7.091520766969802e-06, + "loss": 0.7448, + "step": 18390 + }, + { + "epoch": 1.2798219378173472, + "grad_norm": 1.9214498553476231, + "learning_rate": 7.0878444626099085e-06, + "loss": 0.7637, + "step": 18400 + }, + { + "epoch": 1.2805174932183347, + "grad_norm": 1.9510015471187026, + "learning_rate": 7.084166790666275e-06, + "loss": 0.7779, + "step": 18410 + }, + { + "epoch": 1.2812130486193225, + "grad_norm": 2.0365093381892954, + "learning_rate": 7.080487753547858e-06, + "loss": 0.7429, + "step": 18420 + }, + { + "epoch": 1.2819086040203103, + "grad_norm": 2.2059469736263204, + "learning_rate": 7.076807353664505e-06, + "loss": 0.7759, + "step": 18430 + }, + { + "epoch": 1.282604159421298, + "grad_norm": 1.9178373393982615, + "learning_rate": 7.073125593426961e-06, + "loss": 0.7793, + "step": 18440 + }, + { + "epoch": 1.2832997148222856, + "grad_norm": 1.8935777304205814, + "learning_rate": 7.069442475246856e-06, + "loss": 0.7957, + "step": 18450 + }, + { + "epoch": 1.2839952702232733, + "grad_norm": 2.4024636984826944, + "learning_rate": 7.065758001536715e-06, + "loss": 0.8036, + "step": 18460 + }, + { + "epoch": 1.2846908256242608, + "grad_norm": 2.5832685551517938, + "learning_rate": 7.062072174709951e-06, + "loss": 0.7646, + "step": 18470 + }, + { + "epoch": 1.2853863810252486, + "grad_norm": 5.6705964871002985, + "learning_rate": 7.058384997180857e-06, + "loss": 0.7825, + "step": 18480 + }, + { + "epoch": 1.2860819364262364, + "grad_norm": 1.789756532757647, + "learning_rate": 7.054696471364617e-06, + "loss": 0.7774, + "step": 18490 + }, + { + "epoch": 1.2867774918272241, + "grad_norm": 5.196696896279188, + "learning_rate": 7.051006599677293e-06, + "loss": 0.7881, + "step": 18500 + }, + { + "epoch": 1.2867774918272241, + "eval_loss": 0.9903595447540283, + "eval_runtime": 1324.1702, + "eval_samples_per_second": 13.715, + "eval_steps_per_second": 2.286, + "step": 18500 + }, + { + "epoch": 1.2874730472282117, + "grad_norm": 1.9820950207986534, + "learning_rate": 7.0473153845358375e-06, + "loss": 0.7692, + "step": 18510 + }, + { + "epoch": 1.2881686026291994, + "grad_norm": 2.230411120218938, + "learning_rate": 7.043622828358073e-06, + "loss": 0.7981, + "step": 18520 + }, + { + "epoch": 1.2888641580301872, + "grad_norm": 2.0842556011306037, + "learning_rate": 7.0399289335627034e-06, + "loss": 0.7764, + "step": 18530 + }, + { + "epoch": 1.2895597134311747, + "grad_norm": 2.8096590504633094, + "learning_rate": 7.036233702569315e-06, + "loss": 0.7948, + "step": 18540 + }, + { + "epoch": 1.2902552688321625, + "grad_norm": 2.172885088066203, + "learning_rate": 7.032537137798361e-06, + "loss": 0.7491, + "step": 18550 + }, + { + "epoch": 1.2909508242331502, + "grad_norm": 2.007078294879024, + "learning_rate": 7.02883924167117e-06, + "loss": 0.7469, + "step": 18560 + }, + { + "epoch": 1.291646379634138, + "grad_norm": 2.080885104287996, + "learning_rate": 7.025140016609951e-06, + "loss": 0.7525, + "step": 18570 + }, + { + "epoch": 1.2923419350351255, + "grad_norm": 2.190830558341148, + "learning_rate": 7.021439465037776e-06, + "loss": 0.799, + "step": 18580 + }, + { + "epoch": 1.2930374904361133, + "grad_norm": 2.0735180281143073, + "learning_rate": 7.017737589378582e-06, + "loss": 0.7244, + "step": 18590 + }, + { + "epoch": 1.2937330458371008, + "grad_norm": 1.9226021067464878, + "learning_rate": 7.014034392057183e-06, + "loss": 0.7506, + "step": 18600 + }, + { + "epoch": 1.2944286012380886, + "grad_norm": 1.9130862153805104, + "learning_rate": 7.010329875499252e-06, + "loss": 0.7644, + "step": 18610 + }, + { + "epoch": 1.2951241566390763, + "grad_norm": 5.146102045068852, + "learning_rate": 7.0066240421313305e-06, + "loss": 0.8114, + "step": 18620 + }, + { + "epoch": 1.295819712040064, + "grad_norm": 2.4471161317012795, + "learning_rate": 7.0029168943808175e-06, + "loss": 0.8144, + "step": 18630 + }, + { + "epoch": 1.2965152674410516, + "grad_norm": 7.572435671597748, + "learning_rate": 6.9992084346759794e-06, + "loss": 0.7305, + "step": 18640 + }, + { + "epoch": 1.2972108228420394, + "grad_norm": 1.8008684052912916, + "learning_rate": 6.995498665445935e-06, + "loss": 0.727, + "step": 18650 + }, + { + "epoch": 1.2979063782430271, + "grad_norm": 2.3827323778065588, + "learning_rate": 6.991787589120664e-06, + "loss": 0.7633, + "step": 18660 + }, + { + "epoch": 1.2986019336440147, + "grad_norm": 4.368892505164442, + "learning_rate": 6.988075208131006e-06, + "loss": 0.7611, + "step": 18670 + }, + { + "epoch": 1.2992974890450024, + "grad_norm": 2.412447567342999, + "learning_rate": 6.98436152490865e-06, + "loss": 0.7902, + "step": 18680 + }, + { + "epoch": 1.2999930444459902, + "grad_norm": 2.770863894712905, + "learning_rate": 6.980646541886138e-06, + "loss": 0.7792, + "step": 18690 + }, + { + "epoch": 1.300688599846978, + "grad_norm": 2.195884489103992, + "learning_rate": 6.976930261496866e-06, + "loss": 0.7649, + "step": 18700 + }, + { + "epoch": 1.3013841552479655, + "grad_norm": 2.949625698300715, + "learning_rate": 6.973212686175079e-06, + "loss": 0.7728, + "step": 18710 + }, + { + "epoch": 1.3020797106489532, + "grad_norm": 2.815629176662259, + "learning_rate": 6.96949381835587e-06, + "loss": 0.7574, + "step": 18720 + }, + { + "epoch": 1.3027752660499408, + "grad_norm": 2.481305635692924, + "learning_rate": 6.9657736604751804e-06, + "loss": 0.777, + "step": 18730 + }, + { + "epoch": 1.3034708214509285, + "grad_norm": 1.9429635587319118, + "learning_rate": 6.962052214969792e-06, + "loss": 0.7198, + "step": 18740 + }, + { + "epoch": 1.3041663768519163, + "grad_norm": 2.9019737592344015, + "learning_rate": 6.958329484277333e-06, + "loss": 0.778, + "step": 18750 + }, + { + "epoch": 1.304861932252904, + "grad_norm": 2.0873994707995287, + "learning_rate": 6.954605470836277e-06, + "loss": 0.766, + "step": 18760 + }, + { + "epoch": 1.3055574876538916, + "grad_norm": 1.9382778240689618, + "learning_rate": 6.950880177085932e-06, + "loss": 0.7684, + "step": 18770 + }, + { + "epoch": 1.3062530430548793, + "grad_norm": 2.481228568623526, + "learning_rate": 6.947153605466445e-06, + "loss": 0.802, + "step": 18780 + }, + { + "epoch": 1.306948598455867, + "grad_norm": 1.9645889877577871, + "learning_rate": 6.9434257584188035e-06, + "loss": 0.7803, + "step": 18790 + }, + { + "epoch": 1.3076441538568546, + "grad_norm": 8.701876505334496, + "learning_rate": 6.93969663838483e-06, + "loss": 0.8219, + "step": 18800 + }, + { + "epoch": 1.3083397092578424, + "grad_norm": 4.072371648547787, + "learning_rate": 6.935966247807177e-06, + "loss": 0.7738, + "step": 18810 + }, + { + "epoch": 1.3090352646588301, + "grad_norm": 3.2109473760496567, + "learning_rate": 6.932234589129332e-06, + "loss": 0.7665, + "step": 18820 + }, + { + "epoch": 1.309730820059818, + "grad_norm": 2.226340338267774, + "learning_rate": 6.928501664795616e-06, + "loss": 0.7199, + "step": 18830 + }, + { + "epoch": 1.3104263754608054, + "grad_norm": 2.689695857747803, + "learning_rate": 6.924767477251173e-06, + "loss": 0.7851, + "step": 18840 + }, + { + "epoch": 1.3111219308617932, + "grad_norm": 2.7456400541461994, + "learning_rate": 6.921032028941979e-06, + "loss": 0.7237, + "step": 18850 + }, + { + "epoch": 1.3118174862627807, + "grad_norm": 1.7140470719770466, + "learning_rate": 6.917295322314834e-06, + "loss": 0.7497, + "step": 18860 + }, + { + "epoch": 1.3125130416637685, + "grad_norm": 2.1299512781648815, + "learning_rate": 6.913557359817362e-06, + "loss": 0.8161, + "step": 18870 + }, + { + "epoch": 1.3132085970647562, + "grad_norm": 1.9877267892913975, + "learning_rate": 6.909818143898008e-06, + "loss": 0.7582, + "step": 18880 + }, + { + "epoch": 1.313904152465744, + "grad_norm": 3.5601887806339, + "learning_rate": 6.906077677006045e-06, + "loss": 0.7652, + "step": 18890 + }, + { + "epoch": 1.3145997078667315, + "grad_norm": 2.2114148278326367, + "learning_rate": 6.902335961591559e-06, + "loss": 0.7569, + "step": 18900 + }, + { + "epoch": 1.3152952632677193, + "grad_norm": 2.0190329820780497, + "learning_rate": 6.898593000105452e-06, + "loss": 0.783, + "step": 18910 + }, + { + "epoch": 1.315990818668707, + "grad_norm": 2.1711400247645876, + "learning_rate": 6.894848794999449e-06, + "loss": 0.7566, + "step": 18920 + }, + { + "epoch": 1.3166863740696946, + "grad_norm": 3.494357534796239, + "learning_rate": 6.891103348726085e-06, + "loss": 0.8202, + "step": 18930 + }, + { + "epoch": 1.3173819294706823, + "grad_norm": 3.0609629707633923, + "learning_rate": 6.887356663738709e-06, + "loss": 0.7128, + "step": 18940 + }, + { + "epoch": 1.31807748487167, + "grad_norm": 2.343925665556496, + "learning_rate": 6.883608742491481e-06, + "loss": 0.7961, + "step": 18950 + }, + { + "epoch": 1.3187730402726578, + "grad_norm": 3.19151370052274, + "learning_rate": 6.879859587439373e-06, + "loss": 0.7355, + "step": 18960 + }, + { + "epoch": 1.3194685956736454, + "grad_norm": 1.767592514590558, + "learning_rate": 6.876109201038161e-06, + "loss": 0.7585, + "step": 18970 + }, + { + "epoch": 1.3201641510746331, + "grad_norm": 1.8721596763149115, + "learning_rate": 6.872357585744434e-06, + "loss": 0.7476, + "step": 18980 + }, + { + "epoch": 1.3208597064756207, + "grad_norm": 1.43658660379587, + "learning_rate": 6.868604744015578e-06, + "loss": 0.7689, + "step": 18990 + }, + { + "epoch": 1.3215552618766084, + "grad_norm": 2.0598765226617153, + "learning_rate": 6.864850678309788e-06, + "loss": 0.7767, + "step": 19000 + }, + { + "epoch": 1.3215552618766084, + "eval_loss": 0.9881010055541992, + "eval_runtime": 1321.4583, + "eval_samples_per_second": 13.743, + "eval_steps_per_second": 2.291, + "step": 19000 + }, + { + "epoch": 1.3222508172775962, + "grad_norm": 1.9653298595958368, + "learning_rate": 6.861095391086059e-06, + "loss": 0.7536, + "step": 19010 + }, + { + "epoch": 1.322946372678584, + "grad_norm": 2.7888571624853347, + "learning_rate": 6.857338884804185e-06, + "loss": 0.7534, + "step": 19020 + }, + { + "epoch": 1.3236419280795715, + "grad_norm": 2.6427019891265657, + "learning_rate": 6.853581161924763e-06, + "loss": 0.7414, + "step": 19030 + }, + { + "epoch": 1.3243374834805592, + "grad_norm": 2.1206819785143765, + "learning_rate": 6.849822224909179e-06, + "loss": 0.7919, + "step": 19040 + }, + { + "epoch": 1.325033038881547, + "grad_norm": 2.51428988287449, + "learning_rate": 6.8460620762196226e-06, + "loss": 0.799, + "step": 19050 + }, + { + "epoch": 1.3257285942825345, + "grad_norm": 4.402617120557353, + "learning_rate": 6.842300718319072e-06, + "loss": 0.8367, + "step": 19060 + }, + { + "epoch": 1.3264241496835223, + "grad_norm": 2.4312920405850154, + "learning_rate": 6.838538153671298e-06, + "loss": 0.8434, + "step": 19070 + }, + { + "epoch": 1.32711970508451, + "grad_norm": 1.6911368364197648, + "learning_rate": 6.834774384740865e-06, + "loss": 0.7296, + "step": 19080 + }, + { + "epoch": 1.3278152604854978, + "grad_norm": 1.7755637239961988, + "learning_rate": 6.83100941399312e-06, + "loss": 0.788, + "step": 19090 + }, + { + "epoch": 1.3285108158864853, + "grad_norm": 1.8057475048476272, + "learning_rate": 6.827243243894205e-06, + "loss": 0.7416, + "step": 19100 + }, + { + "epoch": 1.329206371287473, + "grad_norm": 1.7407588772410987, + "learning_rate": 6.823475876911042e-06, + "loss": 0.7733, + "step": 19110 + }, + { + "epoch": 1.3299019266884606, + "grad_norm": 1.6298822219726592, + "learning_rate": 6.819707315511338e-06, + "loss": 0.727, + "step": 19120 + }, + { + "epoch": 1.3305974820894484, + "grad_norm": 3.0071778337487403, + "learning_rate": 6.815937562163585e-06, + "loss": 0.7868, + "step": 19130 + }, + { + "epoch": 1.3312930374904361, + "grad_norm": 1.7843636166521493, + "learning_rate": 6.81216661933705e-06, + "loss": 0.7953, + "step": 19140 + }, + { + "epoch": 1.331988592891424, + "grad_norm": 5.1691658709460775, + "learning_rate": 6.80839448950179e-06, + "loss": 0.8082, + "step": 19150 + }, + { + "epoch": 1.3326841482924114, + "grad_norm": 2.141369064323762, + "learning_rate": 6.804621175128625e-06, + "loss": 0.7772, + "step": 19160 + }, + { + "epoch": 1.3333797036933992, + "grad_norm": 2.539362948713666, + "learning_rate": 6.8008466786891616e-06, + "loss": 0.772, + "step": 19170 + }, + { + "epoch": 1.3340752590943867, + "grad_norm": 2.241331413341468, + "learning_rate": 6.797071002655778e-06, + "loss": 0.79, + "step": 19180 + }, + { + "epoch": 1.3347708144953745, + "grad_norm": 1.8564671187034636, + "learning_rate": 6.793294149501624e-06, + "loss": 0.7099, + "step": 19190 + }, + { + "epoch": 1.3354663698963622, + "grad_norm": 3.7374629453553867, + "learning_rate": 6.7895161217006185e-06, + "loss": 0.7651, + "step": 19200 + }, + { + "epoch": 1.33616192529735, + "grad_norm": 1.6136751074989548, + "learning_rate": 6.785736921727457e-06, + "loss": 0.7629, + "step": 19210 + }, + { + "epoch": 1.3368574806983375, + "grad_norm": 1.9586906764107266, + "learning_rate": 6.781956552057596e-06, + "loss": 0.8135, + "step": 19220 + }, + { + "epoch": 1.3375530360993253, + "grad_norm": 1.813087342079586, + "learning_rate": 6.778175015167256e-06, + "loss": 0.7768, + "step": 19230 + }, + { + "epoch": 1.338248591500313, + "grad_norm": 2.0788282266791094, + "learning_rate": 6.774392313533434e-06, + "loss": 0.8611, + "step": 19240 + }, + { + "epoch": 1.3389441469013006, + "grad_norm": 2.032776904116073, + "learning_rate": 6.770608449633877e-06, + "loss": 0.7903, + "step": 19250 + }, + { + "epoch": 1.3396397023022883, + "grad_norm": 1.5057749718298135, + "learning_rate": 6.766823425947098e-06, + "loss": 0.7524, + "step": 19260 + }, + { + "epoch": 1.340335257703276, + "grad_norm": 5.165237103196143, + "learning_rate": 6.763037244952373e-06, + "loss": 0.7746, + "step": 19270 + }, + { + "epoch": 1.3410308131042639, + "grad_norm": 3.9774042825934046, + "learning_rate": 6.7592499091297325e-06, + "loss": 0.7295, + "step": 19280 + }, + { + "epoch": 1.3417263685052514, + "grad_norm": 1.861241788668798, + "learning_rate": 6.755461420959965e-06, + "loss": 0.7263, + "step": 19290 + }, + { + "epoch": 1.3424219239062392, + "grad_norm": 3.688801367429668, + "learning_rate": 6.751671782924611e-06, + "loss": 0.8596, + "step": 19300 + }, + { + "epoch": 1.3431174793072267, + "grad_norm": 1.7443845749695852, + "learning_rate": 6.7478809975059665e-06, + "loss": 0.7213, + "step": 19310 + }, + { + "epoch": 1.3438130347082144, + "grad_norm": 3.218817685070631, + "learning_rate": 6.7440890671870806e-06, + "loss": 0.8186, + "step": 19320 + }, + { + "epoch": 1.3445085901092022, + "grad_norm": 2.444097506146499, + "learning_rate": 6.740295994451749e-06, + "loss": 0.8417, + "step": 19330 + }, + { + "epoch": 1.34520414551019, + "grad_norm": 2.9548899378963767, + "learning_rate": 6.736501781784518e-06, + "loss": 0.7413, + "step": 19340 + }, + { + "epoch": 1.3458997009111775, + "grad_norm": 2.167486159479147, + "learning_rate": 6.732706431670679e-06, + "loss": 0.7245, + "step": 19350 + }, + { + "epoch": 1.3465952563121653, + "grad_norm": 2.296035624059687, + "learning_rate": 6.728909946596269e-06, + "loss": 0.7104, + "step": 19360 + }, + { + "epoch": 1.347290811713153, + "grad_norm": 1.7541643646976364, + "learning_rate": 6.72511232904807e-06, + "loss": 0.7649, + "step": 19370 + }, + { + "epoch": 1.3479863671141405, + "grad_norm": 2.8477202718552346, + "learning_rate": 6.721313581513605e-06, + "loss": 0.7612, + "step": 19380 + }, + { + "epoch": 1.3486819225151283, + "grad_norm": 1.6367098671821907, + "learning_rate": 6.7175137064811325e-06, + "loss": 0.7667, + "step": 19390 + }, + { + "epoch": 1.349377477916116, + "grad_norm": 2.1819927818883653, + "learning_rate": 6.713712706439659e-06, + "loss": 0.8211, + "step": 19400 + }, + { + "epoch": 1.3500730333171038, + "grad_norm": 1.6270244621277976, + "learning_rate": 6.709910583878919e-06, + "loss": 0.762, + "step": 19410 + }, + { + "epoch": 1.3507685887180914, + "grad_norm": 1.7301796586675615, + "learning_rate": 6.7061073412893874e-06, + "loss": 0.7654, + "step": 19420 + }, + { + "epoch": 1.3514641441190791, + "grad_norm": 1.920678138025983, + "learning_rate": 6.702302981162271e-06, + "loss": 0.7359, + "step": 19430 + }, + { + "epoch": 1.3521596995200666, + "grad_norm": 2.247032198761093, + "learning_rate": 6.698497505989507e-06, + "loss": 0.7631, + "step": 19440 + }, + { + "epoch": 1.3528552549210544, + "grad_norm": 1.4370114330523476, + "learning_rate": 6.6946909182637685e-06, + "loss": 0.7463, + "step": 19450 + }, + { + "epoch": 1.3535508103220422, + "grad_norm": 1.8597581322769527, + "learning_rate": 6.69088322047845e-06, + "loss": 0.7821, + "step": 19460 + }, + { + "epoch": 1.35424636572303, + "grad_norm": 4.539003984905689, + "learning_rate": 6.68707441512768e-06, + "loss": 0.7909, + "step": 19470 + }, + { + "epoch": 1.3549419211240175, + "grad_norm": 1.9000199033348237, + "learning_rate": 6.683264504706306e-06, + "loss": 0.7034, + "step": 19480 + }, + { + "epoch": 1.3556374765250052, + "grad_norm": 1.9678920641323652, + "learning_rate": 6.679453491709904e-06, + "loss": 0.7952, + "step": 19490 + }, + { + "epoch": 1.356333031925993, + "grad_norm": 2.880719300731536, + "learning_rate": 6.675641378634772e-06, + "loss": 0.8311, + "step": 19500 + }, + { + "epoch": 1.356333031925993, + "eval_loss": 0.9837368726730347, + "eval_runtime": 1323.6639, + "eval_samples_per_second": 13.72, + "eval_steps_per_second": 2.287, + "step": 19500 + }, + { + "epoch": 1.3570285873269805, + "grad_norm": 1.79576417593382, + "learning_rate": 6.671828167977925e-06, + "loss": 0.7995, + "step": 19510 + }, + { + "epoch": 1.3577241427279683, + "grad_norm": 2.244440011714235, + "learning_rate": 6.668013862237101e-06, + "loss": 0.7854, + "step": 19520 + }, + { + "epoch": 1.358419698128956, + "grad_norm": 2.007222062975729, + "learning_rate": 6.664198463910752e-06, + "loss": 0.7494, + "step": 19530 + }, + { + "epoch": 1.3591152535299438, + "grad_norm": 1.6587027042695774, + "learning_rate": 6.660381975498051e-06, + "loss": 0.747, + "step": 19540 + }, + { + "epoch": 1.3598108089309313, + "grad_norm": 1.8101703077184073, + "learning_rate": 6.656564399498876e-06, + "loss": 0.7485, + "step": 19550 + }, + { + "epoch": 1.360506364331919, + "grad_norm": 5.280598436282064, + "learning_rate": 6.652745738413826e-06, + "loss": 0.7921, + "step": 19560 + }, + { + "epoch": 1.3612019197329066, + "grad_norm": 2.9073104912073893, + "learning_rate": 6.648925994744208e-06, + "loss": 0.8113, + "step": 19570 + }, + { + "epoch": 1.3618974751338944, + "grad_norm": 2.5120560595116737, + "learning_rate": 6.645105170992035e-06, + "loss": 0.7559, + "step": 19580 + }, + { + "epoch": 1.3625930305348821, + "grad_norm": 4.9594114814303705, + "learning_rate": 6.641283269660031e-06, + "loss": 0.7221, + "step": 19590 + }, + { + "epoch": 1.3632885859358699, + "grad_norm": 2.2576605440027544, + "learning_rate": 6.6374602932516275e-06, + "loss": 0.7676, + "step": 19600 + }, + { + "epoch": 1.3639841413368574, + "grad_norm": 2.016446075888395, + "learning_rate": 6.633636244270953e-06, + "loss": 0.815, + "step": 19610 + }, + { + "epoch": 1.3646796967378452, + "grad_norm": 1.8634262322421398, + "learning_rate": 6.629811125222847e-06, + "loss": 0.8141, + "step": 19620 + }, + { + "epoch": 1.365375252138833, + "grad_norm": 2.3116481535099584, + "learning_rate": 6.6259849386128435e-06, + "loss": 0.818, + "step": 19630 + }, + { + "epoch": 1.3660708075398205, + "grad_norm": 1.9578497592795572, + "learning_rate": 6.62215768694718e-06, + "loss": 0.7314, + "step": 19640 + }, + { + "epoch": 1.3667663629408082, + "grad_norm": 1.997398194577256, + "learning_rate": 6.618329372732788e-06, + "loss": 0.7379, + "step": 19650 + }, + { + "epoch": 1.367461918341796, + "grad_norm": 1.7032141211683876, + "learning_rate": 6.6144999984773e-06, + "loss": 0.7976, + "step": 19660 + }, + { + "epoch": 1.3681574737427837, + "grad_norm": 3.1016633121660253, + "learning_rate": 6.610669566689038e-06, + "loss": 0.7596, + "step": 19670 + }, + { + "epoch": 1.3688530291437713, + "grad_norm": 1.6551182511938953, + "learning_rate": 6.606838079877017e-06, + "loss": 0.7485, + "step": 19680 + }, + { + "epoch": 1.369548584544759, + "grad_norm": 2.4041400475217403, + "learning_rate": 6.603005540550946e-06, + "loss": 0.795, + "step": 19690 + }, + { + "epoch": 1.3702441399457466, + "grad_norm": 2.067071189741745, + "learning_rate": 6.599171951221224e-06, + "loss": 0.7412, + "step": 19700 + }, + { + "epoch": 1.3709396953467343, + "grad_norm": 2.0912437247249094, + "learning_rate": 6.595337314398933e-06, + "loss": 0.7344, + "step": 19710 + }, + { + "epoch": 1.371635250747722, + "grad_norm": 1.8337442642753947, + "learning_rate": 6.591501632595845e-06, + "loss": 0.7903, + "step": 19720 + }, + { + "epoch": 1.3723308061487098, + "grad_norm": 2.0161195319484846, + "learning_rate": 6.587664908324415e-06, + "loss": 0.7397, + "step": 19730 + }, + { + "epoch": 1.3730263615496974, + "grad_norm": 2.0317507817392157, + "learning_rate": 6.58382714409778e-06, + "loss": 0.7301, + "step": 19740 + }, + { + "epoch": 1.3737219169506851, + "grad_norm": 1.829998401245201, + "learning_rate": 6.579988342429764e-06, + "loss": 0.796, + "step": 19750 + }, + { + "epoch": 1.3744174723516729, + "grad_norm": 1.8490663801192384, + "learning_rate": 6.576148505834861e-06, + "loss": 0.7951, + "step": 19760 + }, + { + "epoch": 1.3751130277526604, + "grad_norm": 1.8879897240648675, + "learning_rate": 6.572307636828249e-06, + "loss": 0.7063, + "step": 19770 + }, + { + "epoch": 1.3758085831536482, + "grad_norm": 2.289528279788842, + "learning_rate": 6.568465737925782e-06, + "loss": 0.742, + "step": 19780 + }, + { + "epoch": 1.376504138554636, + "grad_norm": 3.1438287812708703, + "learning_rate": 6.5646228116439895e-06, + "loss": 0.7889, + "step": 19790 + }, + { + "epoch": 1.3771996939556237, + "grad_norm": 2.201873206326993, + "learning_rate": 6.560778860500068e-06, + "loss": 0.7933, + "step": 19800 + }, + { + "epoch": 1.3778952493566112, + "grad_norm": 2.5065388701290967, + "learning_rate": 6.556933887011891e-06, + "loss": 0.7441, + "step": 19810 + }, + { + "epoch": 1.378590804757599, + "grad_norm": 1.996633271129021, + "learning_rate": 6.5530878936980034e-06, + "loss": 0.825, + "step": 19820 + }, + { + "epoch": 1.3792863601585865, + "grad_norm": 1.682388787557419, + "learning_rate": 6.549240883077611e-06, + "loss": 0.7222, + "step": 19830 + }, + { + "epoch": 1.3799819155595743, + "grad_norm": 2.246127182823078, + "learning_rate": 6.545392857670591e-06, + "loss": 0.8051, + "step": 19840 + }, + { + "epoch": 1.380677470960562, + "grad_norm": 2.002558459536705, + "learning_rate": 6.541543819997484e-06, + "loss": 0.7495, + "step": 19850 + }, + { + "epoch": 1.3813730263615498, + "grad_norm": 1.6505359745592234, + "learning_rate": 6.537693772579495e-06, + "loss": 0.7806, + "step": 19860 + }, + { + "epoch": 1.3820685817625373, + "grad_norm": 7.340308813171307, + "learning_rate": 6.533842717938487e-06, + "loss": 0.731, + "step": 19870 + }, + { + "epoch": 1.382764137163525, + "grad_norm": 2.0118757271109335, + "learning_rate": 6.529990658596986e-06, + "loss": 0.7921, + "step": 19880 + }, + { + "epoch": 1.3834596925645128, + "grad_norm": 2.3808686695731227, + "learning_rate": 6.526137597078177e-06, + "loss": 0.7823, + "step": 19890 + }, + { + "epoch": 1.3841552479655004, + "grad_norm": 2.116376511862388, + "learning_rate": 6.522283535905895e-06, + "loss": 0.7137, + "step": 19900 + }, + { + "epoch": 1.3848508033664881, + "grad_norm": 1.73610581540509, + "learning_rate": 6.518428477604638e-06, + "loss": 0.7625, + "step": 19910 + }, + { + "epoch": 1.385546358767476, + "grad_norm": 3.43083561903016, + "learning_rate": 6.514572424699552e-06, + "loss": 0.8112, + "step": 19920 + }, + { + "epoch": 1.3862419141684637, + "grad_norm": 2.0860792998591267, + "learning_rate": 6.510715379716438e-06, + "loss": 0.7487, + "step": 19930 + }, + { + "epoch": 1.3869374695694512, + "grad_norm": 2.1434790570175903, + "learning_rate": 6.50685734518174e-06, + "loss": 0.7599, + "step": 19940 + }, + { + "epoch": 1.387633024970439, + "grad_norm": 2.602662996428484, + "learning_rate": 6.50299832362256e-06, + "loss": 0.7161, + "step": 19950 + }, + { + "epoch": 1.3883285803714265, + "grad_norm": 4.222737536286101, + "learning_rate": 6.499138317566639e-06, + "loss": 0.7757, + "step": 19960 + }, + { + "epoch": 1.3890241357724142, + "grad_norm": 1.940507037442219, + "learning_rate": 6.495277329542364e-06, + "loss": 0.7812, + "step": 19970 + }, + { + "epoch": 1.389719691173402, + "grad_norm": 1.9047937389973002, + "learning_rate": 6.4914153620787705e-06, + "loss": 0.8077, + "step": 19980 + }, + { + "epoch": 1.3904152465743898, + "grad_norm": 1.678111449628478, + "learning_rate": 6.487552417705527e-06, + "loss": 0.7632, + "step": 19990 + }, + { + "epoch": 1.3911108019753773, + "grad_norm": 2.3368307182498045, + "learning_rate": 6.483688498952949e-06, + "loss": 0.7422, + "step": 20000 + }, + { + "epoch": 1.3911108019753773, + "eval_loss": 0.9796319603919983, + "eval_runtime": 1321.2873, + "eval_samples_per_second": 13.745, + "eval_steps_per_second": 2.291, + "step": 20000 + }, + { + "epoch": 1.391806357376365, + "grad_norm": 2.2447133118687366, + "learning_rate": 6.479823608351988e-06, + "loss": 0.795, + "step": 20010 + }, + { + "epoch": 1.3925019127773526, + "grad_norm": 5.779364312523271, + "learning_rate": 6.4759577484342306e-06, + "loss": 0.7667, + "step": 20020 + }, + { + "epoch": 1.3931974681783403, + "grad_norm": 2.502797735495768, + "learning_rate": 6.472090921731901e-06, + "loss": 0.783, + "step": 20030 + }, + { + "epoch": 1.393893023579328, + "grad_norm": 2.3392250470639055, + "learning_rate": 6.468223130777853e-06, + "loss": 0.801, + "step": 20040 + }, + { + "epoch": 1.3945885789803159, + "grad_norm": 3.3588699080697024, + "learning_rate": 6.464354378105575e-06, + "loss": 0.7851, + "step": 20050 + }, + { + "epoch": 1.3952841343813036, + "grad_norm": 1.8406239723131146, + "learning_rate": 6.460484666249187e-06, + "loss": 0.7913, + "step": 20060 + }, + { + "epoch": 1.3959796897822911, + "grad_norm": 2.434120024138934, + "learning_rate": 6.456613997743431e-06, + "loss": 0.7888, + "step": 20070 + }, + { + "epoch": 1.396675245183279, + "grad_norm": 2.091790686344413, + "learning_rate": 6.452742375123681e-06, + "loss": 0.7486, + "step": 20080 + }, + { + "epoch": 1.3973708005842664, + "grad_norm": 2.193875911044577, + "learning_rate": 6.448869800925936e-06, + "loss": 0.7997, + "step": 20090 + }, + { + "epoch": 1.3980663559852542, + "grad_norm": 1.9080334364482494, + "learning_rate": 6.444996277686813e-06, + "loss": 0.7345, + "step": 20100 + }, + { + "epoch": 1.398761911386242, + "grad_norm": 2.4273525741669535, + "learning_rate": 6.44112180794356e-06, + "loss": 0.7547, + "step": 20110 + }, + { + "epoch": 1.3994574667872297, + "grad_norm": 2.508106040711249, + "learning_rate": 6.437246394234034e-06, + "loss": 0.8079, + "step": 20120 + }, + { + "epoch": 1.4001530221882172, + "grad_norm": 3.7469942584460885, + "learning_rate": 6.4333700390967154e-06, + "loss": 0.7495, + "step": 20130 + }, + { + "epoch": 1.400848577589205, + "grad_norm": 2.4719419132222322, + "learning_rate": 6.429492745070708e-06, + "loss": 0.8078, + "step": 20140 + }, + { + "epoch": 1.4015441329901925, + "grad_norm": 2.1534228130175594, + "learning_rate": 6.425614514695717e-06, + "loss": 0.7642, + "step": 20150 + }, + { + "epoch": 1.4022396883911803, + "grad_norm": 2.0932094229179534, + "learning_rate": 6.421735350512071e-06, + "loss": 0.7377, + "step": 20160 + }, + { + "epoch": 1.402935243792168, + "grad_norm": 2.569430045375915, + "learning_rate": 6.417855255060708e-06, + "loss": 0.7357, + "step": 20170 + }, + { + "epoch": 1.4036307991931558, + "grad_norm": 2.2624878761454825, + "learning_rate": 6.413974230883176e-06, + "loss": 0.8093, + "step": 20180 + }, + { + "epoch": 1.4043263545941433, + "grad_norm": 2.3971622882225403, + "learning_rate": 6.4100922805216255e-06, + "loss": 0.7662, + "step": 20190 + }, + { + "epoch": 1.405021909995131, + "grad_norm": 2.534581436224185, + "learning_rate": 6.406209406518824e-06, + "loss": 0.7905, + "step": 20200 + }, + { + "epoch": 1.4057174653961189, + "grad_norm": 1.8958175244226922, + "learning_rate": 6.402325611418139e-06, + "loss": 0.8137, + "step": 20210 + }, + { + "epoch": 1.4064130207971064, + "grad_norm": 2.15722536378018, + "learning_rate": 6.398440897763536e-06, + "loss": 0.7629, + "step": 20220 + }, + { + "epoch": 1.4071085761980942, + "grad_norm": 2.0978553913090465, + "learning_rate": 6.394555268099593e-06, + "loss": 0.7198, + "step": 20230 + }, + { + "epoch": 1.407804131599082, + "grad_norm": 2.1388085281223916, + "learning_rate": 6.390668724971479e-06, + "loss": 0.7562, + "step": 20240 + }, + { + "epoch": 1.4084996870000697, + "grad_norm": 1.999477320073482, + "learning_rate": 6.386781270924968e-06, + "loss": 0.7172, + "step": 20250 + }, + { + "epoch": 1.4091952424010572, + "grad_norm": 2.0502344416619342, + "learning_rate": 6.382892908506422e-06, + "loss": 0.7342, + "step": 20260 + }, + { + "epoch": 1.409890797802045, + "grad_norm": 2.0642477893234035, + "learning_rate": 6.379003640262806e-06, + "loss": 0.7715, + "step": 20270 + }, + { + "epoch": 1.4105863532030325, + "grad_norm": 4.7612914746354065, + "learning_rate": 6.375113468741679e-06, + "loss": 0.7798, + "step": 20280 + }, + { + "epoch": 1.4112819086040203, + "grad_norm": 2.9658090732812448, + "learning_rate": 6.371222396491184e-06, + "loss": 0.8309, + "step": 20290 + }, + { + "epoch": 1.411977464005008, + "grad_norm": 2.1702824937068224, + "learning_rate": 6.367330426060059e-06, + "loss": 0.7711, + "step": 20300 + }, + { + "epoch": 1.4126730194059958, + "grad_norm": 2.9310925938193897, + "learning_rate": 6.363437559997631e-06, + "loss": 0.7947, + "step": 20310 + }, + { + "epoch": 1.4133685748069833, + "grad_norm": 2.4981765775313822, + "learning_rate": 6.359543800853811e-06, + "loss": 0.7894, + "step": 20320 + }, + { + "epoch": 1.414064130207971, + "grad_norm": 2.8328074591784245, + "learning_rate": 6.355649151179097e-06, + "loss": 0.7482, + "step": 20330 + }, + { + "epoch": 1.4147596856089588, + "grad_norm": 3.215985644385856, + "learning_rate": 6.35175361352457e-06, + "loss": 0.8108, + "step": 20340 + }, + { + "epoch": 1.4154552410099464, + "grad_norm": 3.1526227298439387, + "learning_rate": 6.347857190441893e-06, + "loss": 0.8006, + "step": 20350 + }, + { + "epoch": 1.416150796410934, + "grad_norm": 2.269902455230645, + "learning_rate": 6.343959884483305e-06, + "loss": 0.7989, + "step": 20360 + }, + { + "epoch": 1.4168463518119219, + "grad_norm": 2.236563917138059, + "learning_rate": 6.3400616982016305e-06, + "loss": 0.8261, + "step": 20370 + }, + { + "epoch": 1.4175419072129096, + "grad_norm": 1.7496781695085204, + "learning_rate": 6.336162634150264e-06, + "loss": 0.7373, + "step": 20380 + }, + { + "epoch": 1.4182374626138972, + "grad_norm": 1.8778405832743459, + "learning_rate": 6.332262694883179e-06, + "loss": 0.7752, + "step": 20390 + }, + { + "epoch": 1.418933018014885, + "grad_norm": 1.8609713577071798, + "learning_rate": 6.32836188295492e-06, + "loss": 0.7869, + "step": 20400 + }, + { + "epoch": 1.4196285734158725, + "grad_norm": 2.0589816965084844, + "learning_rate": 6.324460200920604e-06, + "loss": 0.7398, + "step": 20410 + }, + { + "epoch": 1.4203241288168602, + "grad_norm": 1.7746827401445178, + "learning_rate": 6.320557651335919e-06, + "loss": 0.7205, + "step": 20420 + }, + { + "epoch": 1.421019684217848, + "grad_norm": 1.953497992518339, + "learning_rate": 6.31665423675712e-06, + "loss": 0.7886, + "step": 20430 + }, + { + "epoch": 1.4217152396188357, + "grad_norm": 1.8773894539677192, + "learning_rate": 6.3127499597410295e-06, + "loss": 0.7679, + "step": 20440 + }, + { + "epoch": 1.4224107950198233, + "grad_norm": 2.1743770630279795, + "learning_rate": 6.30884482284503e-06, + "loss": 0.7841, + "step": 20450 + }, + { + "epoch": 1.423106350420811, + "grad_norm": 2.9190612328677696, + "learning_rate": 6.304938828627078e-06, + "loss": 0.8047, + "step": 20460 + }, + { + "epoch": 1.4238019058217988, + "grad_norm": 2.51533446944192, + "learning_rate": 6.301031979645682e-06, + "loss": 0.7184, + "step": 20470 + }, + { + "epoch": 1.4244974612227863, + "grad_norm": 4.269111397782711, + "learning_rate": 6.297124278459912e-06, + "loss": 0.7006, + "step": 20480 + }, + { + "epoch": 1.425193016623774, + "grad_norm": 2.154430146882827, + "learning_rate": 6.293215727629398e-06, + "loss": 0.7195, + "step": 20490 + }, + { + "epoch": 1.4258885720247618, + "grad_norm": 2.339669291778915, + "learning_rate": 6.28930632971433e-06, + "loss": 0.721, + "step": 20500 + }, + { + "epoch": 1.4258885720247618, + "eval_loss": 0.9766389727592468, + "eval_runtime": 1323.2135, + "eval_samples_per_second": 13.725, + "eval_steps_per_second": 2.288, + "step": 20500 + }, + { + "epoch": 1.4265841274257496, + "grad_norm": 2.409179403240494, + "learning_rate": 6.285396087275444e-06, + "loss": 0.7288, + "step": 20510 + }, + { + "epoch": 1.4272796828267371, + "grad_norm": 3.152357959633311, + "learning_rate": 6.281485002874036e-06, + "loss": 0.6896, + "step": 20520 + }, + { + "epoch": 1.4279752382277249, + "grad_norm": 2.106567275888273, + "learning_rate": 6.277573079071955e-06, + "loss": 0.8097, + "step": 20530 + }, + { + "epoch": 1.4286707936287124, + "grad_norm": 2.4845194814579803, + "learning_rate": 6.273660318431591e-06, + "loss": 0.8124, + "step": 20540 + }, + { + "epoch": 1.4293663490297002, + "grad_norm": 1.776348560352569, + "learning_rate": 6.26974672351589e-06, + "loss": 0.7289, + "step": 20550 + }, + { + "epoch": 1.430061904430688, + "grad_norm": 2.168733996076424, + "learning_rate": 6.265832296888344e-06, + "loss": 0.7385, + "step": 20560 + }, + { + "epoch": 1.4307574598316757, + "grad_norm": 1.872762573718865, + "learning_rate": 6.261917041112988e-06, + "loss": 0.7646, + "step": 20570 + }, + { + "epoch": 1.4314530152326632, + "grad_norm": 2.1971533360902344, + "learning_rate": 6.258000958754396e-06, + "loss": 0.762, + "step": 20580 + }, + { + "epoch": 1.432148570633651, + "grad_norm": 2.051077927839803, + "learning_rate": 6.254084052377691e-06, + "loss": 0.7922, + "step": 20590 + }, + { + "epoch": 1.4328441260346387, + "grad_norm": 5.439772515389704, + "learning_rate": 6.250166324548534e-06, + "loss": 0.8217, + "step": 20600 + }, + { + "epoch": 1.4335396814356263, + "grad_norm": 1.9825126314484207, + "learning_rate": 6.246247777833116e-06, + "loss": 0.7508, + "step": 20610 + }, + { + "epoch": 1.434235236836614, + "grad_norm": 2.608723993862648, + "learning_rate": 6.2423284147981755e-06, + "loss": 0.7909, + "step": 20620 + }, + { + "epoch": 1.4349307922376018, + "grad_norm": 2.5589009517343357, + "learning_rate": 6.23840823801098e-06, + "loss": 0.7755, + "step": 20630 + }, + { + "epoch": 1.4356263476385895, + "grad_norm": 3.9378290638049775, + "learning_rate": 6.2344872500393314e-06, + "loss": 0.7646, + "step": 20640 + }, + { + "epoch": 1.436321903039577, + "grad_norm": 2.3522748851258193, + "learning_rate": 6.230565453451562e-06, + "loss": 0.7765, + "step": 20650 + }, + { + "epoch": 1.4370174584405648, + "grad_norm": 1.979086268949043, + "learning_rate": 6.226642850816533e-06, + "loss": 0.7915, + "step": 20660 + }, + { + "epoch": 1.4377130138415524, + "grad_norm": 1.9221846415917339, + "learning_rate": 6.2227194447036374e-06, + "loss": 0.7509, + "step": 20670 + }, + { + "epoch": 1.4384085692425401, + "grad_norm": 8.437535335863394, + "learning_rate": 6.21879523768279e-06, + "loss": 0.766, + "step": 20680 + }, + { + "epoch": 1.4391041246435279, + "grad_norm": 2.2740973156975346, + "learning_rate": 6.214870232324432e-06, + "loss": 0.7907, + "step": 20690 + }, + { + "epoch": 1.4397996800445156, + "grad_norm": 2.4384125128889247, + "learning_rate": 6.21094443119953e-06, + "loss": 0.7236, + "step": 20700 + }, + { + "epoch": 1.4404952354455032, + "grad_norm": 3.4425225960378563, + "learning_rate": 6.207017836879565e-06, + "loss": 0.7638, + "step": 20710 + }, + { + "epoch": 1.441190790846491, + "grad_norm": 4.0593619656124, + "learning_rate": 6.2030904519365475e-06, + "loss": 0.7826, + "step": 20720 + }, + { + "epoch": 1.4418863462474787, + "grad_norm": 2.2760163500835167, + "learning_rate": 6.199162278942997e-06, + "loss": 0.7297, + "step": 20730 + }, + { + "epoch": 1.4425819016484662, + "grad_norm": 2.812238217040176, + "learning_rate": 6.1952333204719525e-06, + "loss": 0.8218, + "step": 20740 + }, + { + "epoch": 1.443277457049454, + "grad_norm": 2.168243880712413, + "learning_rate": 6.19130357909697e-06, + "loss": 0.7792, + "step": 20750 + }, + { + "epoch": 1.4439730124504417, + "grad_norm": 2.0201773734885244, + "learning_rate": 6.187373057392115e-06, + "loss": 0.8095, + "step": 20760 + }, + { + "epoch": 1.4446685678514295, + "grad_norm": 2.520671443391966, + "learning_rate": 6.183441757931963e-06, + "loss": 0.7647, + "step": 20770 + }, + { + "epoch": 1.445364123252417, + "grad_norm": 2.02492643099457, + "learning_rate": 6.179509683291605e-06, + "loss": 0.8316, + "step": 20780 + }, + { + "epoch": 1.4460596786534048, + "grad_norm": 3.082660021622515, + "learning_rate": 6.175576836046632e-06, + "loss": 0.781, + "step": 20790 + }, + { + "epoch": 1.4467552340543923, + "grad_norm": 2.260141509298753, + "learning_rate": 6.171643218773145e-06, + "loss": 0.6953, + "step": 20800 + }, + { + "epoch": 1.44745078945538, + "grad_norm": 2.2384350821066628, + "learning_rate": 6.167708834047752e-06, + "loss": 0.7407, + "step": 20810 + }, + { + "epoch": 1.4481463448563678, + "grad_norm": 2.1740786314766845, + "learning_rate": 6.163773684447561e-06, + "loss": 0.7321, + "step": 20820 + }, + { + "epoch": 1.4488419002573556, + "grad_norm": 1.7569779124000335, + "learning_rate": 6.159837772550179e-06, + "loss": 0.7731, + "step": 20830 + }, + { + "epoch": 1.4495374556583431, + "grad_norm": 2.564458505297365, + "learning_rate": 6.155901100933713e-06, + "loss": 0.7733, + "step": 20840 + }, + { + "epoch": 1.450233011059331, + "grad_norm": 2.2820088630098585, + "learning_rate": 6.1519636721767725e-06, + "loss": 0.821, + "step": 20850 + }, + { + "epoch": 1.4509285664603186, + "grad_norm": 1.913762641297967, + "learning_rate": 6.148025488858458e-06, + "loss": 0.7114, + "step": 20860 + }, + { + "epoch": 1.4516241218613062, + "grad_norm": 2.144642473520173, + "learning_rate": 6.144086553558364e-06, + "loss": 0.7674, + "step": 20870 + }, + { + "epoch": 1.452319677262294, + "grad_norm": 1.9494746594163108, + "learning_rate": 6.14014686885658e-06, + "loss": 0.7656, + "step": 20880 + }, + { + "epoch": 1.4530152326632817, + "grad_norm": 1.779343773761145, + "learning_rate": 6.136206437333688e-06, + "loss": 0.7325, + "step": 20890 + }, + { + "epoch": 1.4537107880642695, + "grad_norm": 2.6749234299902414, + "learning_rate": 6.1322652615707535e-06, + "loss": 0.7951, + "step": 20900 + }, + { + "epoch": 1.454406343465257, + "grad_norm": 2.48949134073514, + "learning_rate": 6.128323344149334e-06, + "loss": 0.791, + "step": 20910 + }, + { + "epoch": 1.4551018988662447, + "grad_norm": 2.1902825405776443, + "learning_rate": 6.124380687651472e-06, + "loss": 0.7997, + "step": 20920 + }, + { + "epoch": 1.4557974542672323, + "grad_norm": 2.8510864621511485, + "learning_rate": 6.120437294659692e-06, + "loss": 0.7888, + "step": 20930 + }, + { + "epoch": 1.45649300966822, + "grad_norm": 2.1000172297912325, + "learning_rate": 6.116493167757005e-06, + "loss": 0.7396, + "step": 20940 + }, + { + "epoch": 1.4571885650692078, + "grad_norm": 2.1741873025864513, + "learning_rate": 6.112548309526899e-06, + "loss": 0.8245, + "step": 20950 + }, + { + "epoch": 1.4578841204701956, + "grad_norm": 1.6943358277111327, + "learning_rate": 6.108602722553343e-06, + "loss": 0.7396, + "step": 20960 + }, + { + "epoch": 1.458579675871183, + "grad_norm": 2.04164447816251, + "learning_rate": 6.1046564094207805e-06, + "loss": 0.7793, + "step": 20970 + }, + { + "epoch": 1.4592752312721708, + "grad_norm": 2.0816708947936813, + "learning_rate": 6.100709372714136e-06, + "loss": 0.8032, + "step": 20980 + }, + { + "epoch": 1.4599707866731584, + "grad_norm": 1.614014446605692, + "learning_rate": 6.096761615018802e-06, + "loss": 0.7593, + "step": 20990 + }, + { + "epoch": 1.4606663420741461, + "grad_norm": 3.3020503407469177, + "learning_rate": 6.092813138920647e-06, + "loss": 0.788, + "step": 21000 + }, + { + "epoch": 1.4606663420741461, + "eval_loss": 0.9725631475448608, + "eval_runtime": 1320.2441, + "eval_samples_per_second": 13.756, + "eval_steps_per_second": 2.293, + "step": 21000 + }, + { + "epoch": 1.461361897475134, + "grad_norm": 2.525578029764264, + "learning_rate": 6.08886394700601e-06, + "loss": 0.7755, + "step": 21010 + }, + { + "epoch": 1.4620574528761217, + "grad_norm": 3.335408107101738, + "learning_rate": 6.084914041861697e-06, + "loss": 0.7802, + "step": 21020 + }, + { + "epoch": 1.4627530082771092, + "grad_norm": 1.725008854191789, + "learning_rate": 6.080963426074982e-06, + "loss": 0.7329, + "step": 21030 + }, + { + "epoch": 1.463448563678097, + "grad_norm": 2.0513765178182113, + "learning_rate": 6.077012102233606e-06, + "loss": 0.7169, + "step": 21040 + }, + { + "epoch": 1.4641441190790847, + "grad_norm": 1.8628791129147024, + "learning_rate": 6.073060072925772e-06, + "loss": 0.7475, + "step": 21050 + }, + { + "epoch": 1.4648396744800722, + "grad_norm": 3.3284975263772703, + "learning_rate": 6.069107340740143e-06, + "loss": 0.7456, + "step": 21060 + }, + { + "epoch": 1.46553522988106, + "grad_norm": 2.0514368834898127, + "learning_rate": 6.065153908265848e-06, + "loss": 0.7258, + "step": 21070 + }, + { + "epoch": 1.4662307852820478, + "grad_norm": 2.493896667047074, + "learning_rate": 6.061199778092473e-06, + "loss": 0.767, + "step": 21080 + }, + { + "epoch": 1.4669263406830355, + "grad_norm": 2.3334460891207365, + "learning_rate": 6.057244952810053e-06, + "loss": 0.7655, + "step": 21090 + }, + { + "epoch": 1.467621896084023, + "grad_norm": 2.0580430506203413, + "learning_rate": 6.053289435009093e-06, + "loss": 0.7312, + "step": 21100 + }, + { + "epoch": 1.4683174514850108, + "grad_norm": 5.808356317665404, + "learning_rate": 6.049333227280539e-06, + "loss": 0.7365, + "step": 21110 + }, + { + "epoch": 1.4690130068859983, + "grad_norm": 2.0154246692965936, + "learning_rate": 6.045376332215793e-06, + "loss": 0.7191, + "step": 21120 + }, + { + "epoch": 1.469708562286986, + "grad_norm": 2.144737309798918, + "learning_rate": 6.04141875240671e-06, + "loss": 0.7639, + "step": 21130 + }, + { + "epoch": 1.4704041176879739, + "grad_norm": 2.1347359209087973, + "learning_rate": 6.0374604904455925e-06, + "loss": 0.7604, + "step": 21140 + }, + { + "epoch": 1.4710996730889616, + "grad_norm": 1.5532620579443108, + "learning_rate": 6.0335015489251845e-06, + "loss": 0.7053, + "step": 21150 + }, + { + "epoch": 1.4717952284899491, + "grad_norm": 1.9608402259796645, + "learning_rate": 6.029541930438681e-06, + "loss": 0.7775, + "step": 21160 + }, + { + "epoch": 1.472490783890937, + "grad_norm": 4.523405263525245, + "learning_rate": 6.025581637579719e-06, + "loss": 0.8056, + "step": 21170 + }, + { + "epoch": 1.4731863392919247, + "grad_norm": 2.778935566211686, + "learning_rate": 6.021620672942376e-06, + "loss": 0.7743, + "step": 21180 + }, + { + "epoch": 1.4738818946929122, + "grad_norm": 2.2387997736777105, + "learning_rate": 6.017659039121172e-06, + "loss": 0.7551, + "step": 21190 + }, + { + "epoch": 1.4745774500939, + "grad_norm": 2.141058606358974, + "learning_rate": 6.0136967387110625e-06, + "loss": 0.8029, + "step": 21200 + }, + { + "epoch": 1.4752730054948877, + "grad_norm": 1.8893515364714477, + "learning_rate": 6.009733774307441e-06, + "loss": 0.7623, + "step": 21210 + }, + { + "epoch": 1.4759685608958755, + "grad_norm": 2.2630129600510864, + "learning_rate": 6.005770148506135e-06, + "loss": 0.7531, + "step": 21220 + }, + { + "epoch": 1.476664116296863, + "grad_norm": 2.251299671773106, + "learning_rate": 6.0018058639034086e-06, + "loss": 0.7131, + "step": 21230 + }, + { + "epoch": 1.4773596716978508, + "grad_norm": 2.06055010939884, + "learning_rate": 5.997840923095953e-06, + "loss": 0.7568, + "step": 21240 + }, + { + "epoch": 1.4780552270988383, + "grad_norm": 4.213130235252458, + "learning_rate": 5.993875328680888e-06, + "loss": 0.7534, + "step": 21250 + }, + { + "epoch": 1.478750782499826, + "grad_norm": 3.0781253147634273, + "learning_rate": 5.98990908325577e-06, + "loss": 0.7532, + "step": 21260 + }, + { + "epoch": 1.4794463379008138, + "grad_norm": 1.7674058267012773, + "learning_rate": 5.985942189418574e-06, + "loss": 0.7505, + "step": 21270 + }, + { + "epoch": 1.4801418933018016, + "grad_norm": 4.862981041520095, + "learning_rate": 5.981974649767702e-06, + "loss": 0.786, + "step": 21280 + }, + { + "epoch": 1.480837448702789, + "grad_norm": 2.622835004527119, + "learning_rate": 5.97800646690198e-06, + "loss": 0.7424, + "step": 21290 + }, + { + "epoch": 1.4815330041037769, + "grad_norm": 8.130413058548863, + "learning_rate": 5.974037643420654e-06, + "loss": 0.7804, + "step": 21300 + }, + { + "epoch": 1.4822285595047646, + "grad_norm": 2.120065808916282, + "learning_rate": 5.970068181923388e-06, + "loss": 0.7352, + "step": 21310 + }, + { + "epoch": 1.4829241149057522, + "grad_norm": 2.1668513573996813, + "learning_rate": 5.96609808501027e-06, + "loss": 0.7994, + "step": 21320 + }, + { + "epoch": 1.48361967030674, + "grad_norm": 2.2159811193182577, + "learning_rate": 5.962127355281798e-06, + "loss": 0.7838, + "step": 21330 + }, + { + "epoch": 1.4843152257077277, + "grad_norm": 1.7785247589095918, + "learning_rate": 5.9581559953388866e-06, + "loss": 0.7526, + "step": 21340 + }, + { + "epoch": 1.4850107811087154, + "grad_norm": 1.9969124033737091, + "learning_rate": 5.954184007782864e-06, + "loss": 0.7237, + "step": 21350 + }, + { + "epoch": 1.485706336509703, + "grad_norm": 1.9871301114589144, + "learning_rate": 5.950211395215468e-06, + "loss": 0.7481, + "step": 21360 + }, + { + "epoch": 1.4864018919106907, + "grad_norm": 1.9967011956092233, + "learning_rate": 5.946238160238847e-06, + "loss": 0.7823, + "step": 21370 + }, + { + "epoch": 1.4870974473116783, + "grad_norm": 2.0405059146676385, + "learning_rate": 5.9422643054555575e-06, + "loss": 0.7854, + "step": 21380 + }, + { + "epoch": 1.487793002712666, + "grad_norm": 2.2950011235484364, + "learning_rate": 5.93828983346856e-06, + "loss": 0.7781, + "step": 21390 + }, + { + "epoch": 1.4884885581136538, + "grad_norm": 2.294847118371643, + "learning_rate": 5.934314746881221e-06, + "loss": 0.7472, + "step": 21400 + }, + { + "epoch": 1.4891841135146415, + "grad_norm": 1.8735727621528149, + "learning_rate": 5.930339048297308e-06, + "loss": 0.7763, + "step": 21410 + }, + { + "epoch": 1.489879668915629, + "grad_norm": 1.73654288157397, + "learning_rate": 5.926362740320995e-06, + "loss": 0.7603, + "step": 21420 + }, + { + "epoch": 1.4905752243166168, + "grad_norm": 1.778854884671813, + "learning_rate": 5.922385825556844e-06, + "loss": 0.7183, + "step": 21430 + }, + { + "epoch": 1.4912707797176046, + "grad_norm": 4.6142537088484445, + "learning_rate": 5.918408306609825e-06, + "loss": 0.7741, + "step": 21440 + }, + { + "epoch": 1.4919663351185921, + "grad_norm": 2.1860881857708545, + "learning_rate": 5.9144301860852984e-06, + "loss": 0.846, + "step": 21450 + }, + { + "epoch": 1.4926618905195799, + "grad_norm": 3.637997813932914, + "learning_rate": 5.910451466589022e-06, + "loss": 0.8524, + "step": 21460 + }, + { + "epoch": 1.4933574459205676, + "grad_norm": 2.415830801695245, + "learning_rate": 5.906472150727143e-06, + "loss": 0.7295, + "step": 21470 + }, + { + "epoch": 1.4940530013215554, + "grad_norm": 1.4412794654511272, + "learning_rate": 5.902492241106197e-06, + "loss": 0.7604, + "step": 21480 + }, + { + "epoch": 1.494748556722543, + "grad_norm": 4.977789904656446, + "learning_rate": 5.898511740333118e-06, + "loss": 0.769, + "step": 21490 + }, + { + "epoch": 1.4954441121235307, + "grad_norm": 2.2129563492982918, + "learning_rate": 5.8945306510152165e-06, + "loss": 0.7625, + "step": 21500 + }, + { + "epoch": 1.4954441121235307, + "eval_loss": 0.9703852534294128, + "eval_runtime": 1322.8137, + "eval_samples_per_second": 13.729, + "eval_steps_per_second": 2.288, + "step": 21500 + }, + { + "epoch": 1.4961396675245182, + "grad_norm": 3.0139168677027093, + "learning_rate": 5.890548975760193e-06, + "loss": 0.6883, + "step": 21510 + }, + { + "epoch": 1.496835222925506, + "grad_norm": 2.0153398542850898, + "learning_rate": 5.8865667171761345e-06, + "loss": 0.8049, + "step": 21520 + }, + { + "epoch": 1.4975307783264937, + "grad_norm": 3.3678835914849463, + "learning_rate": 5.882583877871506e-06, + "loss": 0.8434, + "step": 21530 + }, + { + "epoch": 1.4982263337274815, + "grad_norm": 3.945832914905536, + "learning_rate": 5.878600460455152e-06, + "loss": 0.7768, + "step": 21540 + }, + { + "epoch": 1.498921889128469, + "grad_norm": 1.7688820947309494, + "learning_rate": 5.874616467536301e-06, + "loss": 0.7313, + "step": 21550 + }, + { + "epoch": 1.4996174445294568, + "grad_norm": 2.7507940482287374, + "learning_rate": 5.870631901724556e-06, + "loss": 0.7266, + "step": 21560 + }, + { + "epoch": 1.5003129999304443, + "grad_norm": 3.2078289771335258, + "learning_rate": 5.866646765629891e-06, + "loss": 0.7697, + "step": 21570 + }, + { + "epoch": 1.501008555331432, + "grad_norm": 2.1561515197476084, + "learning_rate": 5.86266106186266e-06, + "loss": 0.8055, + "step": 21580 + }, + { + "epoch": 1.5017041107324198, + "grad_norm": 2.098456756832535, + "learning_rate": 5.8586747930335856e-06, + "loss": 0.8178, + "step": 21590 + }, + { + "epoch": 1.5023996661334076, + "grad_norm": 2.0637317461974143, + "learning_rate": 5.85468796175376e-06, + "loss": 0.7738, + "step": 21600 + }, + { + "epoch": 1.5030952215343953, + "grad_norm": 2.0696283813307037, + "learning_rate": 5.850700570634646e-06, + "loss": 0.7493, + "step": 21610 + }, + { + "epoch": 1.5037907769353829, + "grad_norm": 1.8933288376697082, + "learning_rate": 5.846712622288071e-06, + "loss": 0.7617, + "step": 21620 + }, + { + "epoch": 1.5044863323363706, + "grad_norm": 2.5724832120095025, + "learning_rate": 5.8427241193262296e-06, + "loss": 0.7401, + "step": 21630 + }, + { + "epoch": 1.5051818877373582, + "grad_norm": 1.5035070522563645, + "learning_rate": 5.838735064361677e-06, + "loss": 0.7823, + "step": 21640 + }, + { + "epoch": 1.505877443138346, + "grad_norm": 2.1376266775900983, + "learning_rate": 5.8347454600073315e-06, + "loss": 0.7768, + "step": 21650 + }, + { + "epoch": 1.5065729985393337, + "grad_norm": 2.433882328655935, + "learning_rate": 5.830755308876473e-06, + "loss": 0.74, + "step": 21660 + }, + { + "epoch": 1.5072685539403214, + "grad_norm": 1.6989498124233287, + "learning_rate": 5.826764613582734e-06, + "loss": 0.8179, + "step": 21670 + }, + { + "epoch": 1.5079641093413092, + "grad_norm": 2.1253528413407725, + "learning_rate": 5.822773376740111e-06, + "loss": 0.7352, + "step": 21680 + }, + { + "epoch": 1.5086596647422967, + "grad_norm": 2.1263079393508413, + "learning_rate": 5.818781600962948e-06, + "loss": 0.7422, + "step": 21690 + }, + { + "epoch": 1.5093552201432843, + "grad_norm": 6.50002370772053, + "learning_rate": 5.814789288865949e-06, + "loss": 0.777, + "step": 21700 + }, + { + "epoch": 1.510050775544272, + "grad_norm": 2.2194995566180276, + "learning_rate": 5.810796443064161e-06, + "loss": 0.7313, + "step": 21710 + }, + { + "epoch": 1.5107463309452598, + "grad_norm": 2.405991622045553, + "learning_rate": 5.80680306617299e-06, + "loss": 0.7724, + "step": 21720 + }, + { + "epoch": 1.5114418863462475, + "grad_norm": 2.2089175194590145, + "learning_rate": 5.802809160808181e-06, + "loss": 0.7628, + "step": 21730 + }, + { + "epoch": 1.5121374417472353, + "grad_norm": 2.4754922578998015, + "learning_rate": 5.798814729585833e-06, + "loss": 0.759, + "step": 21740 + }, + { + "epoch": 1.5128329971482228, + "grad_norm": 2.0131121293890644, + "learning_rate": 5.794819775122385e-06, + "loss": 0.7481, + "step": 21750 + }, + { + "epoch": 1.5135285525492106, + "grad_norm": 2.056813218792843, + "learning_rate": 5.790824300034617e-06, + "loss": 0.7416, + "step": 21760 + }, + { + "epoch": 1.5142241079501981, + "grad_norm": 2.158425591645511, + "learning_rate": 5.786828306939653e-06, + "loss": 0.7292, + "step": 21770 + }, + { + "epoch": 1.5149196633511859, + "grad_norm": 2.293366739493596, + "learning_rate": 5.782831798454958e-06, + "loss": 0.8075, + "step": 21780 + }, + { + "epoch": 1.5156152187521736, + "grad_norm": 5.440030849787066, + "learning_rate": 5.77883477719833e-06, + "loss": 0.755, + "step": 21790 + }, + { + "epoch": 1.5163107741531614, + "grad_norm": 1.9810782091959125, + "learning_rate": 5.7748372457879055e-06, + "loss": 0.765, + "step": 21800 + }, + { + "epoch": 1.517006329554149, + "grad_norm": 1.7376575025190206, + "learning_rate": 5.770839206842158e-06, + "loss": 0.7274, + "step": 21810 + }, + { + "epoch": 1.5177018849551367, + "grad_norm": 2.4512409342111443, + "learning_rate": 5.766840662979887e-06, + "loss": 0.7878, + "step": 21820 + }, + { + "epoch": 1.5183974403561242, + "grad_norm": 1.6904299020929023, + "learning_rate": 5.762841616820226e-06, + "loss": 0.7935, + "step": 21830 + }, + { + "epoch": 1.519092995757112, + "grad_norm": 6.510510755685887, + "learning_rate": 5.75884207098264e-06, + "loss": 0.6993, + "step": 21840 + }, + { + "epoch": 1.5197885511580997, + "grad_norm": 2.0610957568943515, + "learning_rate": 5.754842028086919e-06, + "loss": 0.8328, + "step": 21850 + }, + { + "epoch": 1.5204841065590875, + "grad_norm": 2.7540216764973198, + "learning_rate": 5.750841490753174e-06, + "loss": 0.7836, + "step": 21860 + }, + { + "epoch": 1.5211796619600753, + "grad_norm": 1.8952501353200586, + "learning_rate": 5.746840461601849e-06, + "loss": 0.7479, + "step": 21870 + }, + { + "epoch": 1.5218752173610628, + "grad_norm": 2.0279918722250327, + "learning_rate": 5.742838943253706e-06, + "loss": 0.8039, + "step": 21880 + }, + { + "epoch": 1.5225707727620506, + "grad_norm": 1.8140352544740481, + "learning_rate": 5.738836938329823e-06, + "loss": 0.7489, + "step": 21890 + }, + { + "epoch": 1.523266328163038, + "grad_norm": 2.0170509133813095, + "learning_rate": 5.734834449451603e-06, + "loss": 0.8193, + "step": 21900 + }, + { + "epoch": 1.5239618835640258, + "grad_norm": 2.325609018721336, + "learning_rate": 5.730831479240763e-06, + "loss": 0.8077, + "step": 21910 + }, + { + "epoch": 1.5246574389650136, + "grad_norm": 1.9278246382696955, + "learning_rate": 5.726828030319337e-06, + "loss": 0.7312, + "step": 21920 + }, + { + "epoch": 1.5253529943660014, + "grad_norm": 2.2099623132953736, + "learning_rate": 5.72282410530967e-06, + "loss": 0.7372, + "step": 21930 + }, + { + "epoch": 1.526048549766989, + "grad_norm": 2.9481816442769166, + "learning_rate": 5.718819706834422e-06, + "loss": 0.7641, + "step": 21940 + }, + { + "epoch": 1.5267441051679767, + "grad_norm": 2.297721722726679, + "learning_rate": 5.71481483751656e-06, + "loss": 0.7546, + "step": 21950 + }, + { + "epoch": 1.5274396605689642, + "grad_norm": 3.3082605982215343, + "learning_rate": 5.710809499979362e-06, + "loss": 0.71, + "step": 21960 + }, + { + "epoch": 1.528135215969952, + "grad_norm": 2.6450798339375226, + "learning_rate": 5.706803696846411e-06, + "loss": 0.7715, + "step": 21970 + }, + { + "epoch": 1.5288307713709397, + "grad_norm": 1.7145010271706875, + "learning_rate": 5.702797430741596e-06, + "loss": 0.7585, + "step": 21980 + }, + { + "epoch": 1.5295263267719275, + "grad_norm": 2.2122836901676357, + "learning_rate": 5.698790704289108e-06, + "loss": 0.7556, + "step": 21990 + }, + { + "epoch": 1.5302218821729152, + "grad_norm": 2.589852490005932, + "learning_rate": 5.694783520113442e-06, + "loss": 0.6927, + "step": 22000 + }, + { + "epoch": 1.5302218821729152, + "eval_loss": 0.9693441390991211, + "eval_runtime": 1322.156, + "eval_samples_per_second": 13.736, + "eval_steps_per_second": 2.289, + "step": 22000 + }, + { + "epoch": 1.5309174375739028, + "grad_norm": 2.0491768744937917, + "learning_rate": 5.690775880839389e-06, + "loss": 0.7091, + "step": 22010 + }, + { + "epoch": 1.5316129929748903, + "grad_norm": 1.9023040597927776, + "learning_rate": 5.686767789092041e-06, + "loss": 0.7402, + "step": 22020 + }, + { + "epoch": 1.532308548375878, + "grad_norm": 1.9889607835062806, + "learning_rate": 5.6827592474967875e-06, + "loss": 0.802, + "step": 22030 + }, + { + "epoch": 1.5330041037768658, + "grad_norm": 2.789304609191272, + "learning_rate": 5.678750258679309e-06, + "loss": 0.7227, + "step": 22040 + }, + { + "epoch": 1.5336996591778536, + "grad_norm": 2.2828443469204442, + "learning_rate": 5.6747408252655815e-06, + "loss": 0.7744, + "step": 22050 + }, + { + "epoch": 1.5343952145788413, + "grad_norm": 1.9818700246760543, + "learning_rate": 5.67073094988187e-06, + "loss": 0.7475, + "step": 22060 + }, + { + "epoch": 1.5350907699798289, + "grad_norm": 1.9885023824669574, + "learning_rate": 5.6667206351547325e-06, + "loss": 0.7275, + "step": 22070 + }, + { + "epoch": 1.5357863253808166, + "grad_norm": 3.9409059752232167, + "learning_rate": 5.662709883711011e-06, + "loss": 0.7498, + "step": 22080 + }, + { + "epoch": 1.5364818807818041, + "grad_norm": 1.767806401030905, + "learning_rate": 5.658698698177837e-06, + "loss": 0.7551, + "step": 22090 + }, + { + "epoch": 1.537177436182792, + "grad_norm": 3.0681523902966994, + "learning_rate": 5.654687081182624e-06, + "loss": 0.7556, + "step": 22100 + }, + { + "epoch": 1.5378729915837797, + "grad_norm": 1.5639400532296532, + "learning_rate": 5.650675035353068e-06, + "loss": 0.8423, + "step": 22110 + }, + { + "epoch": 1.5385685469847674, + "grad_norm": 3.1444844723311425, + "learning_rate": 5.646662563317146e-06, + "loss": 0.7692, + "step": 22120 + }, + { + "epoch": 1.5392641023857552, + "grad_norm": 1.687075816269296, + "learning_rate": 5.642649667703119e-06, + "loss": 0.709, + "step": 22130 + }, + { + "epoch": 1.5399596577867427, + "grad_norm": 2.4073565980911633, + "learning_rate": 5.638636351139518e-06, + "loss": 0.748, + "step": 22140 + }, + { + "epoch": 1.5406552131877302, + "grad_norm": 1.9685290861869633, + "learning_rate": 5.634622616255152e-06, + "loss": 0.7194, + "step": 22150 + }, + { + "epoch": 1.541350768588718, + "grad_norm": 2.7618884797728294, + "learning_rate": 5.6306084656791074e-06, + "loss": 0.7446, + "step": 22160 + }, + { + "epoch": 1.5420463239897058, + "grad_norm": 5.160975170614174, + "learning_rate": 5.626593902040741e-06, + "loss": 0.7508, + "step": 22170 + }, + { + "epoch": 1.5427418793906935, + "grad_norm": 2.1228635826701603, + "learning_rate": 5.622578927969676e-06, + "loss": 0.7934, + "step": 22180 + }, + { + "epoch": 1.5434374347916813, + "grad_norm": 5.226906042965373, + "learning_rate": 5.618563546095812e-06, + "loss": 0.7275, + "step": 22190 + }, + { + "epoch": 1.5441329901926688, + "grad_norm": 2.016477976688773, + "learning_rate": 5.614547759049311e-06, + "loss": 0.7394, + "step": 22200 + }, + { + "epoch": 1.5448285455936566, + "grad_norm": 2.374464800107822, + "learning_rate": 5.610531569460599e-06, + "loss": 0.7376, + "step": 22210 + }, + { + "epoch": 1.545524100994644, + "grad_norm": 1.78443706603502, + "learning_rate": 5.606514979960372e-06, + "loss": 0.7247, + "step": 22220 + }, + { + "epoch": 1.5462196563956319, + "grad_norm": 2.987861195418624, + "learning_rate": 5.6024979931795786e-06, + "loss": 0.6865, + "step": 22230 + }, + { + "epoch": 1.5469152117966196, + "grad_norm": 2.4678683222360545, + "learning_rate": 5.598480611749437e-06, + "loss": 0.8123, + "step": 22240 + }, + { + "epoch": 1.5476107671976074, + "grad_norm": 2.0867922256123803, + "learning_rate": 5.59446283830142e-06, + "loss": 0.7682, + "step": 22250 + }, + { + "epoch": 1.5483063225985951, + "grad_norm": 2.432146587995162, + "learning_rate": 5.590444675467253e-06, + "loss": 0.7663, + "step": 22260 + }, + { + "epoch": 1.5490018779995827, + "grad_norm": 2.9282640954670525, + "learning_rate": 5.586426125878926e-06, + "loss": 0.7704, + "step": 22270 + }, + { + "epoch": 1.5496974334005702, + "grad_norm": 2.19713650555673, + "learning_rate": 5.582407192168672e-06, + "loss": 0.6973, + "step": 22280 + }, + { + "epoch": 1.550392988801558, + "grad_norm": 2.6679776574579934, + "learning_rate": 5.578387876968982e-06, + "loss": 0.6986, + "step": 22290 + }, + { + "epoch": 1.5510885442025457, + "grad_norm": 2.4690558222090786, + "learning_rate": 5.574368182912596e-06, + "loss": 0.7702, + "step": 22300 + }, + { + "epoch": 1.5517840996035335, + "grad_norm": 2.1084540480855867, + "learning_rate": 5.5703481126325006e-06, + "loss": 0.7525, + "step": 22310 + }, + { + "epoch": 1.5524796550045212, + "grad_norm": 2.1098224718286085, + "learning_rate": 5.56632766876193e-06, + "loss": 0.7439, + "step": 22320 + }, + { + "epoch": 1.5531752104055088, + "grad_norm": 2.1182845783877675, + "learning_rate": 5.562306853934364e-06, + "loss": 0.7697, + "step": 22330 + }, + { + "epoch": 1.5538707658064965, + "grad_norm": 1.9216837217049865, + "learning_rate": 5.558285670783521e-06, + "loss": 0.7557, + "step": 22340 + }, + { + "epoch": 1.554566321207484, + "grad_norm": 2.2811981186541375, + "learning_rate": 5.554264121943367e-06, + "loss": 0.673, + "step": 22350 + }, + { + "epoch": 1.5552618766084718, + "grad_norm": 4.814513362668669, + "learning_rate": 5.550242210048102e-06, + "loss": 0.6989, + "step": 22360 + }, + { + "epoch": 1.5559574320094596, + "grad_norm": 2.398116514510467, + "learning_rate": 5.546219937732169e-06, + "loss": 0.7726, + "step": 22370 + }, + { + "epoch": 1.5566529874104473, + "grad_norm": 2.8721027831434487, + "learning_rate": 5.542197307630241e-06, + "loss": 0.7868, + "step": 22380 + }, + { + "epoch": 1.557348542811435, + "grad_norm": 6.003691128632016, + "learning_rate": 5.538174322377231e-06, + "loss": 0.7451, + "step": 22390 + }, + { + "epoch": 1.5580440982124226, + "grad_norm": 2.314116366596435, + "learning_rate": 5.534150984608281e-06, + "loss": 0.8222, + "step": 22400 + }, + { + "epoch": 1.5587396536134102, + "grad_norm": 1.6406218824969285, + "learning_rate": 5.5301272969587665e-06, + "loss": 0.7016, + "step": 22410 + }, + { + "epoch": 1.559435209014398, + "grad_norm": 2.5356711554297107, + "learning_rate": 5.526103262064289e-06, + "loss": 0.7261, + "step": 22420 + }, + { + "epoch": 1.5601307644153857, + "grad_norm": 1.8317839395652284, + "learning_rate": 5.522078882560679e-06, + "loss": 0.734, + "step": 22430 + }, + { + "epoch": 1.5608263198163734, + "grad_norm": 2.082371993756292, + "learning_rate": 5.518054161083994e-06, + "loss": 0.7973, + "step": 22440 + }, + { + "epoch": 1.5615218752173612, + "grad_norm": 2.242635257563893, + "learning_rate": 5.514029100270517e-06, + "loss": 0.7826, + "step": 22450 + }, + { + "epoch": 1.5622174306183487, + "grad_norm": 2.7455195338392557, + "learning_rate": 5.5100037027567476e-06, + "loss": 0.7616, + "step": 22460 + }, + { + "epoch": 1.5629129860193365, + "grad_norm": 2.451227766363562, + "learning_rate": 5.5059779711794085e-06, + "loss": 0.7896, + "step": 22470 + }, + { + "epoch": 1.563608541420324, + "grad_norm": 2.25814581597676, + "learning_rate": 5.501951908175445e-06, + "loss": 0.8026, + "step": 22480 + }, + { + "epoch": 1.5643040968213118, + "grad_norm": 2.0351333132111904, + "learning_rate": 5.497925516382014e-06, + "loss": 0.7758, + "step": 22490 + }, + { + "epoch": 1.5649996522222995, + "grad_norm": 2.598803767542335, + "learning_rate": 5.493898798436489e-06, + "loss": 0.7156, + "step": 22500 + }, + { + "epoch": 1.5649996522222995, + "eval_loss": 0.9650686979293823, + "eval_runtime": 1324.2807, + "eval_samples_per_second": 13.714, + "eval_steps_per_second": 2.286, + "step": 22500 + }, + { + "epoch": 1.5656952076232873, + "grad_norm": 2.3262130226531026, + "learning_rate": 5.4898717569764615e-06, + "loss": 0.7184, + "step": 22510 + }, + { + "epoch": 1.566390763024275, + "grad_norm": 2.036095831406067, + "learning_rate": 5.48584439463973e-06, + "loss": 0.769, + "step": 22520 + }, + { + "epoch": 1.5670863184252626, + "grad_norm": 2.151904758544463, + "learning_rate": 5.481816714064304e-06, + "loss": 0.7273, + "step": 22530 + }, + { + "epoch": 1.5677818738262501, + "grad_norm": 2.130648135659355, + "learning_rate": 5.477788717888404e-06, + "loss": 0.7741, + "step": 22540 + }, + { + "epoch": 1.5684774292272379, + "grad_norm": 2.147043561758392, + "learning_rate": 5.473760408750455e-06, + "loss": 0.7201, + "step": 22550 + }, + { + "epoch": 1.5691729846282256, + "grad_norm": 1.758784405777554, + "learning_rate": 5.469731789289087e-06, + "loss": 0.7747, + "step": 22560 + }, + { + "epoch": 1.5698685400292134, + "grad_norm": 1.6793016459814032, + "learning_rate": 5.465702862143133e-06, + "loss": 0.7622, + "step": 22570 + }, + { + "epoch": 1.5705640954302011, + "grad_norm": 2.108144856932845, + "learning_rate": 5.461673629951629e-06, + "loss": 0.7502, + "step": 22580 + }, + { + "epoch": 1.5712596508311887, + "grad_norm": 1.9579807173143484, + "learning_rate": 5.457644095353812e-06, + "loss": 0.7737, + "step": 22590 + }, + { + "epoch": 1.5719552062321764, + "grad_norm": 2.262357266423172, + "learning_rate": 5.453614260989113e-06, + "loss": 0.7902, + "step": 22600 + }, + { + "epoch": 1.572650761633164, + "grad_norm": 1.6920117571559292, + "learning_rate": 5.4495841294971616e-06, + "loss": 0.7142, + "step": 22610 + }, + { + "epoch": 1.5733463170341517, + "grad_norm": 2.6464465448142462, + "learning_rate": 5.445553703517783e-06, + "loss": 0.8202, + "step": 22620 + }, + { + "epoch": 1.5740418724351395, + "grad_norm": 2.8255302037116667, + "learning_rate": 5.4415229856909936e-06, + "loss": 0.7635, + "step": 22630 + }, + { + "epoch": 1.5747374278361272, + "grad_norm": 1.9556234707409361, + "learning_rate": 5.4374919786570015e-06, + "loss": 0.7146, + "step": 22640 + }, + { + "epoch": 1.5754329832371148, + "grad_norm": 1.8482566950427606, + "learning_rate": 5.433460685056204e-06, + "loss": 0.7298, + "step": 22650 + }, + { + "epoch": 1.5761285386381025, + "grad_norm": 2.306149138098641, + "learning_rate": 5.42942910752919e-06, + "loss": 0.7631, + "step": 22660 + }, + { + "epoch": 1.57682409403909, + "grad_norm": 1.681869257080923, + "learning_rate": 5.425397248716725e-06, + "loss": 0.7823, + "step": 22670 + }, + { + "epoch": 1.5775196494400778, + "grad_norm": 2.3103327472857815, + "learning_rate": 5.4213651112597685e-06, + "loss": 0.6852, + "step": 22680 + }, + { + "epoch": 1.5782152048410656, + "grad_norm": 3.1445928350016543, + "learning_rate": 5.417332697799459e-06, + "loss": 0.7961, + "step": 22690 + }, + { + "epoch": 1.5789107602420533, + "grad_norm": 2.152549424350282, + "learning_rate": 5.413300010977113e-06, + "loss": 0.7951, + "step": 22700 + }, + { + "epoch": 1.579606315643041, + "grad_norm": 2.306210253306877, + "learning_rate": 5.40926705343423e-06, + "loss": 0.7857, + "step": 22710 + }, + { + "epoch": 1.5803018710440286, + "grad_norm": 2.324333672284284, + "learning_rate": 5.405233827812485e-06, + "loss": 0.7202, + "step": 22720 + }, + { + "epoch": 1.5809974264450164, + "grad_norm": 4.687354431943819, + "learning_rate": 5.401200336753729e-06, + "loss": 0.7729, + "step": 22730 + }, + { + "epoch": 1.581692981846004, + "grad_norm": 2.396615624029379, + "learning_rate": 5.397166582899987e-06, + "loss": 0.772, + "step": 22740 + }, + { + "epoch": 1.5823885372469917, + "grad_norm": 1.829345282192994, + "learning_rate": 5.393132568893454e-06, + "loss": 0.7578, + "step": 22750 + }, + { + "epoch": 1.5830840926479794, + "grad_norm": 1.771670895378471, + "learning_rate": 5.389098297376499e-06, + "loss": 0.7326, + "step": 22760 + }, + { + "epoch": 1.5837796480489672, + "grad_norm": 2.4577467108497655, + "learning_rate": 5.3850637709916596e-06, + "loss": 0.7836, + "step": 22770 + }, + { + "epoch": 1.5844752034499547, + "grad_norm": 1.6916181496691374, + "learning_rate": 5.381028992381637e-06, + "loss": 0.7114, + "step": 22780 + }, + { + "epoch": 1.5851707588509425, + "grad_norm": 1.885504151141085, + "learning_rate": 5.3769939641892975e-06, + "loss": 0.7451, + "step": 22790 + }, + { + "epoch": 1.58586631425193, + "grad_norm": 1.8959538821813628, + "learning_rate": 5.372958689057677e-06, + "loss": 0.7637, + "step": 22800 + }, + { + "epoch": 1.5865618696529178, + "grad_norm": 1.9298579581750404, + "learning_rate": 5.368923169629965e-06, + "loss": 0.7089, + "step": 22810 + }, + { + "epoch": 1.5872574250539055, + "grad_norm": 1.8757102680425193, + "learning_rate": 5.364887408549515e-06, + "loss": 0.754, + "step": 22820 + }, + { + "epoch": 1.5879529804548933, + "grad_norm": 2.7006176996106537, + "learning_rate": 5.360851408459842e-06, + "loss": 0.7637, + "step": 22830 + }, + { + "epoch": 1.588648535855881, + "grad_norm": 3.4474644965720422, + "learning_rate": 5.356815172004613e-06, + "loss": 0.7627, + "step": 22840 + }, + { + "epoch": 1.5893440912568686, + "grad_norm": 2.252959124320293, + "learning_rate": 5.352778701827648e-06, + "loss": 0.7636, + "step": 22850 + }, + { + "epoch": 1.5900396466578564, + "grad_norm": 2.652647819460997, + "learning_rate": 5.348742000572926e-06, + "loss": 0.7813, + "step": 22860 + }, + { + "epoch": 1.590735202058844, + "grad_norm": 1.6224826150191327, + "learning_rate": 5.344705070884575e-06, + "loss": 0.7502, + "step": 22870 + }, + { + "epoch": 1.5914307574598316, + "grad_norm": 2.0366487803273405, + "learning_rate": 5.340667915406871e-06, + "loss": 0.6844, + "step": 22880 + }, + { + "epoch": 1.5921263128608194, + "grad_norm": 2.290734844680201, + "learning_rate": 5.3366305367842395e-06, + "loss": 0.7307, + "step": 22890 + }, + { + "epoch": 1.5928218682618072, + "grad_norm": 1.8794237567572478, + "learning_rate": 5.33259293766125e-06, + "loss": 0.7603, + "step": 22900 + }, + { + "epoch": 1.5935174236627947, + "grad_norm": 2.471509237040509, + "learning_rate": 5.328555120682622e-06, + "loss": 0.7885, + "step": 22910 + }, + { + "epoch": 1.5942129790637825, + "grad_norm": 2.039838337218338, + "learning_rate": 5.324517088493209e-06, + "loss": 0.686, + "step": 22920 + }, + { + "epoch": 1.59490853446477, + "grad_norm": 2.1871796196854665, + "learning_rate": 5.320478843738014e-06, + "loss": 0.7634, + "step": 22930 + }, + { + "epoch": 1.5956040898657577, + "grad_norm": 5.385316183891809, + "learning_rate": 5.316440389062174e-06, + "loss": 0.752, + "step": 22940 + }, + { + "epoch": 1.5962996452667455, + "grad_norm": 2.181236174541731, + "learning_rate": 5.312401727110965e-06, + "loss": 0.7258, + "step": 22950 + }, + { + "epoch": 1.5969952006677333, + "grad_norm": 2.9021802541377006, + "learning_rate": 5.3083628605298e-06, + "loss": 0.7791, + "step": 22960 + }, + { + "epoch": 1.597690756068721, + "grad_norm": 1.9175959967321283, + "learning_rate": 5.304323791964223e-06, + "loss": 0.8041, + "step": 22970 + }, + { + "epoch": 1.5983863114697086, + "grad_norm": 1.9563579561044449, + "learning_rate": 5.300284524059913e-06, + "loss": 0.7453, + "step": 22980 + }, + { + "epoch": 1.599081866870696, + "grad_norm": 2.085865012430991, + "learning_rate": 5.296245059462679e-06, + "loss": 0.7334, + "step": 22990 + }, + { + "epoch": 1.5997774222716838, + "grad_norm": 1.9645199658193513, + "learning_rate": 5.29220540081846e-06, + "loss": 0.7371, + "step": 23000 + }, + { + "epoch": 1.5997774222716838, + "eval_loss": 0.9619734287261963, + "eval_runtime": 1321.956, + "eval_samples_per_second": 13.738, + "eval_steps_per_second": 2.29, + "step": 23000 + }, + { + "epoch": 1.6004729776726716, + "grad_norm": 2.693615247839893, + "learning_rate": 5.288165550773318e-06, + "loss": 0.7576, + "step": 23010 + }, + { + "epoch": 1.6011685330736594, + "grad_norm": 1.8903552155646066, + "learning_rate": 5.284125511973444e-06, + "loss": 0.7655, + "step": 23020 + }, + { + "epoch": 1.6018640884746471, + "grad_norm": 1.6867368368679174, + "learning_rate": 5.2800852870651505e-06, + "loss": 0.7145, + "step": 23030 + }, + { + "epoch": 1.6025596438756347, + "grad_norm": 1.9209681328513388, + "learning_rate": 5.276044878694877e-06, + "loss": 0.7852, + "step": 23040 + }, + { + "epoch": 1.6032551992766224, + "grad_norm": 2.5716705359777055, + "learning_rate": 5.272004289509175e-06, + "loss": 0.6851, + "step": 23050 + }, + { + "epoch": 1.60395075467761, + "grad_norm": 3.6156282007436324, + "learning_rate": 5.2679635221547205e-06, + "loss": 0.7653, + "step": 23060 + }, + { + "epoch": 1.6046463100785977, + "grad_norm": 1.646549955354649, + "learning_rate": 5.263922579278306e-06, + "loss": 0.7211, + "step": 23070 + }, + { + "epoch": 1.6053418654795855, + "grad_norm": 1.804976470100993, + "learning_rate": 5.259881463526832e-06, + "loss": 0.7812, + "step": 23080 + }, + { + "epoch": 1.6060374208805732, + "grad_norm": 1.9538207152473415, + "learning_rate": 5.255840177547326e-06, + "loss": 0.7702, + "step": 23090 + }, + { + "epoch": 1.606732976281561, + "grad_norm": 2.6463158879131052, + "learning_rate": 5.251798723986912e-06, + "loss": 0.7653, + "step": 23100 + }, + { + "epoch": 1.6074285316825485, + "grad_norm": 2.264303491304014, + "learning_rate": 5.24775710549283e-06, + "loss": 0.7353, + "step": 23110 + }, + { + "epoch": 1.608124087083536, + "grad_norm": 1.8974362189998375, + "learning_rate": 5.243715324712434e-06, + "loss": 0.724, + "step": 23120 + }, + { + "epoch": 1.6088196424845238, + "grad_norm": 2.64210095924254, + "learning_rate": 5.239673384293173e-06, + "loss": 0.7518, + "step": 23130 + }, + { + "epoch": 1.6095151978855116, + "grad_norm": 2.0503699489464573, + "learning_rate": 5.23563128688261e-06, + "loss": 0.7564, + "step": 23140 + }, + { + "epoch": 1.6102107532864993, + "grad_norm": 2.9977016537758345, + "learning_rate": 5.231589035128405e-06, + "loss": 0.7275, + "step": 23150 + }, + { + "epoch": 1.610906308687487, + "grad_norm": 2.075438894361721, + "learning_rate": 5.227546631678323e-06, + "loss": 0.7614, + "step": 23160 + }, + { + "epoch": 1.6116018640884746, + "grad_norm": 2.289822568728102, + "learning_rate": 5.223504079180225e-06, + "loss": 0.7472, + "step": 23170 + }, + { + "epoch": 1.6122974194894624, + "grad_norm": 2.2779791990430165, + "learning_rate": 5.219461380282071e-06, + "loss": 0.7375, + "step": 23180 + }, + { + "epoch": 1.61299297489045, + "grad_norm": 1.8596672936234964, + "learning_rate": 5.215418537631921e-06, + "loss": 0.6972, + "step": 23190 + }, + { + "epoch": 1.6136885302914377, + "grad_norm": 1.5588623186975423, + "learning_rate": 5.2113755538779195e-06, + "loss": 0.7362, + "step": 23200 + }, + { + "epoch": 1.6143840856924254, + "grad_norm": 3.5022402458768056, + "learning_rate": 5.207332431668311e-06, + "loss": 0.7339, + "step": 23210 + }, + { + "epoch": 1.6150796410934132, + "grad_norm": 2.062349089405743, + "learning_rate": 5.203289173651432e-06, + "loss": 0.7433, + "step": 23220 + }, + { + "epoch": 1.615775196494401, + "grad_norm": 2.8715047985493642, + "learning_rate": 5.199245782475703e-06, + "loss": 0.8067, + "step": 23230 + }, + { + "epoch": 1.6164707518953885, + "grad_norm": 1.8824223336099006, + "learning_rate": 5.195202260789631e-06, + "loss": 0.7359, + "step": 23240 + }, + { + "epoch": 1.617166307296376, + "grad_norm": 2.6366063905137045, + "learning_rate": 5.191158611241815e-06, + "loss": 0.7521, + "step": 23250 + }, + { + "epoch": 1.6178618626973638, + "grad_norm": 2.396727013688804, + "learning_rate": 5.187114836480931e-06, + "loss": 0.7182, + "step": 23260 + }, + { + "epoch": 1.6185574180983515, + "grad_norm": 2.696503219976849, + "learning_rate": 5.183070939155741e-06, + "loss": 0.7294, + "step": 23270 + }, + { + "epoch": 1.6192529734993393, + "grad_norm": 2.7765004741366175, + "learning_rate": 5.1790269219150866e-06, + "loss": 0.6905, + "step": 23280 + }, + { + "epoch": 1.619948528900327, + "grad_norm": 2.1089325370128575, + "learning_rate": 5.174982787407886e-06, + "loss": 0.7898, + "step": 23290 + }, + { + "epoch": 1.6206440843013146, + "grad_norm": 1.944664624452356, + "learning_rate": 5.1709385382831374e-06, + "loss": 0.7408, + "step": 23300 + }, + { + "epoch": 1.6213396397023023, + "grad_norm": 1.5910886071450405, + "learning_rate": 5.1668941771899115e-06, + "loss": 0.741, + "step": 23310 + }, + { + "epoch": 1.6220351951032899, + "grad_norm": 1.8736044129565468, + "learning_rate": 5.162849706777352e-06, + "loss": 0.7475, + "step": 23320 + }, + { + "epoch": 1.6227307505042776, + "grad_norm": 2.6756016408139898, + "learning_rate": 5.158805129694677e-06, + "loss": 0.7979, + "step": 23330 + }, + { + "epoch": 1.6234263059052654, + "grad_norm": 2.314619797369536, + "learning_rate": 5.154760448591173e-06, + "loss": 0.7159, + "step": 23340 + }, + { + "epoch": 1.6241218613062531, + "grad_norm": 1.6960713036235795, + "learning_rate": 5.150715666116193e-06, + "loss": 0.7501, + "step": 23350 + }, + { + "epoch": 1.624817416707241, + "grad_norm": 2.1824718900220055, + "learning_rate": 5.146670784919159e-06, + "loss": 0.7279, + "step": 23360 + }, + { + "epoch": 1.6255129721082284, + "grad_norm": 2.609486058549625, + "learning_rate": 5.142625807649556e-06, + "loss": 0.7256, + "step": 23370 + }, + { + "epoch": 1.626208527509216, + "grad_norm": 2.451626356058015, + "learning_rate": 5.138580736956933e-06, + "loss": 0.6772, + "step": 23380 + }, + { + "epoch": 1.6269040829102037, + "grad_norm": 2.7696106931538997, + "learning_rate": 5.1345355754909e-06, + "loss": 0.7426, + "step": 23390 + }, + { + "epoch": 1.6275996383111915, + "grad_norm": 1.5744448405915064, + "learning_rate": 5.130490325901124e-06, + "loss": 0.7317, + "step": 23400 + }, + { + "epoch": 1.6282951937121792, + "grad_norm": 1.9168186996523964, + "learning_rate": 5.126444990837336e-06, + "loss": 0.7999, + "step": 23410 + }, + { + "epoch": 1.628990749113167, + "grad_norm": 1.8913084863631549, + "learning_rate": 5.122399572949315e-06, + "loss": 0.7688, + "step": 23420 + }, + { + "epoch": 1.6296863045141545, + "grad_norm": 1.9405437293782435, + "learning_rate": 5.118354074886898e-06, + "loss": 0.8008, + "step": 23430 + }, + { + "epoch": 1.6303818599151423, + "grad_norm": 2.054191997782326, + "learning_rate": 5.114308499299978e-06, + "loss": 0.7859, + "step": 23440 + }, + { + "epoch": 1.6310774153161298, + "grad_norm": 1.7770263145910008, + "learning_rate": 5.110262848838493e-06, + "loss": 0.7772, + "step": 23450 + }, + { + "epoch": 1.6317729707171176, + "grad_norm": 2.4936892007181, + "learning_rate": 5.106217126152432e-06, + "loss": 0.7253, + "step": 23460 + }, + { + "epoch": 1.6324685261181053, + "grad_norm": 2.2677402815213994, + "learning_rate": 5.1021713338918335e-06, + "loss": 0.767, + "step": 23470 + }, + { + "epoch": 1.633164081519093, + "grad_norm": 2.0759722386963824, + "learning_rate": 5.09812547470678e-06, + "loss": 0.7388, + "step": 23480 + }, + { + "epoch": 1.6338596369200808, + "grad_norm": 10.7411444896871, + "learning_rate": 5.094079551247394e-06, + "loss": 0.7852, + "step": 23490 + }, + { + "epoch": 1.6345551923210684, + "grad_norm": 2.2483929352924443, + "learning_rate": 5.090033566163848e-06, + "loss": 0.7436, + "step": 23500 + }, + { + "epoch": 1.6345551923210684, + "eval_loss": 0.9591112732887268, + "eval_runtime": 1322.8839, + "eval_samples_per_second": 13.728, + "eval_steps_per_second": 2.288, + "step": 23500 + }, + { + "epoch": 1.635250747722056, + "grad_norm": 2.46093885631651, + "learning_rate": 5.0859875221063504e-06, + "loss": 0.7197, + "step": 23510 + }, + { + "epoch": 1.6359463031230437, + "grad_norm": 1.815159376769892, + "learning_rate": 5.081941421725145e-06, + "loss": 0.6926, + "step": 23520 + }, + { + "epoch": 1.6366418585240314, + "grad_norm": 2.268972434943771, + "learning_rate": 5.077895267670518e-06, + "loss": 0.7636, + "step": 23530 + }, + { + "epoch": 1.6373374139250192, + "grad_norm": 1.937064612078116, + "learning_rate": 5.073849062592789e-06, + "loss": 0.7618, + "step": 23540 + }, + { + "epoch": 1.638032969326007, + "grad_norm": 2.5046474264285283, + "learning_rate": 5.069802809142312e-06, + "loss": 0.687, + "step": 23550 + }, + { + "epoch": 1.6387285247269945, + "grad_norm": 2.1527889293703795, + "learning_rate": 5.0657565099694685e-06, + "loss": 0.7393, + "step": 23560 + }, + { + "epoch": 1.6394240801279822, + "grad_norm": 3.5018716861727714, + "learning_rate": 5.061710167724675e-06, + "loss": 0.775, + "step": 23570 + }, + { + "epoch": 1.6401196355289698, + "grad_norm": 2.103332736057363, + "learning_rate": 5.057663785058372e-06, + "loss": 0.7605, + "step": 23580 + }, + { + "epoch": 1.6408151909299575, + "grad_norm": 2.094844584006805, + "learning_rate": 5.053617364621031e-06, + "loss": 0.7033, + "step": 23590 + }, + { + "epoch": 1.6415107463309453, + "grad_norm": 2.7465530559418654, + "learning_rate": 5.049570909063145e-06, + "loss": 0.7764, + "step": 23600 + }, + { + "epoch": 1.642206301731933, + "grad_norm": 2.2196493242168485, + "learning_rate": 5.0455244210352296e-06, + "loss": 0.7967, + "step": 23610 + }, + { + "epoch": 1.6429018571329206, + "grad_norm": 2.274111875558805, + "learning_rate": 5.041477903187824e-06, + "loss": 0.7337, + "step": 23620 + }, + { + "epoch": 1.6435974125339083, + "grad_norm": 1.5427136935304882, + "learning_rate": 5.037431358171486e-06, + "loss": 0.6875, + "step": 23630 + }, + { + "epoch": 1.6442929679348959, + "grad_norm": 2.184792391458079, + "learning_rate": 5.03338478863679e-06, + "loss": 0.7578, + "step": 23640 + }, + { + "epoch": 1.6449885233358836, + "grad_norm": 1.9210629623837066, + "learning_rate": 5.029338197234329e-06, + "loss": 0.7819, + "step": 23650 + }, + { + "epoch": 1.6456840787368714, + "grad_norm": 2.095199160642623, + "learning_rate": 5.025291586614707e-06, + "loss": 0.7627, + "step": 23660 + }, + { + "epoch": 1.6463796341378591, + "grad_norm": 2.2344044176827373, + "learning_rate": 5.021244959428544e-06, + "loss": 0.7231, + "step": 23670 + }, + { + "epoch": 1.647075189538847, + "grad_norm": 1.9010858012413907, + "learning_rate": 5.017198318326467e-06, + "loss": 0.7372, + "step": 23680 + }, + { + "epoch": 1.6477707449398344, + "grad_norm": 2.508802620717389, + "learning_rate": 5.013151665959116e-06, + "loss": 0.7563, + "step": 23690 + }, + { + "epoch": 1.6484663003408222, + "grad_norm": 3.0424683909778416, + "learning_rate": 5.009105004977137e-06, + "loss": 0.7315, + "step": 23700 + }, + { + "epoch": 1.6491618557418097, + "grad_norm": 4.339141549029034, + "learning_rate": 5.005058338031181e-06, + "loss": 0.7892, + "step": 23710 + }, + { + "epoch": 1.6498574111427975, + "grad_norm": 5.683462170925539, + "learning_rate": 5.001011667771902e-06, + "loss": 0.7459, + "step": 23720 + }, + { + "epoch": 1.6505529665437852, + "grad_norm": 3.139986966216504, + "learning_rate": 4.9969649968499606e-06, + "loss": 0.7837, + "step": 23730 + }, + { + "epoch": 1.651248521944773, + "grad_norm": 2.784654271174112, + "learning_rate": 4.992918327916011e-06, + "loss": 0.7342, + "step": 23740 + }, + { + "epoch": 1.6519440773457605, + "grad_norm": 3.7589369418714313, + "learning_rate": 4.988871663620711e-06, + "loss": 0.7673, + "step": 23750 + }, + { + "epoch": 1.6526396327467483, + "grad_norm": 1.9543233301071026, + "learning_rate": 4.984825006614715e-06, + "loss": 0.6658, + "step": 23760 + }, + { + "epoch": 1.6533351881477358, + "grad_norm": 2.171883275423217, + "learning_rate": 4.980778359548671e-06, + "loss": 0.8155, + "step": 23770 + }, + { + "epoch": 1.6540307435487236, + "grad_norm": 2.3653358871217365, + "learning_rate": 4.976731725073223e-06, + "loss": 0.7465, + "step": 23780 + }, + { + "epoch": 1.6547262989497113, + "grad_norm": 1.6620099279177407, + "learning_rate": 4.9726851058390026e-06, + "loss": 0.7411, + "step": 23790 + }, + { + "epoch": 1.655421854350699, + "grad_norm": 2.7764045208673203, + "learning_rate": 4.968638504496634e-06, + "loss": 0.7475, + "step": 23800 + }, + { + "epoch": 1.6561174097516869, + "grad_norm": 1.8200153824304126, + "learning_rate": 4.964591923696731e-06, + "loss": 0.8069, + "step": 23810 + }, + { + "epoch": 1.6568129651526744, + "grad_norm": 2.599865114346215, + "learning_rate": 4.9605453660898895e-06, + "loss": 0.7402, + "step": 23820 + }, + { + "epoch": 1.657508520553662, + "grad_norm": 2.643293749275079, + "learning_rate": 4.956498834326697e-06, + "loss": 0.7406, + "step": 23830 + }, + { + "epoch": 1.6582040759546497, + "grad_norm": 2.6149330643746427, + "learning_rate": 4.952452331057718e-06, + "loss": 0.7265, + "step": 23840 + }, + { + "epoch": 1.6588996313556374, + "grad_norm": 3.7921945415989926, + "learning_rate": 4.948405858933503e-06, + "loss": 0.72, + "step": 23850 + }, + { + "epoch": 1.6595951867566252, + "grad_norm": 2.0765133685804575, + "learning_rate": 4.944359420604576e-06, + "loss": 0.6994, + "step": 23860 + }, + { + "epoch": 1.660290742157613, + "grad_norm": 1.7489509798487088, + "learning_rate": 4.940313018721444e-06, + "loss": 0.7193, + "step": 23870 + }, + { + "epoch": 1.6609862975586005, + "grad_norm": 2.604664296438381, + "learning_rate": 4.936266655934588e-06, + "loss": 0.7301, + "step": 23880 + }, + { + "epoch": 1.6616818529595883, + "grad_norm": 1.8833973854250423, + "learning_rate": 4.932220334894466e-06, + "loss": 0.7654, + "step": 23890 + }, + { + "epoch": 1.6623774083605758, + "grad_norm": 2.1869558802963396, + "learning_rate": 4.9281740582515055e-06, + "loss": 0.7162, + "step": 23900 + }, + { + "epoch": 1.6630729637615635, + "grad_norm": 2.726867323647052, + "learning_rate": 4.9241278286561055e-06, + "loss": 0.7332, + "step": 23910 + }, + { + "epoch": 1.6637685191625513, + "grad_norm": 1.935592393722165, + "learning_rate": 4.9200816487586375e-06, + "loss": 0.7387, + "step": 23920 + }, + { + "epoch": 1.664464074563539, + "grad_norm": 3.157982412627815, + "learning_rate": 4.9160355212094344e-06, + "loss": 0.7298, + "step": 23930 + }, + { + "epoch": 1.6651596299645268, + "grad_norm": 2.203739622497441, + "learning_rate": 4.911989448658798e-06, + "loss": 0.7636, + "step": 23940 + }, + { + "epoch": 1.6658551853655144, + "grad_norm": 2.1234662415860632, + "learning_rate": 4.907943433756996e-06, + "loss": 0.7681, + "step": 23950 + }, + { + "epoch": 1.666550740766502, + "grad_norm": 2.595951009949283, + "learning_rate": 4.903897479154258e-06, + "loss": 0.7606, + "step": 23960 + }, + { + "epoch": 1.6672462961674896, + "grad_norm": 2.3214114350548636, + "learning_rate": 4.899851587500769e-06, + "loss": 0.7759, + "step": 23970 + }, + { + "epoch": 1.6679418515684774, + "grad_norm": 2.441878700282588, + "learning_rate": 4.895805761446679e-06, + "loss": 0.7173, + "step": 23980 + }, + { + "epoch": 1.6686374069694652, + "grad_norm": 2.8442694493911906, + "learning_rate": 4.891760003642094e-06, + "loss": 0.7551, + "step": 23990 + }, + { + "epoch": 1.669332962370453, + "grad_norm": 6.1223027048012035, + "learning_rate": 4.887714316737069e-06, + "loss": 0.788, + "step": 24000 + }, + { + "epoch": 1.669332962370453, + "eval_loss": 0.9537361264228821, + "eval_runtime": 1319.6165, + "eval_samples_per_second": 13.762, + "eval_steps_per_second": 2.294, + "step": 24000 + }, + { + "epoch": 1.6700285177714405, + "grad_norm": 2.465832172703665, + "learning_rate": 4.8836687033816205e-06, + "loss": 0.6901, + "step": 24010 + }, + { + "epoch": 1.6707240731724282, + "grad_norm": 1.7490493079008025, + "learning_rate": 4.8796231662257125e-06, + "loss": 0.779, + "step": 24020 + }, + { + "epoch": 1.6714196285734157, + "grad_norm": 2.9407573255645256, + "learning_rate": 4.875577707919261e-06, + "loss": 0.7514, + "step": 24030 + }, + { + "epoch": 1.6721151839744035, + "grad_norm": 1.857709221022987, + "learning_rate": 4.871532331112129e-06, + "loss": 0.7114, + "step": 24040 + }, + { + "epoch": 1.6728107393753913, + "grad_norm": 2.4473967560930894, + "learning_rate": 4.867487038454128e-06, + "loss": 0.7698, + "step": 24050 + }, + { + "epoch": 1.673506294776379, + "grad_norm": 3.557654193881929, + "learning_rate": 4.8634418325950136e-06, + "loss": 0.7381, + "step": 24060 + }, + { + "epoch": 1.6742018501773668, + "grad_norm": 2.0812864080696154, + "learning_rate": 4.859396716184479e-06, + "loss": 0.7198, + "step": 24070 + }, + { + "epoch": 1.6748974055783543, + "grad_norm": 2.2624533481756623, + "learning_rate": 4.855351691872169e-06, + "loss": 0.6965, + "step": 24080 + }, + { + "epoch": 1.6755929609793418, + "grad_norm": 2.174756763511052, + "learning_rate": 4.85130676230766e-06, + "loss": 0.7511, + "step": 24090 + }, + { + "epoch": 1.6762885163803296, + "grad_norm": 2.2250489071742647, + "learning_rate": 4.847261930140472e-06, + "loss": 0.7509, + "step": 24100 + }, + { + "epoch": 1.6769840717813174, + "grad_norm": 1.5755417677992507, + "learning_rate": 4.843217198020056e-06, + "loss": 0.744, + "step": 24110 + }, + { + "epoch": 1.6776796271823051, + "grad_norm": 1.9981126277776027, + "learning_rate": 4.8391725685958e-06, + "loss": 0.7322, + "step": 24120 + }, + { + "epoch": 1.6783751825832929, + "grad_norm": 2.286332559645664, + "learning_rate": 4.8351280445170265e-06, + "loss": 0.7675, + "step": 24130 + }, + { + "epoch": 1.6790707379842804, + "grad_norm": 1.8999543033200483, + "learning_rate": 4.831083628432988e-06, + "loss": 0.738, + "step": 24140 + }, + { + "epoch": 1.6797662933852682, + "grad_norm": 2.510577373484926, + "learning_rate": 4.827039322992861e-06, + "loss": 0.7889, + "step": 24150 + }, + { + "epoch": 1.6804618487862557, + "grad_norm": 3.089093234995672, + "learning_rate": 4.8229951308457575e-06, + "loss": 0.7382, + "step": 24160 + }, + { + "epoch": 1.6811574041872435, + "grad_norm": 2.0500757980333733, + "learning_rate": 4.818951054640709e-06, + "loss": 0.7111, + "step": 24170 + }, + { + "epoch": 1.6818529595882312, + "grad_norm": 3.189278246741981, + "learning_rate": 4.814907097026677e-06, + "loss": 0.7259, + "step": 24180 + }, + { + "epoch": 1.682548514989219, + "grad_norm": 1.8211373841456922, + "learning_rate": 4.81086326065254e-06, + "loss": 0.7434, + "step": 24190 + }, + { + "epoch": 1.6832440703902067, + "grad_norm": 1.9528604420913074, + "learning_rate": 4.806819548167101e-06, + "loss": 0.7732, + "step": 24200 + }, + { + "epoch": 1.6839396257911943, + "grad_norm": 2.000709107849356, + "learning_rate": 4.802775962219078e-06, + "loss": 0.7874, + "step": 24210 + }, + { + "epoch": 1.6846351811921818, + "grad_norm": 1.972511810436105, + "learning_rate": 4.798732505457108e-06, + "loss": 0.6854, + "step": 24220 + }, + { + "epoch": 1.6853307365931696, + "grad_norm": 1.879430232336285, + "learning_rate": 4.794689180529744e-06, + "loss": 0.7377, + "step": 24230 + }, + { + "epoch": 1.6860262919941573, + "grad_norm": 2.1302840945624206, + "learning_rate": 4.7906459900854534e-06, + "loss": 0.7399, + "step": 24240 + }, + { + "epoch": 1.686721847395145, + "grad_norm": 2.0324976219372344, + "learning_rate": 4.786602936772613e-06, + "loss": 0.8016, + "step": 24250 + }, + { + "epoch": 1.6874174027961328, + "grad_norm": 1.6450442680122774, + "learning_rate": 4.782560023239512e-06, + "loss": 0.7448, + "step": 24260 + }, + { + "epoch": 1.6881129581971204, + "grad_norm": 1.8968982980273728, + "learning_rate": 4.778517252134346e-06, + "loss": 0.6605, + "step": 24270 + }, + { + "epoch": 1.6888085135981081, + "grad_norm": 1.8184000202760777, + "learning_rate": 4.774474626105222e-06, + "loss": 0.7184, + "step": 24280 + }, + { + "epoch": 1.6895040689990957, + "grad_norm": 2.120210231057554, + "learning_rate": 4.770432147800141e-06, + "loss": 0.7548, + "step": 24290 + }, + { + "epoch": 1.6901996244000834, + "grad_norm": 1.8295771151400517, + "learning_rate": 4.7663898198670215e-06, + "loss": 0.7204, + "step": 24300 + }, + { + "epoch": 1.6908951798010712, + "grad_norm": 2.588191379519147, + "learning_rate": 4.762347644953674e-06, + "loss": 0.7198, + "step": 24310 + }, + { + "epoch": 1.691590735202059, + "grad_norm": 2.6421259508432673, + "learning_rate": 4.758305625707811e-06, + "loss": 0.7725, + "step": 24320 + }, + { + "epoch": 1.6922862906030467, + "grad_norm": 2.0371180747086646, + "learning_rate": 4.754263764777042e-06, + "loss": 0.7846, + "step": 24330 + }, + { + "epoch": 1.6929818460040342, + "grad_norm": 2.197320372207382, + "learning_rate": 4.750222064808878e-06, + "loss": 0.7454, + "step": 24340 + }, + { + "epoch": 1.6936774014050218, + "grad_norm": 2.2523228481071507, + "learning_rate": 4.74618052845072e-06, + "loss": 0.747, + "step": 24350 + }, + { + "epoch": 1.6943729568060095, + "grad_norm": 2.0754283457277483, + "learning_rate": 4.742139158349861e-06, + "loss": 0.7529, + "step": 24360 + }, + { + "epoch": 1.6950685122069973, + "grad_norm": 2.256320338355575, + "learning_rate": 4.738097957153486e-06, + "loss": 0.7789, + "step": 24370 + }, + { + "epoch": 1.695764067607985, + "grad_norm": 1.7430283141297291, + "learning_rate": 4.734056927508672e-06, + "loss": 0.7182, + "step": 24380 + }, + { + "epoch": 1.6964596230089728, + "grad_norm": 1.9103932598808484, + "learning_rate": 4.730016072062381e-06, + "loss": 0.7785, + "step": 24390 + }, + { + "epoch": 1.6971551784099603, + "grad_norm": 2.7410085186678264, + "learning_rate": 4.725975393461461e-06, + "loss": 0.7536, + "step": 24400 + }, + { + "epoch": 1.697850733810948, + "grad_norm": 9.944590792796285, + "learning_rate": 4.721934894352646e-06, + "loss": 0.7405, + "step": 24410 + }, + { + "epoch": 1.6985462892119356, + "grad_norm": 1.8011369442744438, + "learning_rate": 4.717894577382552e-06, + "loss": 0.7342, + "step": 24420 + }, + { + "epoch": 1.6992418446129234, + "grad_norm": 1.8359252627415599, + "learning_rate": 4.713854445197672e-06, + "loss": 0.7091, + "step": 24430 + }, + { + "epoch": 1.6999374000139111, + "grad_norm": 1.5968418121300563, + "learning_rate": 4.709814500444382e-06, + "loss": 0.7074, + "step": 24440 + }, + { + "epoch": 1.700632955414899, + "grad_norm": 1.689456448807815, + "learning_rate": 4.705774745768935e-06, + "loss": 0.7713, + "step": 24450 + }, + { + "epoch": 1.7013285108158864, + "grad_norm": 2.4264608990534193, + "learning_rate": 4.701735183817457e-06, + "loss": 0.7533, + "step": 24460 + }, + { + "epoch": 1.7020240662168742, + "grad_norm": 8.40439738065537, + "learning_rate": 4.6976958172359515e-06, + "loss": 0.7389, + "step": 24470 + }, + { + "epoch": 1.7027196216178617, + "grad_norm": 1.9754886649555004, + "learning_rate": 4.69365664867029e-06, + "loss": 0.7473, + "step": 24480 + }, + { + "epoch": 1.7034151770188495, + "grad_norm": 2.6016081016374866, + "learning_rate": 4.689617680766219e-06, + "loss": 0.8454, + "step": 24490 + }, + { + "epoch": 1.7041107324198372, + "grad_norm": 2.0049345369015286, + "learning_rate": 4.685578916169346e-06, + "loss": 0.7324, + "step": 24500 + }, + { + "epoch": 1.7041107324198372, + "eval_loss": 0.9538396000862122, + "eval_runtime": 1321.6865, + "eval_samples_per_second": 13.741, + "eval_steps_per_second": 2.29, + "step": 24500 + }, + { + "epoch": 1.704806287820825, + "grad_norm": 1.5086502115310165, + "learning_rate": 4.681540357525154e-06, + "loss": 0.7213, + "step": 24510 + }, + { + "epoch": 1.7055018432218128, + "grad_norm": 2.5806073096630007, + "learning_rate": 4.677502007478984e-06, + "loss": 0.6887, + "step": 24520 + }, + { + "epoch": 1.7061973986228003, + "grad_norm": 2.5436330429651237, + "learning_rate": 4.673463868676047e-06, + "loss": 0.7166, + "step": 24530 + }, + { + "epoch": 1.706892954023788, + "grad_norm": 1.9625675567767393, + "learning_rate": 4.66942594376141e-06, + "loss": 0.7182, + "step": 24540 + }, + { + "epoch": 1.7075885094247756, + "grad_norm": 2.1136355371524345, + "learning_rate": 4.665388235380003e-06, + "loss": 0.7903, + "step": 24550 + }, + { + "epoch": 1.7082840648257633, + "grad_norm": 2.2406739525020494, + "learning_rate": 4.661350746176613e-06, + "loss": 0.816, + "step": 24560 + }, + { + "epoch": 1.708979620226751, + "grad_norm": 2.7644738133970486, + "learning_rate": 4.657313478795883e-06, + "loss": 0.7257, + "step": 24570 + }, + { + "epoch": 1.7096751756277389, + "grad_norm": 1.7646342924680496, + "learning_rate": 4.65327643588231e-06, + "loss": 0.7884, + "step": 24580 + }, + { + "epoch": 1.7103707310287264, + "grad_norm": 5.281329611660646, + "learning_rate": 4.649239620080248e-06, + "loss": 0.746, + "step": 24590 + }, + { + "epoch": 1.7110662864297141, + "grad_norm": 2.312385722350251, + "learning_rate": 4.6452030340339e-06, + "loss": 0.677, + "step": 24600 + }, + { + "epoch": 1.7117618418307017, + "grad_norm": 4.895234668933546, + "learning_rate": 4.641166680387314e-06, + "loss": 0.7384, + "step": 24610 + }, + { + "epoch": 1.7124573972316894, + "grad_norm": 1.8247929603778201, + "learning_rate": 4.637130561784393e-06, + "loss": 0.7396, + "step": 24620 + }, + { + "epoch": 1.7131529526326772, + "grad_norm": 1.9146526610239676, + "learning_rate": 4.633094680868886e-06, + "loss": 0.7784, + "step": 24630 + }, + { + "epoch": 1.713848508033665, + "grad_norm": 2.2262027954707837, + "learning_rate": 4.629059040284375e-06, + "loss": 0.7095, + "step": 24640 + }, + { + "epoch": 1.7145440634346527, + "grad_norm": 2.293076518768032, + "learning_rate": 4.6250236426743e-06, + "loss": 0.7779, + "step": 24650 + }, + { + "epoch": 1.7152396188356402, + "grad_norm": 3.3241577227034163, + "learning_rate": 4.62098849068193e-06, + "loss": 0.7854, + "step": 24660 + }, + { + "epoch": 1.715935174236628, + "grad_norm": 2.5169997719353803, + "learning_rate": 4.61695358695038e-06, + "loss": 0.7327, + "step": 24670 + }, + { + "epoch": 1.7166307296376155, + "grad_norm": 1.8769945861559756, + "learning_rate": 4.6129189341226e-06, + "loss": 0.6947, + "step": 24680 + }, + { + "epoch": 1.7173262850386033, + "grad_norm": 2.3457595791454593, + "learning_rate": 4.608884534841375e-06, + "loss": 0.7449, + "step": 24690 + }, + { + "epoch": 1.718021840439591, + "grad_norm": 1.9251372338614758, + "learning_rate": 4.6048503917493284e-06, + "loss": 0.7942, + "step": 24700 + }, + { + "epoch": 1.7187173958405788, + "grad_norm": 5.56029782963525, + "learning_rate": 4.600816507488905e-06, + "loss": 0.7317, + "step": 24710 + }, + { + "epoch": 1.7194129512415663, + "grad_norm": 2.030058559720567, + "learning_rate": 4.596782884702391e-06, + "loss": 0.7485, + "step": 24720 + }, + { + "epoch": 1.720108506642554, + "grad_norm": 1.887285667368009, + "learning_rate": 4.592749526031898e-06, + "loss": 0.7583, + "step": 24730 + }, + { + "epoch": 1.7208040620435416, + "grad_norm": 3.05738989371679, + "learning_rate": 4.588716434119364e-06, + "loss": 0.7501, + "step": 24740 + }, + { + "epoch": 1.7214996174445294, + "grad_norm": 2.0395093690148665, + "learning_rate": 4.58468361160655e-06, + "loss": 0.7598, + "step": 24750 + }, + { + "epoch": 1.7221951728455172, + "grad_norm": 1.6599869513783867, + "learning_rate": 4.580651061135046e-06, + "loss": 0.7368, + "step": 24760 + }, + { + "epoch": 1.722890728246505, + "grad_norm": 6.8192215862980445, + "learning_rate": 4.5766187853462605e-06, + "loss": 0.7142, + "step": 24770 + }, + { + "epoch": 1.7235862836474927, + "grad_norm": 2.0855640527736377, + "learning_rate": 4.572586786881419e-06, + "loss": 0.7463, + "step": 24780 + }, + { + "epoch": 1.7242818390484802, + "grad_norm": 2.85745834648721, + "learning_rate": 4.568555068381571e-06, + "loss": 0.7781, + "step": 24790 + }, + { + "epoch": 1.7249773944494677, + "grad_norm": 2.789406042830605, + "learning_rate": 4.56452363248758e-06, + "loss": 0.7214, + "step": 24800 + }, + { + "epoch": 1.7256729498504555, + "grad_norm": 1.7980016133716965, + "learning_rate": 4.560492481840124e-06, + "loss": 0.7886, + "step": 24810 + }, + { + "epoch": 1.7263685052514433, + "grad_norm": 1.969338253598016, + "learning_rate": 4.556461619079695e-06, + "loss": 0.7348, + "step": 24820 + }, + { + "epoch": 1.727064060652431, + "grad_norm": 2.821450464717551, + "learning_rate": 4.552431046846597e-06, + "loss": 0.7086, + "step": 24830 + }, + { + "epoch": 1.7277596160534188, + "grad_norm": 4.788250180845083, + "learning_rate": 4.548400767780942e-06, + "loss": 0.767, + "step": 24840 + }, + { + "epoch": 1.7284551714544063, + "grad_norm": 3.363833558579405, + "learning_rate": 4.5443707845226515e-06, + "loss": 0.8071, + "step": 24850 + }, + { + "epoch": 1.729150726855394, + "grad_norm": 2.8585697778204926, + "learning_rate": 4.5403410997114514e-06, + "loss": 0.7528, + "step": 24860 + }, + { + "epoch": 1.7298462822563816, + "grad_norm": 2.3167922860613825, + "learning_rate": 4.536311715986873e-06, + "loss": 0.8258, + "step": 24870 + }, + { + "epoch": 1.7305418376573694, + "grad_norm": 1.95788071659064, + "learning_rate": 4.532282635988253e-06, + "loss": 0.712, + "step": 24880 + }, + { + "epoch": 1.731237393058357, + "grad_norm": 2.0881828261145823, + "learning_rate": 4.528253862354726e-06, + "loss": 0.7143, + "step": 24890 + }, + { + "epoch": 1.7319329484593449, + "grad_norm": 1.874803164281738, + "learning_rate": 4.524225397725225e-06, + "loss": 0.7243, + "step": 24900 + }, + { + "epoch": 1.7326285038603326, + "grad_norm": 2.5770647582847976, + "learning_rate": 4.520197244738483e-06, + "loss": 0.7813, + "step": 24910 + }, + { + "epoch": 1.7333240592613202, + "grad_norm": 1.6612799995038474, + "learning_rate": 4.516169406033033e-06, + "loss": 0.7531, + "step": 24920 + }, + { + "epoch": 1.7340196146623077, + "grad_norm": 3.000809881730268, + "learning_rate": 4.512141884247189e-06, + "loss": 0.7442, + "step": 24930 + }, + { + "epoch": 1.7347151700632955, + "grad_norm": 3.5238870976281853, + "learning_rate": 4.508114682019071e-06, + "loss": 0.743, + "step": 24940 + }, + { + "epoch": 1.7354107254642832, + "grad_norm": 2.6860121328856748, + "learning_rate": 4.504087801986583e-06, + "loss": 0.7447, + "step": 24950 + }, + { + "epoch": 1.736106280865271, + "grad_norm": 2.0260037477695425, + "learning_rate": 4.50006124678742e-06, + "loss": 0.7222, + "step": 24960 + }, + { + "epoch": 1.7368018362662587, + "grad_norm": 2.058700924132277, + "learning_rate": 4.496035019059063e-06, + "loss": 0.7656, + "step": 24970 + }, + { + "epoch": 1.7374973916672463, + "grad_norm": 2.8021103903413116, + "learning_rate": 4.492009121438781e-06, + "loss": 0.7791, + "step": 24980 + }, + { + "epoch": 1.738192947068234, + "grad_norm": 2.321160125498219, + "learning_rate": 4.487983556563624e-06, + "loss": 0.7404, + "step": 24990 + }, + { + "epoch": 1.7388885024692216, + "grad_norm": 2.264344809600643, + "learning_rate": 4.483958327070423e-06, + "loss": 0.7653, + "step": 25000 + }, + { + "epoch": 1.7388885024692216, + "eval_loss": 0.9515901803970337, + "eval_runtime": 1320.8212, + "eval_samples_per_second": 13.75, + "eval_steps_per_second": 2.292, + "step": 25000 + }, + { + "epoch": 1.7395840578702093, + "grad_norm": 3.743751202191862, + "learning_rate": 4.479933435595792e-06, + "loss": 0.7366, + "step": 25010 + }, + { + "epoch": 1.740279613271197, + "grad_norm": 4.493254390410092, + "learning_rate": 4.475908884776125e-06, + "loss": 0.7789, + "step": 25020 + }, + { + "epoch": 1.7409751686721848, + "grad_norm": 2.0472891204729815, + "learning_rate": 4.471884677247589e-06, + "loss": 0.7256, + "step": 25030 + }, + { + "epoch": 1.7416707240731726, + "grad_norm": 2.118940213483039, + "learning_rate": 4.467860815646127e-06, + "loss": 0.6932, + "step": 25040 + }, + { + "epoch": 1.7423662794741601, + "grad_norm": 3.084983934835264, + "learning_rate": 4.463837302607458e-06, + "loss": 0.7361, + "step": 25050 + }, + { + "epoch": 1.7430618348751477, + "grad_norm": 2.482124960740585, + "learning_rate": 4.4598141407670714e-06, + "loss": 0.7456, + "step": 25060 + }, + { + "epoch": 1.7437573902761354, + "grad_norm": 2.011872857359219, + "learning_rate": 4.455791332760224e-06, + "loss": 0.6918, + "step": 25070 + }, + { + "epoch": 1.7444529456771232, + "grad_norm": 2.378616582001213, + "learning_rate": 4.451768881221944e-06, + "loss": 0.7386, + "step": 25080 + }, + { + "epoch": 1.745148501078111, + "grad_norm": 2.198573127424139, + "learning_rate": 4.447746788787025e-06, + "loss": 0.7833, + "step": 25090 + }, + { + "epoch": 1.7458440564790987, + "grad_norm": 1.6600191991701976, + "learning_rate": 4.443725058090025e-06, + "loss": 0.7169, + "step": 25100 + }, + { + "epoch": 1.7465396118800862, + "grad_norm": 3.3494317722559845, + "learning_rate": 4.4397036917652654e-06, + "loss": 0.7826, + "step": 25110 + }, + { + "epoch": 1.747235167281074, + "grad_norm": 2.1229314525461738, + "learning_rate": 4.435682692446831e-06, + "loss": 0.7927, + "step": 25120 + }, + { + "epoch": 1.7479307226820615, + "grad_norm": 1.9544301306062029, + "learning_rate": 4.431662062768564e-06, + "loss": 0.7044, + "step": 25130 + }, + { + "epoch": 1.7486262780830493, + "grad_norm": 2.2802552373774505, + "learning_rate": 4.427641805364061e-06, + "loss": 0.7497, + "step": 25140 + }, + { + "epoch": 1.749321833484037, + "grad_norm": 2.0942394173964276, + "learning_rate": 4.423621922866682e-06, + "loss": 0.7215, + "step": 25150 + }, + { + "epoch": 1.7500173888850248, + "grad_norm": 3.763353730530524, + "learning_rate": 4.4196024179095344e-06, + "loss": 0.7508, + "step": 25160 + }, + { + "epoch": 1.7507129442860125, + "grad_norm": 1.9097212051150494, + "learning_rate": 4.415583293125486e-06, + "loss": 0.7604, + "step": 25170 + }, + { + "epoch": 1.751408499687, + "grad_norm": 2.1438947046711085, + "learning_rate": 4.41156455114715e-06, + "loss": 0.7434, + "step": 25180 + }, + { + "epoch": 1.7521040550879876, + "grad_norm": 2.2406985510649338, + "learning_rate": 4.407546194606888e-06, + "loss": 0.7371, + "step": 25190 + }, + { + "epoch": 1.7527996104889754, + "grad_norm": 1.923694670916089, + "learning_rate": 4.403528226136816e-06, + "loss": 0.6794, + "step": 25200 + }, + { + "epoch": 1.7534951658899631, + "grad_norm": 2.401804869093556, + "learning_rate": 4.399510648368786e-06, + "loss": 0.7405, + "step": 25210 + }, + { + "epoch": 1.7541907212909509, + "grad_norm": 2.0719105233354456, + "learning_rate": 4.3954934639344e-06, + "loss": 0.7222, + "step": 25220 + }, + { + "epoch": 1.7548862766919386, + "grad_norm": 1.9730246743981947, + "learning_rate": 4.3914766754650046e-06, + "loss": 0.7586, + "step": 25230 + }, + { + "epoch": 1.7555818320929262, + "grad_norm": 2.0866457052275926, + "learning_rate": 4.387460285591682e-06, + "loss": 0.7591, + "step": 25240 + }, + { + "epoch": 1.756277387493914, + "grad_norm": 1.7477507693267402, + "learning_rate": 4.383444296945256e-06, + "loss": 0.7568, + "step": 25250 + }, + { + "epoch": 1.7569729428949015, + "grad_norm": 3.2335927168431366, + "learning_rate": 4.379428712156284e-06, + "loss": 0.7175, + "step": 25260 + }, + { + "epoch": 1.7576684982958892, + "grad_norm": 2.069988871668673, + "learning_rate": 4.375413533855069e-06, + "loss": 0.738, + "step": 25270 + }, + { + "epoch": 1.758364053696877, + "grad_norm": 1.852737889106253, + "learning_rate": 4.371398764671633e-06, + "loss": 0.754, + "step": 25280 + }, + { + "epoch": 1.7590596090978647, + "grad_norm": 2.2644815705539902, + "learning_rate": 4.36738440723574e-06, + "loss": 0.7388, + "step": 25290 + }, + { + "epoch": 1.7597551644988525, + "grad_norm": 1.8545200072035912, + "learning_rate": 4.3633704641768825e-06, + "loss": 0.7432, + "step": 25300 + }, + { + "epoch": 1.76045071989984, + "grad_norm": 4.2221136049196915, + "learning_rate": 4.3593569381242805e-06, + "loss": 0.7645, + "step": 25310 + }, + { + "epoch": 1.7611462753008276, + "grad_norm": 5.250787892286246, + "learning_rate": 4.355343831706882e-06, + "loss": 0.7638, + "step": 25320 + }, + { + "epoch": 1.7618418307018153, + "grad_norm": 2.1208558162213063, + "learning_rate": 4.351331147553357e-06, + "loss": 0.7497, + "step": 25330 + }, + { + "epoch": 1.762537386102803, + "grad_norm": 4.951059446755641, + "learning_rate": 4.347318888292107e-06, + "loss": 0.794, + "step": 25340 + }, + { + "epoch": 1.7632329415037908, + "grad_norm": 1.8297160296425896, + "learning_rate": 4.343307056551241e-06, + "loss": 0.7385, + "step": 25350 + }, + { + "epoch": 1.7639284969047786, + "grad_norm": 1.9981863620242295, + "learning_rate": 4.339295654958603e-06, + "loss": 0.738, + "step": 25360 + }, + { + "epoch": 1.7646240523057661, + "grad_norm": 2.527096114867278, + "learning_rate": 4.335284686141746e-06, + "loss": 0.7417, + "step": 25370 + }, + { + "epoch": 1.765319607706754, + "grad_norm": 1.849564995986072, + "learning_rate": 4.331274152727944e-06, + "loss": 0.7042, + "step": 25380 + }, + { + "epoch": 1.7660151631077414, + "grad_norm": 1.7264177485715582, + "learning_rate": 4.327264057344181e-06, + "loss": 0.74, + "step": 25390 + }, + { + "epoch": 1.7667107185087292, + "grad_norm": 2.025465738566715, + "learning_rate": 4.323254402617159e-06, + "loss": 0.7882, + "step": 25400 + }, + { + "epoch": 1.767406273909717, + "grad_norm": 2.1825830950749285, + "learning_rate": 4.319245191173292e-06, + "loss": 0.7398, + "step": 25410 + }, + { + "epoch": 1.7681018293107047, + "grad_norm": 1.724005131342312, + "learning_rate": 4.315236425638697e-06, + "loss": 0.7542, + "step": 25420 + }, + { + "epoch": 1.7687973847116922, + "grad_norm": 2.117564907832831, + "learning_rate": 4.311228108639205e-06, + "loss": 0.754, + "step": 25430 + }, + { + "epoch": 1.76949294011268, + "grad_norm": 2.937770089025937, + "learning_rate": 4.30722024280035e-06, + "loss": 0.7176, + "step": 25440 + }, + { + "epoch": 1.7701884955136675, + "grad_norm": 2.0861975068172223, + "learning_rate": 4.303212830747371e-06, + "loss": 0.7139, + "step": 25450 + }, + { + "epoch": 1.7708840509146553, + "grad_norm": 1.6756753887900464, + "learning_rate": 4.299205875105214e-06, + "loss": 0.7425, + "step": 25460 + }, + { + "epoch": 1.771579606315643, + "grad_norm": 2.278098342629176, + "learning_rate": 4.295199378498519e-06, + "loss": 0.7547, + "step": 25470 + }, + { + "epoch": 1.7722751617166308, + "grad_norm": 1.9942980680267435, + "learning_rate": 4.291193343551632e-06, + "loss": 0.7624, + "step": 25480 + }, + { + "epoch": 1.7729707171176186, + "grad_norm": 7.320261978354622, + "learning_rate": 4.28718777288859e-06, + "loss": 0.7507, + "step": 25490 + }, + { + "epoch": 1.773666272518606, + "grad_norm": 1.9620153107873546, + "learning_rate": 4.283182669133132e-06, + "loss": 0.7596, + "step": 25500 + }, + { + "epoch": 1.773666272518606, + "eval_loss": 0.9496564269065857, + "eval_runtime": 1324.2603, + "eval_samples_per_second": 13.714, + "eval_steps_per_second": 2.286, + "step": 25500 + }, + { + "epoch": 1.7743618279195938, + "grad_norm": 1.9207293642616299, + "learning_rate": 4.279178034908685e-06, + "loss": 0.7122, + "step": 25510 + }, + { + "epoch": 1.7750573833205814, + "grad_norm": 3.2159229822513558, + "learning_rate": 4.2751738728383765e-06, + "loss": 0.7512, + "step": 25520 + }, + { + "epoch": 1.7757529387215691, + "grad_norm": 2.1685279425255994, + "learning_rate": 4.271170185545017e-06, + "loss": 0.7391, + "step": 25530 + }, + { + "epoch": 1.776448494122557, + "grad_norm": 1.9839863259532775, + "learning_rate": 4.2671669756511106e-06, + "loss": 0.7102, + "step": 25540 + }, + { + "epoch": 1.7771440495235447, + "grad_norm": 2.3025351376962893, + "learning_rate": 4.263164245778845e-06, + "loss": 0.733, + "step": 25550 + }, + { + "epoch": 1.7778396049245322, + "grad_norm": 2.031548379669346, + "learning_rate": 4.259161998550101e-06, + "loss": 0.7298, + "step": 25560 + }, + { + "epoch": 1.77853516032552, + "grad_norm": 1.9709037965800555, + "learning_rate": 4.255160236586432e-06, + "loss": 0.7741, + "step": 25570 + }, + { + "epoch": 1.7792307157265075, + "grad_norm": 1.7684946701352267, + "learning_rate": 4.251158962509081e-06, + "loss": 0.6939, + "step": 25580 + }, + { + "epoch": 1.7799262711274952, + "grad_norm": 1.8752644978420498, + "learning_rate": 4.247158178938973e-06, + "loss": 0.801, + "step": 25590 + }, + { + "epoch": 1.780621826528483, + "grad_norm": 1.7141387427222865, + "learning_rate": 4.243157888496705e-06, + "loss": 0.7237, + "step": 25600 + }, + { + "epoch": 1.7813173819294708, + "grad_norm": 3.124195642118829, + "learning_rate": 4.2391580938025564e-06, + "loss": 0.6938, + "step": 25610 + }, + { + "epoch": 1.7820129373304585, + "grad_norm": 2.1751075982266146, + "learning_rate": 4.235158797476479e-06, + "loss": 0.7355, + "step": 25620 + }, + { + "epoch": 1.782708492731446, + "grad_norm": 2.4252106228280916, + "learning_rate": 4.231160002138104e-06, + "loss": 0.6974, + "step": 25630 + }, + { + "epoch": 1.7834040481324336, + "grad_norm": 2.2386045331290947, + "learning_rate": 4.227161710406722e-06, + "loss": 0.737, + "step": 25640 + }, + { + "epoch": 1.7840996035334213, + "grad_norm": 1.8832640024698586, + "learning_rate": 4.223163924901306e-06, + "loss": 0.7844, + "step": 25650 + }, + { + "epoch": 1.784795158934409, + "grad_norm": 3.018546998201008, + "learning_rate": 4.219166648240493e-06, + "loss": 0.7068, + "step": 25660 + }, + { + "epoch": 1.7854907143353969, + "grad_norm": 2.6290147403470665, + "learning_rate": 4.2151698830425854e-06, + "loss": 0.7699, + "step": 25670 + }, + { + "epoch": 1.7861862697363846, + "grad_norm": 1.8288587791462116, + "learning_rate": 4.2111736319255505e-06, + "loss": 0.7543, + "step": 25680 + }, + { + "epoch": 1.7868818251373721, + "grad_norm": 2.2695648255717087, + "learning_rate": 4.207177897507023e-06, + "loss": 0.7582, + "step": 25690 + }, + { + "epoch": 1.78757738053836, + "grad_norm": 1.9588385894678373, + "learning_rate": 4.203182682404297e-06, + "loss": 0.7371, + "step": 25700 + }, + { + "epoch": 1.7882729359393474, + "grad_norm": 1.910969503056137, + "learning_rate": 4.1991879892343206e-06, + "loss": 0.7432, + "step": 25710 + }, + { + "epoch": 1.7889684913403352, + "grad_norm": 2.462624196948302, + "learning_rate": 4.1951938206137085e-06, + "loss": 0.7741, + "step": 25720 + }, + { + "epoch": 1.789664046741323, + "grad_norm": 2.159182639645443, + "learning_rate": 4.1912001791587294e-06, + "loss": 0.7329, + "step": 25730 + }, + { + "epoch": 1.7903596021423107, + "grad_norm": 2.671766240205284, + "learning_rate": 4.187207067485301e-06, + "loss": 0.7447, + "step": 25740 + }, + { + "epoch": 1.7910551575432985, + "grad_norm": 4.377285580602421, + "learning_rate": 4.1832144882090054e-06, + "loss": 0.6895, + "step": 25750 + }, + { + "epoch": 1.791750712944286, + "grad_norm": 2.1012631894196545, + "learning_rate": 4.179222443945064e-06, + "loss": 0.7147, + "step": 25760 + }, + { + "epoch": 1.7924462683452735, + "grad_norm": 1.6498404180962745, + "learning_rate": 4.1752309373083574e-06, + "loss": 0.7219, + "step": 25770 + }, + { + "epoch": 1.7931418237462613, + "grad_norm": 1.8981196102504714, + "learning_rate": 4.171239970913405e-06, + "loss": 0.6932, + "step": 25780 + }, + { + "epoch": 1.793837379147249, + "grad_norm": 1.97024831170369, + "learning_rate": 4.167249547374379e-06, + "loss": 0.7509, + "step": 25790 + }, + { + "epoch": 1.7945329345482368, + "grad_norm": 1.7681326186309148, + "learning_rate": 4.163259669305094e-06, + "loss": 0.7097, + "step": 25800 + }, + { + "epoch": 1.7952284899492246, + "grad_norm": 3.2906702237961842, + "learning_rate": 4.159270339319007e-06, + "loss": 0.735, + "step": 25810 + }, + { + "epoch": 1.795924045350212, + "grad_norm": 1.8842699051928586, + "learning_rate": 4.155281560029218e-06, + "loss": 0.7048, + "step": 25820 + }, + { + "epoch": 1.7966196007511999, + "grad_norm": 2.1325892018307084, + "learning_rate": 4.151293334048463e-06, + "loss": 0.7516, + "step": 25830 + }, + { + "epoch": 1.7973151561521874, + "grad_norm": 1.5694539027436105, + "learning_rate": 4.1473056639891185e-06, + "loss": 0.7045, + "step": 25840 + }, + { + "epoch": 1.7980107115531752, + "grad_norm": 1.6833247971092635, + "learning_rate": 4.143318552463193e-06, + "loss": 0.702, + "step": 25850 + }, + { + "epoch": 1.798706266954163, + "grad_norm": 2.43207677082238, + "learning_rate": 4.139332002082333e-06, + "loss": 0.7711, + "step": 25860 + }, + { + "epoch": 1.7994018223551507, + "grad_norm": 2.3410967894574646, + "learning_rate": 4.135346015457816e-06, + "loss": 0.7518, + "step": 25870 + }, + { + "epoch": 1.8000973777561384, + "grad_norm": 2.103574050820867, + "learning_rate": 4.131360595200552e-06, + "loss": 0.7702, + "step": 25880 + }, + { + "epoch": 1.800792933157126, + "grad_norm": 2.8040251813606782, + "learning_rate": 4.127375743921076e-06, + "loss": 0.7579, + "step": 25890 + }, + { + "epoch": 1.8014884885581135, + "grad_norm": 1.775948641884433, + "learning_rate": 4.1233914642295524e-06, + "loss": 0.7355, + "step": 25900 + }, + { + "epoch": 1.8021840439591013, + "grad_norm": 3.8194909622037443, + "learning_rate": 4.119407758735776e-06, + "loss": 0.7244, + "step": 25910 + }, + { + "epoch": 1.802879599360089, + "grad_norm": 2.7331907385457845, + "learning_rate": 4.115424630049156e-06, + "loss": 0.7452, + "step": 25920 + }, + { + "epoch": 1.8035751547610768, + "grad_norm": 2.129020231989996, + "learning_rate": 4.111442080778728e-06, + "loss": 0.7451, + "step": 25930 + }, + { + "epoch": 1.8042707101620645, + "grad_norm": 2.3974456859308724, + "learning_rate": 4.107460113533153e-06, + "loss": 0.6827, + "step": 25940 + }, + { + "epoch": 1.804966265563052, + "grad_norm": 1.956613006902284, + "learning_rate": 4.103478730920704e-06, + "loss": 0.6889, + "step": 25950 + }, + { + "epoch": 1.8056618209640398, + "grad_norm": 1.9172280361956606, + "learning_rate": 4.099497935549275e-06, + "loss": 0.6941, + "step": 25960 + }, + { + "epoch": 1.8063573763650274, + "grad_norm": 2.3914589662453887, + "learning_rate": 4.095517730026371e-06, + "loss": 0.6761, + "step": 25970 + }, + { + "epoch": 1.8070529317660151, + "grad_norm": 2.294012163121128, + "learning_rate": 4.09153811695912e-06, + "loss": 0.749, + "step": 25980 + }, + { + "epoch": 1.8077484871670029, + "grad_norm": 2.205252718741433, + "learning_rate": 4.087559098954247e-06, + "loss": 0.7005, + "step": 25990 + }, + { + "epoch": 1.8084440425679906, + "grad_norm": 2.1629248899398337, + "learning_rate": 4.083580678618102e-06, + "loss": 0.7282, + "step": 26000 + }, + { + "epoch": 1.8084440425679906, + "eval_loss": 0.9468376040458679, + "eval_runtime": 1322.135, + "eval_samples_per_second": 13.736, + "eval_steps_per_second": 2.289, + "step": 26000 + }, + { + "epoch": 1.8091395979689784, + "grad_norm": 2.5435151610509465, + "learning_rate": 4.079602858556635e-06, + "loss": 0.7435, + "step": 26010 + }, + { + "epoch": 1.809835153369966, + "grad_norm": 2.106443206394638, + "learning_rate": 4.075625641375405e-06, + "loss": 0.7418, + "step": 26020 + }, + { + "epoch": 1.8105307087709535, + "grad_norm": 2.119261772565286, + "learning_rate": 4.071649029679575e-06, + "loss": 0.8071, + "step": 26030 + }, + { + "epoch": 1.8112262641719412, + "grad_norm": 2.192023064311098, + "learning_rate": 4.067673026073916e-06, + "loss": 0.7358, + "step": 26040 + }, + { + "epoch": 1.811921819572929, + "grad_norm": 3.7972563459040085, + "learning_rate": 4.0636976331627954e-06, + "loss": 0.696, + "step": 26050 + }, + { + "epoch": 1.8126173749739167, + "grad_norm": 1.9909248511552562, + "learning_rate": 4.059722853550181e-06, + "loss": 0.7647, + "step": 26060 + }, + { + "epoch": 1.8133129303749045, + "grad_norm": 2.0194610632189707, + "learning_rate": 4.055748689839642e-06, + "loss": 0.6978, + "step": 26070 + }, + { + "epoch": 1.814008485775892, + "grad_norm": 2.6854164615050164, + "learning_rate": 4.051775144634342e-06, + "loss": 0.735, + "step": 26080 + }, + { + "epoch": 1.8147040411768798, + "grad_norm": 1.874786608760331, + "learning_rate": 4.047802220537038e-06, + "loss": 0.7456, + "step": 26090 + }, + { + "epoch": 1.8153995965778673, + "grad_norm": 2.110845781301349, + "learning_rate": 4.043829920150086e-06, + "loss": 0.7026, + "step": 26100 + }, + { + "epoch": 1.816095151978855, + "grad_norm": 2.3192054700203273, + "learning_rate": 4.039858246075427e-06, + "loss": 0.7744, + "step": 26110 + }, + { + "epoch": 1.8167907073798428, + "grad_norm": 3.319360196658644, + "learning_rate": 4.0358872009145964e-06, + "loss": 0.7256, + "step": 26120 + }, + { + "epoch": 1.8174862627808306, + "grad_norm": 2.165841919310701, + "learning_rate": 4.031916787268712e-06, + "loss": 0.716, + "step": 26130 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 1.9179620296003483, + "learning_rate": 4.027947007738484e-06, + "loss": 0.772, + "step": 26140 + }, + { + "epoch": 1.8188773735828059, + "grad_norm": 1.8150782288626675, + "learning_rate": 4.023977864924203e-06, + "loss": 0.6914, + "step": 26150 + }, + { + "epoch": 1.8195729289837934, + "grad_norm": 2.293382713008229, + "learning_rate": 4.020009361425746e-06, + "loss": 0.6759, + "step": 26160 + }, + { + "epoch": 1.8202684843847812, + "grad_norm": 1.9963820364778788, + "learning_rate": 4.01604149984257e-06, + "loss": 0.748, + "step": 26170 + }, + { + "epoch": 1.820964039785769, + "grad_norm": 1.582426206206012, + "learning_rate": 4.012074282773709e-06, + "loss": 0.6459, + "step": 26180 + }, + { + "epoch": 1.8216595951867567, + "grad_norm": 2.1119156856722188, + "learning_rate": 4.00810771281778e-06, + "loss": 0.7569, + "step": 26190 + }, + { + "epoch": 1.8223551505877444, + "grad_norm": 2.3765160619708814, + "learning_rate": 4.004141792572968e-06, + "loss": 0.6908, + "step": 26200 + }, + { + "epoch": 1.823050705988732, + "grad_norm": 3.1983673744513914, + "learning_rate": 4.000176524637042e-06, + "loss": 0.7387, + "step": 26210 + }, + { + "epoch": 1.8237462613897197, + "grad_norm": 1.6402038690706624, + "learning_rate": 3.996211911607335e-06, + "loss": 0.7383, + "step": 26220 + }, + { + "epoch": 1.8244418167907073, + "grad_norm": 2.5888540234218453, + "learning_rate": 3.992247956080759e-06, + "loss": 0.7314, + "step": 26230 + }, + { + "epoch": 1.825137372191695, + "grad_norm": 2.3424826221747996, + "learning_rate": 3.988284660653789e-06, + "loss": 0.7505, + "step": 26240 + }, + { + "epoch": 1.8258329275926828, + "grad_norm": 1.7108169043882773, + "learning_rate": 3.984322027922473e-06, + "loss": 0.7514, + "step": 26250 + }, + { + "epoch": 1.8265284829936705, + "grad_norm": 3.4935519483466275, + "learning_rate": 3.980360060482418e-06, + "loss": 0.7393, + "step": 26260 + }, + { + "epoch": 1.8272240383946583, + "grad_norm": 1.9787329988473497, + "learning_rate": 3.976398760928805e-06, + "loss": 0.7283, + "step": 26270 + }, + { + "epoch": 1.8279195937956458, + "grad_norm": 1.622366508766944, + "learning_rate": 3.9724381318563645e-06, + "loss": 0.7562, + "step": 26280 + }, + { + "epoch": 1.8286151491966334, + "grad_norm": 1.7366667896446193, + "learning_rate": 3.968478175859399e-06, + "loss": 0.715, + "step": 26290 + }, + { + "epoch": 1.8293107045976211, + "grad_norm": 7.815320999276022, + "learning_rate": 3.964518895531767e-06, + "loss": 0.7782, + "step": 26300 + }, + { + "epoch": 1.8300062599986089, + "grad_norm": 2.8292069939177504, + "learning_rate": 3.960560293466882e-06, + "loss": 0.7401, + "step": 26310 + }, + { + "epoch": 1.8307018153995966, + "grad_norm": 7.594625931830247, + "learning_rate": 3.956602372257714e-06, + "loss": 0.7008, + "step": 26320 + }, + { + "epoch": 1.8313973708005844, + "grad_norm": 7.167904216300585, + "learning_rate": 3.952645134496791e-06, + "loss": 0.6893, + "step": 26330 + }, + { + "epoch": 1.832092926201572, + "grad_norm": 2.245748175824168, + "learning_rate": 3.948688582776189e-06, + "loss": 0.7319, + "step": 26340 + }, + { + "epoch": 1.8327884816025597, + "grad_norm": 3.2950820833597922, + "learning_rate": 3.944732719687534e-06, + "loss": 0.7698, + "step": 26350 + }, + { + "epoch": 1.8334840370035472, + "grad_norm": 4.702666316629816, + "learning_rate": 3.9407775478220036e-06, + "loss": 0.7127, + "step": 26360 + }, + { + "epoch": 1.834179592404535, + "grad_norm": 2.3561046637402536, + "learning_rate": 3.936823069770323e-06, + "loss": 0.7416, + "step": 26370 + }, + { + "epoch": 1.8348751478055227, + "grad_norm": 2.0423206312574638, + "learning_rate": 3.93286928812276e-06, + "loss": 0.7039, + "step": 26380 + }, + { + "epoch": 1.8355707032065105, + "grad_norm": 2.150994067721087, + "learning_rate": 3.928916205469129e-06, + "loss": 0.6723, + "step": 26390 + }, + { + "epoch": 1.836266258607498, + "grad_norm": 1.6285073124773723, + "learning_rate": 3.924963824398786e-06, + "loss": 0.702, + "step": 26400 + }, + { + "epoch": 1.8369618140084858, + "grad_norm": 2.3651833601142274, + "learning_rate": 3.9210121475006265e-06, + "loss": 0.7233, + "step": 26410 + }, + { + "epoch": 1.8376573694094733, + "grad_norm": 4.257591639487418, + "learning_rate": 3.917061177363085e-06, + "loss": 0.7251, + "step": 26420 + }, + { + "epoch": 1.838352924810461, + "grad_norm": 1.648269899273396, + "learning_rate": 3.913110916574133e-06, + "loss": 0.725, + "step": 26430 + }, + { + "epoch": 1.8390484802114488, + "grad_norm": 4.564149090697968, + "learning_rate": 3.909161367721277e-06, + "loss": 0.7843, + "step": 26440 + }, + { + "epoch": 1.8397440356124366, + "grad_norm": 1.8109206328338112, + "learning_rate": 3.9052125333915596e-06, + "loss": 0.6923, + "step": 26450 + }, + { + "epoch": 1.8404395910134244, + "grad_norm": 1.891991960714292, + "learning_rate": 3.901264416171552e-06, + "loss": 0.7962, + "step": 26460 + }, + { + "epoch": 1.841135146414412, + "grad_norm": 5.688885164001301, + "learning_rate": 3.897317018647359e-06, + "loss": 0.7446, + "step": 26470 + }, + { + "epoch": 1.8418307018153997, + "grad_norm": 2.3116883079346047, + "learning_rate": 3.893370343404611e-06, + "loss": 0.7467, + "step": 26480 + }, + { + "epoch": 1.8425262572163872, + "grad_norm": 2.9969210583395305, + "learning_rate": 3.8894243930284665e-06, + "loss": 0.7675, + "step": 26490 + }, + { + "epoch": 1.843221812617375, + "grad_norm": 2.013498826888565, + "learning_rate": 3.885479170103609e-06, + "loss": 0.7434, + "step": 26500 + }, + { + "epoch": 1.843221812617375, + "eval_loss": 0.9437658786773682, + "eval_runtime": 1323.9711, + "eval_samples_per_second": 13.717, + "eval_steps_per_second": 2.286, + "step": 26500 + }, + { + "epoch": 1.8439173680183627, + "grad_norm": 5.976965808331218, + "learning_rate": 3.881534677214245e-06, + "loss": 0.7328, + "step": 26510 + }, + { + "epoch": 1.8446129234193505, + "grad_norm": 3.206815244201167, + "learning_rate": 3.877590916944106e-06, + "loss": 0.767, + "step": 26520 + }, + { + "epoch": 1.845308478820338, + "grad_norm": 2.111768410843262, + "learning_rate": 3.873647891876439e-06, + "loss": 0.7081, + "step": 26530 + }, + { + "epoch": 1.8460040342213258, + "grad_norm": 2.3908557033219906, + "learning_rate": 3.869705604594013e-06, + "loss": 0.6955, + "step": 26540 + }, + { + "epoch": 1.8466995896223133, + "grad_norm": 2.0357867568390837, + "learning_rate": 3.865764057679112e-06, + "loss": 0.7305, + "step": 26550 + }, + { + "epoch": 1.847395145023301, + "grad_norm": 1.766771983079912, + "learning_rate": 3.861823253713535e-06, + "loss": 0.7678, + "step": 26560 + }, + { + "epoch": 1.8480907004242888, + "grad_norm": 5.558926903757594, + "learning_rate": 3.857883195278593e-06, + "loss": 0.7469, + "step": 26570 + }, + { + "epoch": 1.8487862558252766, + "grad_norm": 2.106119348004014, + "learning_rate": 3.853943884955113e-06, + "loss": 0.7376, + "step": 26580 + }, + { + "epoch": 1.8494818112262643, + "grad_norm": 2.5437740779661833, + "learning_rate": 3.850005325323428e-06, + "loss": 0.7524, + "step": 26590 + }, + { + "epoch": 1.8501773666272519, + "grad_norm": 2.0081333945064395, + "learning_rate": 3.846067518963381e-06, + "loss": 0.6475, + "step": 26600 + }, + { + "epoch": 1.8508729220282394, + "grad_norm": 1.994313701910116, + "learning_rate": 3.8421304684543205e-06, + "loss": 0.7826, + "step": 26610 + }, + { + "epoch": 1.8515684774292271, + "grad_norm": 2.354511040502367, + "learning_rate": 3.838194176375104e-06, + "loss": 0.7443, + "step": 26620 + }, + { + "epoch": 1.852264032830215, + "grad_norm": 2.0630976404066894, + "learning_rate": 3.834258645304084e-06, + "loss": 0.7219, + "step": 26630 + }, + { + "epoch": 1.8529595882312027, + "grad_norm": 1.8412569095209645, + "learning_rate": 3.830323877819121e-06, + "loss": 0.7683, + "step": 26640 + }, + { + "epoch": 1.8536551436321904, + "grad_norm": 2.540001135752198, + "learning_rate": 3.826389876497575e-06, + "loss": 0.7621, + "step": 26650 + }, + { + "epoch": 1.854350699033178, + "grad_norm": 2.4998892949306715, + "learning_rate": 3.822456643916302e-06, + "loss": 0.7019, + "step": 26660 + }, + { + "epoch": 1.8550462544341657, + "grad_norm": 1.9702960350543168, + "learning_rate": 3.8185241826516536e-06, + "loss": 0.7651, + "step": 26670 + }, + { + "epoch": 1.8557418098351532, + "grad_norm": 2.3284415829846723, + "learning_rate": 3.814592495279481e-06, + "loss": 0.7564, + "step": 26680 + }, + { + "epoch": 1.856437365236141, + "grad_norm": 1.8929281487816716, + "learning_rate": 3.8106615843751244e-06, + "loss": 0.7914, + "step": 26690 + }, + { + "epoch": 1.8571329206371288, + "grad_norm": 1.8880416177857826, + "learning_rate": 3.8067314525134147e-06, + "loss": 0.7447, + "step": 26700 + }, + { + "epoch": 1.8578284760381165, + "grad_norm": 1.9939880412124684, + "learning_rate": 3.802802102268675e-06, + "loss": 0.759, + "step": 26710 + }, + { + "epoch": 1.8585240314391043, + "grad_norm": 2.0382312738733255, + "learning_rate": 3.798873536214716e-06, + "loss": 0.7222, + "step": 26720 + }, + { + "epoch": 1.8592195868400918, + "grad_norm": 2.011516309215394, + "learning_rate": 3.7949457569248328e-06, + "loss": 0.7205, + "step": 26730 + }, + { + "epoch": 1.8599151422410793, + "grad_norm": 2.2102212293936794, + "learning_rate": 3.791018766971809e-06, + "loss": 0.7775, + "step": 26740 + }, + { + "epoch": 1.860610697642067, + "grad_norm": 2.11792235903374, + "learning_rate": 3.7870925689279075e-06, + "loss": 0.7812, + "step": 26750 + }, + { + "epoch": 1.8613062530430549, + "grad_norm": 2.88162513136176, + "learning_rate": 3.7831671653648754e-06, + "loss": 0.7495, + "step": 26760 + }, + { + "epoch": 1.8620018084440426, + "grad_norm": 3.1344146216688866, + "learning_rate": 3.779242558853935e-06, + "loss": 0.8041, + "step": 26770 + }, + { + "epoch": 1.8626973638450304, + "grad_norm": 2.8050404405439635, + "learning_rate": 3.775318751965791e-06, + "loss": 0.7395, + "step": 26780 + }, + { + "epoch": 1.863392919246018, + "grad_norm": 1.70957217465524, + "learning_rate": 3.771395747270622e-06, + "loss": 0.7251, + "step": 26790 + }, + { + "epoch": 1.8640884746470057, + "grad_norm": 1.8773598778872909, + "learning_rate": 3.7674735473380807e-06, + "loss": 0.7503, + "step": 26800 + }, + { + "epoch": 1.8647840300479932, + "grad_norm": 2.4243123523615924, + "learning_rate": 3.7635521547372966e-06, + "loss": 0.6955, + "step": 26810 + }, + { + "epoch": 1.865479585448981, + "grad_norm": 1.84428525021793, + "learning_rate": 3.7596315720368664e-06, + "loss": 0.7112, + "step": 26820 + }, + { + "epoch": 1.8661751408499687, + "grad_norm": 2.1156564897321433, + "learning_rate": 3.7557118018048577e-06, + "loss": 0.7565, + "step": 26830 + }, + { + "epoch": 1.8668706962509565, + "grad_norm": 2.922468228263847, + "learning_rate": 3.7517928466088044e-06, + "loss": 0.742, + "step": 26840 + }, + { + "epoch": 1.8675662516519442, + "grad_norm": 3.153304004056719, + "learning_rate": 3.747874709015708e-06, + "loss": 0.761, + "step": 26850 + }, + { + "epoch": 1.8682618070529318, + "grad_norm": 1.75137929007339, + "learning_rate": 3.7439573915920343e-06, + "loss": 0.6703, + "step": 26860 + }, + { + "epoch": 1.8689573624539193, + "grad_norm": 1.9181684641042362, + "learning_rate": 3.740040896903713e-06, + "loss": 0.7257, + "step": 26870 + }, + { + "epoch": 1.869652917854907, + "grad_norm": 3.322165540746283, + "learning_rate": 3.7361252275161317e-06, + "loss": 0.6757, + "step": 26880 + }, + { + "epoch": 1.8703484732558948, + "grad_norm": 2.1333324912487326, + "learning_rate": 3.7322103859941417e-06, + "loss": 0.7897, + "step": 26890 + }, + { + "epoch": 1.8710440286568826, + "grad_norm": 1.9407180376669364, + "learning_rate": 3.7282963749020496e-06, + "loss": 0.7005, + "step": 26900 + }, + { + "epoch": 1.8717395840578703, + "grad_norm": 1.6206130280227895, + "learning_rate": 3.724383196803616e-06, + "loss": 0.7507, + "step": 26910 + }, + { + "epoch": 1.8724351394588579, + "grad_norm": 1.3572088732469472, + "learning_rate": 3.720470854262058e-06, + "loss": 0.7113, + "step": 26920 + }, + { + "epoch": 1.8731306948598456, + "grad_norm": 3.1547747456735777, + "learning_rate": 3.7165593498400487e-06, + "loss": 0.7077, + "step": 26930 + }, + { + "epoch": 1.8738262502608332, + "grad_norm": 1.9408390448324786, + "learning_rate": 3.7126486860997056e-06, + "loss": 0.7425, + "step": 26940 + }, + { + "epoch": 1.874521805661821, + "grad_norm": 2.005765615933561, + "learning_rate": 3.7087388656026013e-06, + "loss": 0.7588, + "step": 26950 + }, + { + "epoch": 1.8752173610628087, + "grad_norm": 2.923249342883143, + "learning_rate": 3.704829890909751e-06, + "loss": 0.7273, + "step": 26960 + }, + { + "epoch": 1.8759129164637964, + "grad_norm": 2.7767350091375333, + "learning_rate": 3.700921764581621e-06, + "loss": 0.7845, + "step": 26970 + }, + { + "epoch": 1.8766084718647842, + "grad_norm": 2.475248495028692, + "learning_rate": 3.6970144891781203e-06, + "loss": 0.7801, + "step": 26980 + }, + { + "epoch": 1.8773040272657717, + "grad_norm": 5.9000529354512095, + "learning_rate": 3.693108067258596e-06, + "loss": 0.7898, + "step": 26990 + }, + { + "epoch": 1.8779995826667593, + "grad_norm": 2.323137883301424, + "learning_rate": 3.6892025013818423e-06, + "loss": 0.7515, + "step": 27000 + }, + { + "epoch": 1.8779995826667593, + "eval_loss": 0.9409797787666321, + "eval_runtime": 1321.6667, + "eval_samples_per_second": 13.741, + "eval_steps_per_second": 2.29, + "step": 27000 + }, + { + "epoch": 1.878695138067747, + "grad_norm": 2.879042619007983, + "learning_rate": 3.6852977941060887e-06, + "loss": 0.7837, + "step": 27010 + }, + { + "epoch": 1.8793906934687348, + "grad_norm": 1.8728129112306033, + "learning_rate": 3.6813939479890048e-06, + "loss": 0.7122, + "step": 27020 + }, + { + "epoch": 1.8800862488697225, + "grad_norm": 1.9759866401180137, + "learning_rate": 3.6774909655876957e-06, + "loss": 0.7461, + "step": 27030 + }, + { + "epoch": 1.8807818042707103, + "grad_norm": 1.6572241167158626, + "learning_rate": 3.6735888494586992e-06, + "loss": 0.7295, + "step": 27040 + }, + { + "epoch": 1.8814773596716978, + "grad_norm": 2.236688559501781, + "learning_rate": 3.6696876021579876e-06, + "loss": 0.6881, + "step": 27050 + }, + { + "epoch": 1.8821729150726856, + "grad_norm": 1.8262670445094737, + "learning_rate": 3.6657872262409616e-06, + "loss": 0.6885, + "step": 27060 + }, + { + "epoch": 1.8828684704736731, + "grad_norm": 2.780407053679724, + "learning_rate": 3.6618877242624536e-06, + "loss": 0.7491, + "step": 27070 + }, + { + "epoch": 1.8835640258746609, + "grad_norm": 1.6652925953007502, + "learning_rate": 3.657989098776722e-06, + "loss": 0.7223, + "step": 27080 + }, + { + "epoch": 1.8842595812756486, + "grad_norm": 1.586935120910969, + "learning_rate": 3.654091352337451e-06, + "loss": 0.7056, + "step": 27090 + }, + { + "epoch": 1.8849551366766364, + "grad_norm": 2.6610763673023454, + "learning_rate": 3.6501944874977524e-06, + "loss": 0.7118, + "step": 27100 + }, + { + "epoch": 1.8856506920776241, + "grad_norm": 2.196293286779499, + "learning_rate": 3.6462985068101557e-06, + "loss": 0.7055, + "step": 27110 + }, + { + "epoch": 1.8863462474786117, + "grad_norm": 2.2943105739603005, + "learning_rate": 3.642403412826615e-06, + "loss": 0.7542, + "step": 27120 + }, + { + "epoch": 1.8870418028795992, + "grad_norm": 2.075691840831412, + "learning_rate": 3.6385092080984997e-06, + "loss": 0.7717, + "step": 27130 + }, + { + "epoch": 1.887737358280587, + "grad_norm": 2.956969379731906, + "learning_rate": 3.6346158951766005e-06, + "loss": 0.7353, + "step": 27140 + }, + { + "epoch": 1.8884329136815747, + "grad_norm": 2.4453371374716766, + "learning_rate": 3.6307234766111206e-06, + "loss": 0.7615, + "step": 27150 + }, + { + "epoch": 1.8891284690825625, + "grad_norm": 2.5078508012332317, + "learning_rate": 3.6268319549516816e-06, + "loss": 0.7546, + "step": 27160 + }, + { + "epoch": 1.8898240244835502, + "grad_norm": 2.631234443798877, + "learning_rate": 3.622941332747314e-06, + "loss": 0.7831, + "step": 27170 + }, + { + "epoch": 1.8905195798845378, + "grad_norm": 1.9609415354552282, + "learning_rate": 3.6190516125464616e-06, + "loss": 0.8023, + "step": 27180 + }, + { + "epoch": 1.8912151352855255, + "grad_norm": 3.078152489274002, + "learning_rate": 3.6151627968969747e-06, + "loss": 0.7465, + "step": 27190 + }, + { + "epoch": 1.891910690686513, + "grad_norm": 1.9290883625445618, + "learning_rate": 3.611274888346113e-06, + "loss": 0.7194, + "step": 27200 + }, + { + "epoch": 1.8926062460875008, + "grad_norm": 1.668470261183572, + "learning_rate": 3.60738788944054e-06, + "loss": 0.7393, + "step": 27210 + }, + { + "epoch": 1.8933018014884886, + "grad_norm": 1.7559538247524336, + "learning_rate": 3.6035018027263272e-06, + "loss": 0.6993, + "step": 27220 + }, + { + "epoch": 1.8939973568894763, + "grad_norm": 2.4418248563656735, + "learning_rate": 3.599616630748946e-06, + "loss": 0.7401, + "step": 27230 + }, + { + "epoch": 1.8946929122904639, + "grad_norm": 1.9176366693268079, + "learning_rate": 3.595732376053268e-06, + "loss": 0.7049, + "step": 27240 + }, + { + "epoch": 1.8953884676914516, + "grad_norm": 2.520310619711514, + "learning_rate": 3.5918490411835647e-06, + "loss": 0.7428, + "step": 27250 + }, + { + "epoch": 1.8960840230924392, + "grad_norm": 3.068896340720195, + "learning_rate": 3.5879666286835084e-06, + "loss": 0.7386, + "step": 27260 + }, + { + "epoch": 1.896779578493427, + "grad_norm": 2.4057061461647593, + "learning_rate": 3.5840851410961585e-06, + "loss": 0.7693, + "step": 27270 + }, + { + "epoch": 1.8974751338944147, + "grad_norm": 2.1398973193124675, + "learning_rate": 3.580204580963979e-06, + "loss": 0.7225, + "step": 27280 + }, + { + "epoch": 1.8981706892954024, + "grad_norm": 1.7600406564055677, + "learning_rate": 3.5763249508288197e-06, + "loss": 0.7158, + "step": 27290 + }, + { + "epoch": 1.8988662446963902, + "grad_norm": 5.324611434642918, + "learning_rate": 3.5724462532319225e-06, + "loss": 0.7363, + "step": 27300 + }, + { + "epoch": 1.8995618000973777, + "grad_norm": 1.9088789939796014, + "learning_rate": 3.5685684907139195e-06, + "loss": 0.7482, + "step": 27310 + }, + { + "epoch": 1.9002573554983655, + "grad_norm": 2.1606470674791627, + "learning_rate": 3.564691665814831e-06, + "loss": 0.6915, + "step": 27320 + }, + { + "epoch": 1.900952910899353, + "grad_norm": 3.042251062816875, + "learning_rate": 3.5608157810740635e-06, + "loss": 0.7847, + "step": 27330 + }, + { + "epoch": 1.9016484663003408, + "grad_norm": 4.0678796788145, + "learning_rate": 3.5569408390304007e-06, + "loss": 0.7305, + "step": 27340 + }, + { + "epoch": 1.9023440217013285, + "grad_norm": 1.819001740408555, + "learning_rate": 3.553066842222018e-06, + "loss": 0.748, + "step": 27350 + }, + { + "epoch": 1.9030395771023163, + "grad_norm": 2.7966861811647696, + "learning_rate": 3.549193793186468e-06, + "loss": 0.6913, + "step": 27360 + }, + { + "epoch": 1.9037351325033038, + "grad_norm": 2.246196640882327, + "learning_rate": 3.5453216944606804e-06, + "loss": 0.7127, + "step": 27370 + }, + { + "epoch": 1.9044306879042916, + "grad_norm": 1.888216629941478, + "learning_rate": 3.5414505485809677e-06, + "loss": 0.7307, + "step": 27380 + }, + { + "epoch": 1.9051262433052791, + "grad_norm": 1.9683135448518752, + "learning_rate": 3.5375803580830125e-06, + "loss": 0.7043, + "step": 27390 + }, + { + "epoch": 1.905821798706267, + "grad_norm": 2.241750473058003, + "learning_rate": 3.5337111255018765e-06, + "loss": 0.7216, + "step": 27400 + }, + { + "epoch": 1.9065173541072546, + "grad_norm": 2.49123264771791, + "learning_rate": 3.5298428533719885e-06, + "loss": 0.7498, + "step": 27410 + }, + { + "epoch": 1.9072129095082424, + "grad_norm": 2.010544609927867, + "learning_rate": 3.525975544227154e-06, + "loss": 0.7264, + "step": 27420 + }, + { + "epoch": 1.9079084649092302, + "grad_norm": 2.549814396433616, + "learning_rate": 3.522109200600542e-06, + "loss": 0.7128, + "step": 27430 + }, + { + "epoch": 1.9086040203102177, + "grad_norm": 2.3534576863865895, + "learning_rate": 3.5182438250246936e-06, + "loss": 0.6964, + "step": 27440 + }, + { + "epoch": 1.9092995757112052, + "grad_norm": 5.673669235182949, + "learning_rate": 3.5143794200315156e-06, + "loss": 0.7391, + "step": 27450 + }, + { + "epoch": 1.909995131112193, + "grad_norm": 1.5604232743051925, + "learning_rate": 3.5105159881522767e-06, + "loss": 0.779, + "step": 27460 + }, + { + "epoch": 1.9106906865131807, + "grad_norm": 3.6621745749887498, + "learning_rate": 3.5066535319176098e-06, + "loss": 0.7534, + "step": 27470 + }, + { + "epoch": 1.9113862419141685, + "grad_norm": 3.045422707336111, + "learning_rate": 3.502792053857506e-06, + "loss": 0.7476, + "step": 27480 + }, + { + "epoch": 1.9120817973151563, + "grad_norm": 2.4694936665312035, + "learning_rate": 3.498931556501319e-06, + "loss": 0.7208, + "step": 27490 + }, + { + "epoch": 1.9127773527161438, + "grad_norm": 3.916229377546053, + "learning_rate": 3.4950720423777596e-06, + "loss": 0.7553, + "step": 27500 + }, + { + "epoch": 1.9127773527161438, + "eval_loss": 0.9387843608856201, + "eval_runtime": 1323.4659, + "eval_samples_per_second": 13.722, + "eval_steps_per_second": 2.287, + "step": 27500 + }, + { + "epoch": 1.9134729081171316, + "grad_norm": 1.6068060244082223, + "learning_rate": 3.4912135140148928e-06, + "loss": 0.6631, + "step": 27510 + }, + { + "epoch": 1.914168463518119, + "grad_norm": 3.955715300246021, + "learning_rate": 3.48735597394014e-06, + "loss": 0.7511, + "step": 27520 + }, + { + "epoch": 1.9148640189191068, + "grad_norm": 1.9078303717044451, + "learning_rate": 3.4834994246802744e-06, + "loss": 0.7523, + "step": 27530 + }, + { + "epoch": 1.9155595743200946, + "grad_norm": 2.0819792027221182, + "learning_rate": 3.47964386876142e-06, + "loss": 0.7833, + "step": 27540 + }, + { + "epoch": 1.9162551297210824, + "grad_norm": 5.970337199217039, + "learning_rate": 3.4757893087090483e-06, + "loss": 0.7445, + "step": 27550 + }, + { + "epoch": 1.9169506851220701, + "grad_norm": 1.8292513952433844, + "learning_rate": 3.471935747047981e-06, + "loss": 0.7405, + "step": 27560 + }, + { + "epoch": 1.9176462405230577, + "grad_norm": 1.8500100537956834, + "learning_rate": 3.4680831863023866e-06, + "loss": 0.7301, + "step": 27570 + }, + { + "epoch": 1.9183417959240452, + "grad_norm": 2.0295083132252487, + "learning_rate": 3.4642316289957755e-06, + "loss": 0.7024, + "step": 27580 + }, + { + "epoch": 1.919037351325033, + "grad_norm": 2.4343806031585813, + "learning_rate": 3.4603810776510026e-06, + "loss": 0.7247, + "step": 27590 + }, + { + "epoch": 1.9197329067260207, + "grad_norm": 4.442339961945882, + "learning_rate": 3.4565315347902615e-06, + "loss": 0.8052, + "step": 27600 + }, + { + "epoch": 1.9204284621270085, + "grad_norm": 1.8731008796353903, + "learning_rate": 3.452683002935091e-06, + "loss": 0.7096, + "step": 27610 + }, + { + "epoch": 1.9211240175279962, + "grad_norm": 1.9995520597003416, + "learning_rate": 3.4488354846063577e-06, + "loss": 0.734, + "step": 27620 + }, + { + "epoch": 1.9218195729289838, + "grad_norm": 2.3536722741991407, + "learning_rate": 3.4449889823242744e-06, + "loss": 0.6806, + "step": 27630 + }, + { + "epoch": 1.9225151283299715, + "grad_norm": 3.626896322899432, + "learning_rate": 3.4411434986083827e-06, + "loss": 0.7454, + "step": 27640 + }, + { + "epoch": 1.923210683730959, + "grad_norm": 2.282510509448647, + "learning_rate": 3.4372990359775587e-06, + "loss": 0.7285, + "step": 27650 + }, + { + "epoch": 1.9239062391319468, + "grad_norm": 2.4944037658370473, + "learning_rate": 3.433455596950008e-06, + "loss": 0.7392, + "step": 27660 + }, + { + "epoch": 1.9246017945329346, + "grad_norm": 1.9754471535063818, + "learning_rate": 3.42961318404327e-06, + "loss": 0.6819, + "step": 27670 + }, + { + "epoch": 1.9252973499339223, + "grad_norm": 4.446560170975328, + "learning_rate": 3.4257717997742073e-06, + "loss": 0.6852, + "step": 27680 + }, + { + "epoch": 1.92599290533491, + "grad_norm": 5.641725807463967, + "learning_rate": 3.4219314466590113e-06, + "loss": 0.6407, + "step": 27690 + }, + { + "epoch": 1.9266884607358976, + "grad_norm": 2.4386399430830212, + "learning_rate": 3.4180921272131968e-06, + "loss": 0.8259, + "step": 27700 + }, + { + "epoch": 1.9273840161368851, + "grad_norm": 2.652413533419073, + "learning_rate": 3.4142538439516017e-06, + "loss": 0.7854, + "step": 27710 + }, + { + "epoch": 1.928079571537873, + "grad_norm": 4.384143781068988, + "learning_rate": 3.4104165993883843e-06, + "loss": 0.7421, + "step": 27720 + }, + { + "epoch": 1.9287751269388607, + "grad_norm": 2.3022345610558226, + "learning_rate": 3.406580396037025e-06, + "loss": 0.7225, + "step": 27730 + }, + { + "epoch": 1.9294706823398484, + "grad_norm": 3.791901000868261, + "learning_rate": 3.402745236410321e-06, + "loss": 0.7809, + "step": 27740 + }, + { + "epoch": 1.9301662377408362, + "grad_norm": 3.76397547872346, + "learning_rate": 3.398911123020385e-06, + "loss": 0.7165, + "step": 27750 + }, + { + "epoch": 1.9308617931418237, + "grad_norm": 2.5837893509621686, + "learning_rate": 3.3950780583786476e-06, + "loss": 0.6949, + "step": 27760 + }, + { + "epoch": 1.9315573485428115, + "grad_norm": 2.4353870934147612, + "learning_rate": 3.3912460449958456e-06, + "loss": 0.6988, + "step": 27770 + }, + { + "epoch": 1.932252903943799, + "grad_norm": 2.0907756230202543, + "learning_rate": 3.3874150853820342e-06, + "loss": 0.7456, + "step": 27780 + }, + { + "epoch": 1.9329484593447868, + "grad_norm": 2.0666501758579146, + "learning_rate": 3.3835851820465736e-06, + "loss": 0.7584, + "step": 27790 + }, + { + "epoch": 1.9336440147457745, + "grad_norm": 2.5365996238606305, + "learning_rate": 3.379756337498137e-06, + "loss": 0.7336, + "step": 27800 + }, + { + "epoch": 1.9343395701467623, + "grad_norm": 2.2727178122696, + "learning_rate": 3.3759285542446983e-06, + "loss": 0.692, + "step": 27810 + }, + { + "epoch": 1.93503512554775, + "grad_norm": 1.5790205926649854, + "learning_rate": 3.372101834793542e-06, + "loss": 0.7817, + "step": 27820 + }, + { + "epoch": 1.9357306809487376, + "grad_norm": 1.9969111663089205, + "learning_rate": 3.368276181651252e-06, + "loss": 0.7386, + "step": 27830 + }, + { + "epoch": 1.936426236349725, + "grad_norm": 2.4655035196586343, + "learning_rate": 3.364451597323714e-06, + "loss": 0.7361, + "step": 27840 + }, + { + "epoch": 1.9371217917507129, + "grad_norm": 1.9099780194211418, + "learning_rate": 3.3606280843161134e-06, + "loss": 0.6903, + "step": 27850 + }, + { + "epoch": 1.9378173471517006, + "grad_norm": 2.038135641410283, + "learning_rate": 3.3568056451329366e-06, + "loss": 0.7882, + "step": 27860 + }, + { + "epoch": 1.9385129025526884, + "grad_norm": 2.6184281532656115, + "learning_rate": 3.352984282277964e-06, + "loss": 0.7365, + "step": 27870 + }, + { + "epoch": 1.9392084579536761, + "grad_norm": 2.1279436905603593, + "learning_rate": 3.349163998254272e-06, + "loss": 0.7119, + "step": 27880 + }, + { + "epoch": 1.9399040133546637, + "grad_norm": 3.3613594360284447, + "learning_rate": 3.3453447955642293e-06, + "loss": 0.6933, + "step": 27890 + }, + { + "epoch": 1.9405995687556514, + "grad_norm": 2.3208970675063156, + "learning_rate": 3.3415266767095005e-06, + "loss": 0.7097, + "step": 27900 + }, + { + "epoch": 1.941295124156639, + "grad_norm": 3.0008087960469734, + "learning_rate": 3.337709644191031e-06, + "loss": 0.7663, + "step": 27910 + }, + { + "epoch": 1.9419906795576267, + "grad_norm": 2.238688653982131, + "learning_rate": 3.3338937005090655e-06, + "loss": 0.6929, + "step": 27920 + }, + { + "epoch": 1.9426862349586145, + "grad_norm": 2.964152233315608, + "learning_rate": 3.3300788481631276e-06, + "loss": 0.7681, + "step": 27930 + }, + { + "epoch": 1.9433817903596022, + "grad_norm": 2.993541075555904, + "learning_rate": 3.326265089652031e-06, + "loss": 0.7481, + "step": 27940 + }, + { + "epoch": 1.94407734576059, + "grad_norm": 1.7215937834539607, + "learning_rate": 3.3224524274738678e-06, + "loss": 0.6797, + "step": 27950 + }, + { + "epoch": 1.9447729011615775, + "grad_norm": 2.5809225537372873, + "learning_rate": 3.318640864126019e-06, + "loss": 0.761, + "step": 27960 + }, + { + "epoch": 1.945468456562565, + "grad_norm": 4.650571190507667, + "learning_rate": 3.3148304021051413e-06, + "loss": 0.722, + "step": 27970 + }, + { + "epoch": 1.9461640119635528, + "grad_norm": 3.4803950572647087, + "learning_rate": 3.3110210439071667e-06, + "loss": 0.7133, + "step": 27980 + }, + { + "epoch": 1.9468595673645406, + "grad_norm": 2.5221506980800497, + "learning_rate": 3.307212792027311e-06, + "loss": 0.7824, + "step": 27990 + }, + { + "epoch": 1.9475551227655283, + "grad_norm": 2.0186710280302056, + "learning_rate": 3.303405648960062e-06, + "loss": 0.7284, + "step": 28000 + }, + { + "epoch": 1.9475551227655283, + "eval_loss": 0.934053361415863, + "eval_runtime": 1319.4133, + "eval_samples_per_second": 13.764, + "eval_steps_per_second": 2.294, + "step": 28000 + } + ], + "logging_steps": 10, + "max_steps": 43131, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3243481757450240.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}