{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.500428728920828, "eval_steps": 766, "global_step": 1532, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00032665060634518803, "grad_norm": 0.22650057077407837, "learning_rate": 2.0000000000000003e-06, "loss": 0.8308, "step": 1 }, { "epoch": 0.00032665060634518803, "eval_loss": 1.508634090423584, "eval_runtime": 502.8072, "eval_samples_per_second": 5.127, "eval_steps_per_second": 2.564, "step": 1 }, { "epoch": 0.0006533012126903761, "grad_norm": 0.2044776827096939, "learning_rate": 4.000000000000001e-06, "loss": 0.9804, "step": 2 }, { "epoch": 0.0009799518190355642, "grad_norm": 0.2313065379858017, "learning_rate": 6e-06, "loss": 1.067, "step": 3 }, { "epoch": 0.0013066024253807521, "grad_norm": 0.26630067825317383, "learning_rate": 8.000000000000001e-06, "loss": 1.0556, "step": 4 }, { "epoch": 0.0016332530317259401, "grad_norm": 0.2948451340198517, "learning_rate": 1e-05, "loss": 1.117, "step": 5 }, { "epoch": 0.0019599036380711283, "grad_norm": 0.2725197970867157, "learning_rate": 1.2e-05, "loss": 1.1418, "step": 6 }, { "epoch": 0.0022865542444163163, "grad_norm": 0.29806721210479736, "learning_rate": 1.4000000000000001e-05, "loss": 1.0675, "step": 7 }, { "epoch": 0.0026132048507615043, "grad_norm": 0.2886989414691925, "learning_rate": 1.6000000000000003e-05, "loss": 1.1935, "step": 8 }, { "epoch": 0.0029398554571066922, "grad_norm": 0.2739482522010803, "learning_rate": 1.8e-05, "loss": 1.2054, "step": 9 }, { "epoch": 0.0032665060634518802, "grad_norm": 0.28947803378105164, "learning_rate": 2e-05, "loss": 1.1734, "step": 10 }, { "epoch": 0.003593156669797068, "grad_norm": 0.3251422941684723, "learning_rate": 2.2000000000000003e-05, "loss": 1.1786, "step": 11 }, { "epoch": 0.003919807276142257, "grad_norm": 0.2556726038455963, "learning_rate": 2.4e-05, "loss": 1.25, "step": 12 }, { "epoch": 0.004246457882487445, "grad_norm": 0.30588826537132263, "learning_rate": 2.6000000000000002e-05, "loss": 1.2101, "step": 13 }, { "epoch": 0.0045731084888326326, "grad_norm": 0.3118837773799896, "learning_rate": 2.8000000000000003e-05, "loss": 1.2211, "step": 14 }, { "epoch": 0.0048997590951778205, "grad_norm": 0.41697078943252563, "learning_rate": 3e-05, "loss": 1.4213, "step": 15 }, { "epoch": 0.0052264097015230085, "grad_norm": 0.354201078414917, "learning_rate": 3.2000000000000005e-05, "loss": 1.3762, "step": 16 }, { "epoch": 0.0055530603078681965, "grad_norm": 0.43570902943611145, "learning_rate": 3.4000000000000007e-05, "loss": 1.394, "step": 17 }, { "epoch": 0.0058797109142133845, "grad_norm": 0.4452773928642273, "learning_rate": 3.6e-05, "loss": 1.4338, "step": 18 }, { "epoch": 0.0062063615205585725, "grad_norm": 0.3898261487483978, "learning_rate": 3.8e-05, "loss": 1.4906, "step": 19 }, { "epoch": 0.0065330121269037604, "grad_norm": 0.4592307507991791, "learning_rate": 4e-05, "loss": 1.752, "step": 20 }, { "epoch": 0.006859662733248948, "grad_norm": 0.590508759021759, "learning_rate": 4.2e-05, "loss": 1.7154, "step": 21 }, { "epoch": 0.007186313339594136, "grad_norm": 0.6255390048027039, "learning_rate": 4.4000000000000006e-05, "loss": 1.8005, "step": 22 }, { "epoch": 0.007512963945939324, "grad_norm": 0.7723678946495056, "learning_rate": 4.600000000000001e-05, "loss": 2.1126, "step": 23 }, { "epoch": 0.007839614552284513, "grad_norm": 1.2195842266082764, "learning_rate": 4.8e-05, "loss": 2.4707, "step": 24 }, { "epoch": 0.0081662651586297, "grad_norm": 1.7578188180923462, "learning_rate": 5e-05, "loss": 3.5729, "step": 25 }, { "epoch": 0.00849291576497489, "grad_norm": 0.24383209645748138, "learning_rate": 5.2000000000000004e-05, "loss": 0.8761, "step": 26 }, { "epoch": 0.008819566371320076, "grad_norm": 0.34012460708618164, "learning_rate": 5.4000000000000005e-05, "loss": 0.9546, "step": 27 }, { "epoch": 0.009146216977665265, "grad_norm": 0.36797860264778137, "learning_rate": 5.6000000000000006e-05, "loss": 1.0231, "step": 28 }, { "epoch": 0.009472867584010452, "grad_norm": 0.35745948553085327, "learning_rate": 5.8e-05, "loss": 0.9924, "step": 29 }, { "epoch": 0.009799518190355641, "grad_norm": 0.3466523587703705, "learning_rate": 6e-05, "loss": 1.0294, "step": 30 }, { "epoch": 0.010126168796700828, "grad_norm": 0.3309411406517029, "learning_rate": 6.2e-05, "loss": 1.0698, "step": 31 }, { "epoch": 0.010452819403046017, "grad_norm": 0.3155065178871155, "learning_rate": 6.400000000000001e-05, "loss": 1.0731, "step": 32 }, { "epoch": 0.010779470009391204, "grad_norm": 0.2679576277732849, "learning_rate": 6.6e-05, "loss": 1.0446, "step": 33 }, { "epoch": 0.011106120615736393, "grad_norm": 0.2870541214942932, "learning_rate": 6.800000000000001e-05, "loss": 1.0705, "step": 34 }, { "epoch": 0.011432771222081582, "grad_norm": 0.25413063168525696, "learning_rate": 7e-05, "loss": 1.0629, "step": 35 }, { "epoch": 0.011759421828426769, "grad_norm": 0.2709527611732483, "learning_rate": 7.2e-05, "loss": 1.0573, "step": 36 }, { "epoch": 0.012086072434771958, "grad_norm": 0.3181741237640381, "learning_rate": 7.4e-05, "loss": 1.1551, "step": 37 }, { "epoch": 0.012412723041117145, "grad_norm": 0.32461702823638916, "learning_rate": 7.6e-05, "loss": 1.2136, "step": 38 }, { "epoch": 0.012739373647462334, "grad_norm": 0.3279399871826172, "learning_rate": 7.800000000000001e-05, "loss": 1.1586, "step": 39 }, { "epoch": 0.013066024253807521, "grad_norm": 0.4179019033908844, "learning_rate": 8e-05, "loss": 1.1029, "step": 40 }, { "epoch": 0.01339267486015271, "grad_norm": 0.3966725170612335, "learning_rate": 8.2e-05, "loss": 1.1146, "step": 41 }, { "epoch": 0.013719325466497897, "grad_norm": 0.40615811944007874, "learning_rate": 8.4e-05, "loss": 1.2046, "step": 42 }, { "epoch": 0.014045976072843086, "grad_norm": 0.47675764560699463, "learning_rate": 8.6e-05, "loss": 1.3448, "step": 43 }, { "epoch": 0.014372626679188273, "grad_norm": 0.5963281393051147, "learning_rate": 8.800000000000001e-05, "loss": 1.4645, "step": 44 }, { "epoch": 0.014699277285533462, "grad_norm": 0.6444793343544006, "learning_rate": 9e-05, "loss": 1.5156, "step": 45 }, { "epoch": 0.015025927891878649, "grad_norm": 0.946388304233551, "learning_rate": 9.200000000000001e-05, "loss": 1.6008, "step": 46 }, { "epoch": 0.015352578498223838, "grad_norm": 1.00883150100708, "learning_rate": 9.4e-05, "loss": 1.3275, "step": 47 }, { "epoch": 0.015679229104569026, "grad_norm": 1.5137501955032349, "learning_rate": 9.6e-05, "loss": 1.7482, "step": 48 }, { "epoch": 0.016005879710914214, "grad_norm": 1.725042700767517, "learning_rate": 9.8e-05, "loss": 1.568, "step": 49 }, { "epoch": 0.0163325303172594, "grad_norm": 2.654313564300537, "learning_rate": 0.0001, "loss": 2.1327, "step": 50 }, { "epoch": 0.016659180923604588, "grad_norm": 0.2734032869338989, "learning_rate": 9.999997278438182e-05, "loss": 0.8992, "step": 51 }, { "epoch": 0.01698583152994978, "grad_norm": 0.34941044449806213, "learning_rate": 9.999989113755686e-05, "loss": 0.8448, "step": 52 }, { "epoch": 0.017312482136294965, "grad_norm": 0.4039852023124695, "learning_rate": 9.999975505961402e-05, "loss": 1.0142, "step": 53 }, { "epoch": 0.017639132742640153, "grad_norm": 0.41006627678871155, "learning_rate": 9.999956455070144e-05, "loss": 0.9617, "step": 54 }, { "epoch": 0.017965783348985343, "grad_norm": 0.3158213198184967, "learning_rate": 9.99993196110265e-05, "loss": 1.0481, "step": 55 }, { "epoch": 0.01829243395533053, "grad_norm": 0.34790340065956116, "learning_rate": 9.99990202408559e-05, "loss": 0.9547, "step": 56 }, { "epoch": 0.018619084561675717, "grad_norm": 0.3128991723060608, "learning_rate": 9.999866644051546e-05, "loss": 0.9676, "step": 57 }, { "epoch": 0.018945735168020904, "grad_norm": 0.338254451751709, "learning_rate": 9.99982582103904e-05, "loss": 1.0016, "step": 58 }, { "epoch": 0.019272385774366095, "grad_norm": 0.30693137645721436, "learning_rate": 9.999779555092509e-05, "loss": 0.9375, "step": 59 }, { "epoch": 0.019599036380711282, "grad_norm": 0.35348108410835266, "learning_rate": 9.999727846262321e-05, "loss": 1.074, "step": 60 }, { "epoch": 0.01992568698705647, "grad_norm": 0.3581663966178894, "learning_rate": 9.999670694604768e-05, "loss": 1.0606, "step": 61 }, { "epoch": 0.020252337593401656, "grad_norm": 0.3733651638031006, "learning_rate": 9.999608100182066e-05, "loss": 1.1619, "step": 62 }, { "epoch": 0.020578988199746847, "grad_norm": 0.4465019702911377, "learning_rate": 9.999540063062356e-05, "loss": 1.1113, "step": 63 }, { "epoch": 0.020905638806092034, "grad_norm": 0.5543861985206604, "learning_rate": 9.999466583319708e-05, "loss": 1.1657, "step": 64 }, { "epoch": 0.02123228941243722, "grad_norm": 0.4180990755558014, "learning_rate": 9.99938766103411e-05, "loss": 1.1938, "step": 65 }, { "epoch": 0.02155894001878241, "grad_norm": 0.4393234848976135, "learning_rate": 9.99930329629148e-05, "loss": 1.1258, "step": 66 }, { "epoch": 0.0218855906251276, "grad_norm": 0.4633965790271759, "learning_rate": 9.999213489183659e-05, "loss": 1.2432, "step": 67 }, { "epoch": 0.022212241231472786, "grad_norm": 0.5040664672851562, "learning_rate": 9.999118239808416e-05, "loss": 1.1911, "step": 68 }, { "epoch": 0.022538891837817973, "grad_norm": 0.6548953652381897, "learning_rate": 9.99901754826944e-05, "loss": 1.3242, "step": 69 }, { "epoch": 0.022865542444163164, "grad_norm": 0.733322024345398, "learning_rate": 9.998911414676346e-05, "loss": 1.4367, "step": 70 }, { "epoch": 0.02319219305050835, "grad_norm": 1.140091896057129, "learning_rate": 9.998799839144675e-05, "loss": 1.745, "step": 71 }, { "epoch": 0.023518843656853538, "grad_norm": 1.2057875394821167, "learning_rate": 9.998682821795888e-05, "loss": 1.5357, "step": 72 }, { "epoch": 0.023845494263198725, "grad_norm": 1.2741526365280151, "learning_rate": 9.998560362757376e-05, "loss": 1.5184, "step": 73 }, { "epoch": 0.024172144869543916, "grad_norm": 1.6893465518951416, "learning_rate": 9.998432462162449e-05, "loss": 1.7909, "step": 74 }, { "epoch": 0.024498795475889103, "grad_norm": 2.0071094036102295, "learning_rate": 9.998299120150342e-05, "loss": 2.2062, "step": 75 }, { "epoch": 0.02482544608223429, "grad_norm": 0.21972136199474335, "learning_rate": 9.998160336866219e-05, "loss": 0.8062, "step": 76 }, { "epoch": 0.025152096688579477, "grad_norm": 0.3399101793766022, "learning_rate": 9.998016112461158e-05, "loss": 0.9169, "step": 77 }, { "epoch": 0.025478747294924668, "grad_norm": 0.3948090374469757, "learning_rate": 9.997866447092168e-05, "loss": 1.0111, "step": 78 }, { "epoch": 0.025805397901269855, "grad_norm": 0.3974277675151825, "learning_rate": 9.997711340922177e-05, "loss": 0.9985, "step": 79 }, { "epoch": 0.026132048507615042, "grad_norm": 0.3781171143054962, "learning_rate": 9.997550794120039e-05, "loss": 0.8534, "step": 80 }, { "epoch": 0.02645869911396023, "grad_norm": 0.44847583770751953, "learning_rate": 9.997384806860526e-05, "loss": 0.9546, "step": 81 }, { "epoch": 0.02678534972030542, "grad_norm": 0.33786600828170776, "learning_rate": 9.99721337932434e-05, "loss": 0.9573, "step": 82 }, { "epoch": 0.027112000326650607, "grad_norm": 0.30058935284614563, "learning_rate": 9.997036511698098e-05, "loss": 0.8942, "step": 83 }, { "epoch": 0.027438650932995794, "grad_norm": 0.2836489677429199, "learning_rate": 9.996854204174344e-05, "loss": 0.9423, "step": 84 }, { "epoch": 0.02776530153934098, "grad_norm": 0.29414936900138855, "learning_rate": 9.996666456951542e-05, "loss": 1.0213, "step": 85 }, { "epoch": 0.02809195214568617, "grad_norm": 0.2995252311229706, "learning_rate": 9.99647327023408e-05, "loss": 0.9666, "step": 86 }, { "epoch": 0.02841860275203136, "grad_norm": 0.34682995080947876, "learning_rate": 9.996274644232261e-05, "loss": 1.0678, "step": 87 }, { "epoch": 0.028745253358376546, "grad_norm": 0.3426355719566345, "learning_rate": 9.99607057916232e-05, "loss": 1.0813, "step": 88 }, { "epoch": 0.029071903964721736, "grad_norm": 0.3705107271671295, "learning_rate": 9.995861075246405e-05, "loss": 1.0771, "step": 89 }, { "epoch": 0.029398554571066923, "grad_norm": 0.37640541791915894, "learning_rate": 9.995646132712586e-05, "loss": 1.1776, "step": 90 }, { "epoch": 0.02972520517741211, "grad_norm": 0.4144461452960968, "learning_rate": 9.995425751794856e-05, "loss": 1.0535, "step": 91 }, { "epoch": 0.030051855783757297, "grad_norm": 0.5124989748001099, "learning_rate": 9.995199932733126e-05, "loss": 1.2394, "step": 92 }, { "epoch": 0.030378506390102488, "grad_norm": 0.4783899784088135, "learning_rate": 9.994968675773228e-05, "loss": 1.0975, "step": 93 }, { "epoch": 0.030705156996447675, "grad_norm": 0.568493127822876, "learning_rate": 9.994731981166918e-05, "loss": 1.1831, "step": 94 }, { "epoch": 0.031031807602792862, "grad_norm": 0.7573971748352051, "learning_rate": 9.994489849171863e-05, "loss": 1.5349, "step": 95 }, { "epoch": 0.03135845820913805, "grad_norm": 0.8267963528633118, "learning_rate": 9.994242280051656e-05, "loss": 1.4655, "step": 96 }, { "epoch": 0.03168510881548324, "grad_norm": 1.118879795074463, "learning_rate": 9.993989274075806e-05, "loss": 1.7716, "step": 97 }, { "epoch": 0.03201175942182843, "grad_norm": 1.2966365814208984, "learning_rate": 9.99373083151974e-05, "loss": 1.5928, "step": 98 }, { "epoch": 0.032338410028173614, "grad_norm": 1.382350206375122, "learning_rate": 9.99346695266481e-05, "loss": 1.5455, "step": 99 }, { "epoch": 0.0326650606345188, "grad_norm": 2.4098920822143555, "learning_rate": 9.993197637798277e-05, "loss": 2.2737, "step": 100 }, { "epoch": 0.03299171124086399, "grad_norm": 0.18814091384410858, "learning_rate": 9.992922887213324e-05, "loss": 0.7321, "step": 101 }, { "epoch": 0.033318361847209176, "grad_norm": 0.21184566617012024, "learning_rate": 9.992642701209051e-05, "loss": 0.8945, "step": 102 }, { "epoch": 0.03364501245355437, "grad_norm": 0.21299266815185547, "learning_rate": 9.992357080090479e-05, "loss": 0.9516, "step": 103 }, { "epoch": 0.03397166305989956, "grad_norm": 0.21715138852596283, "learning_rate": 9.992066024168539e-05, "loss": 0.9654, "step": 104 }, { "epoch": 0.034298313666244744, "grad_norm": 0.25743889808654785, "learning_rate": 9.991769533760082e-05, "loss": 0.8758, "step": 105 }, { "epoch": 0.03462496427258993, "grad_norm": 0.24404282867908478, "learning_rate": 9.991467609187875e-05, "loss": 0.9195, "step": 106 }, { "epoch": 0.03495161487893512, "grad_norm": 0.24752084910869598, "learning_rate": 9.9911602507806e-05, "loss": 0.8898, "step": 107 }, { "epoch": 0.035278265485280305, "grad_norm": 0.28402474522590637, "learning_rate": 9.990847458872857e-05, "loss": 0.9336, "step": 108 }, { "epoch": 0.03560491609162549, "grad_norm": 0.24856555461883545, "learning_rate": 9.990529233805157e-05, "loss": 0.9425, "step": 109 }, { "epoch": 0.035931566697970686, "grad_norm": 0.28930795192718506, "learning_rate": 9.990205575923927e-05, "loss": 0.9346, "step": 110 }, { "epoch": 0.03625821730431587, "grad_norm": 0.3242413401603699, "learning_rate": 9.989876485581513e-05, "loss": 0.8545, "step": 111 }, { "epoch": 0.03658486791066106, "grad_norm": 0.3469846844673157, "learning_rate": 9.989541963136166e-05, "loss": 0.9725, "step": 112 }, { "epoch": 0.03691151851700625, "grad_norm": 0.42651814222335815, "learning_rate": 9.98920200895206e-05, "loss": 1.136, "step": 113 }, { "epoch": 0.037238169123351435, "grad_norm": 0.4039931297302246, "learning_rate": 9.988856623399272e-05, "loss": 1.0276, "step": 114 }, { "epoch": 0.03756481972969662, "grad_norm": 0.46665310859680176, "learning_rate": 9.988505806853803e-05, "loss": 1.1116, "step": 115 }, { "epoch": 0.03789147033604181, "grad_norm": 0.46870648860931396, "learning_rate": 9.988149559697556e-05, "loss": 1.09, "step": 116 }, { "epoch": 0.038218120942386996, "grad_norm": 0.5010095238685608, "learning_rate": 9.987787882318353e-05, "loss": 1.2364, "step": 117 }, { "epoch": 0.03854477154873219, "grad_norm": 0.5289409756660461, "learning_rate": 9.987420775109926e-05, "loss": 1.1562, "step": 118 }, { "epoch": 0.03887142215507738, "grad_norm": 0.5613381862640381, "learning_rate": 9.987048238471913e-05, "loss": 1.3162, "step": 119 }, { "epoch": 0.039198072761422564, "grad_norm": 0.6171817779541016, "learning_rate": 9.98667027280987e-05, "loss": 1.2698, "step": 120 }, { "epoch": 0.03952472336776775, "grad_norm": 0.7703320384025574, "learning_rate": 9.986286878535258e-05, "loss": 1.4442, "step": 121 }, { "epoch": 0.03985137397411294, "grad_norm": 0.8612945675849915, "learning_rate": 9.98589805606545e-05, "loss": 1.4071, "step": 122 }, { "epoch": 0.040178024580458126, "grad_norm": 1.3705520629882812, "learning_rate": 9.985503805823729e-05, "loss": 1.4146, "step": 123 }, { "epoch": 0.04050467518680331, "grad_norm": 1.9134920835494995, "learning_rate": 9.985104128239284e-05, "loss": 2.0188, "step": 124 }, { "epoch": 0.04083132579314851, "grad_norm": 2.2865049839019775, "learning_rate": 9.984699023747215e-05, "loss": 1.9591, "step": 125 }, { "epoch": 0.041157976399493694, "grad_norm": 0.17751002311706543, "learning_rate": 9.984288492788527e-05, "loss": 0.8177, "step": 126 }, { "epoch": 0.04148462700583888, "grad_norm": 0.18720225989818573, "learning_rate": 9.983872535810137e-05, "loss": 0.796, "step": 127 }, { "epoch": 0.04181127761218407, "grad_norm": 0.2171001136302948, "learning_rate": 9.983451153264862e-05, "loss": 0.893, "step": 128 }, { "epoch": 0.042137928218529255, "grad_norm": 0.23457252979278564, "learning_rate": 9.983024345611434e-05, "loss": 0.985, "step": 129 }, { "epoch": 0.04246457882487444, "grad_norm": 0.26218998432159424, "learning_rate": 9.982592113314484e-05, "loss": 0.9618, "step": 130 }, { "epoch": 0.04279122943121963, "grad_norm": 0.23409634828567505, "learning_rate": 9.98215445684455e-05, "loss": 0.8384, "step": 131 }, { "epoch": 0.04311788003756482, "grad_norm": 0.25260603427886963, "learning_rate": 9.981711376678077e-05, "loss": 0.9468, "step": 132 }, { "epoch": 0.04344453064391001, "grad_norm": 0.25879567861557007, "learning_rate": 9.981262873297412e-05, "loss": 0.9695, "step": 133 }, { "epoch": 0.0437711812502552, "grad_norm": 0.27561235427856445, "learning_rate": 9.980808947190809e-05, "loss": 0.9748, "step": 134 }, { "epoch": 0.044097831856600385, "grad_norm": 0.2770557701587677, "learning_rate": 9.98034959885242e-05, "loss": 0.9308, "step": 135 }, { "epoch": 0.04442448246294557, "grad_norm": 0.31946492195129395, "learning_rate": 9.979884828782305e-05, "loss": 1.0077, "step": 136 }, { "epoch": 0.04475113306929076, "grad_norm": 0.3412545621395111, "learning_rate": 9.979414637486424e-05, "loss": 1.0537, "step": 137 }, { "epoch": 0.045077783675635946, "grad_norm": 0.3484194874763489, "learning_rate": 9.978939025476639e-05, "loss": 1.0126, "step": 138 }, { "epoch": 0.04540443428198113, "grad_norm": 0.36984875798225403, "learning_rate": 9.978457993270713e-05, "loss": 1.0589, "step": 139 }, { "epoch": 0.04573108488832633, "grad_norm": 0.46515074372291565, "learning_rate": 9.97797154139231e-05, "loss": 1.2081, "step": 140 }, { "epoch": 0.046057735494671515, "grad_norm": 0.42576462030410767, "learning_rate": 9.97747967037099e-05, "loss": 1.1499, "step": 141 }, { "epoch": 0.0463843861010167, "grad_norm": 0.47039994597435, "learning_rate": 9.976982380742221e-05, "loss": 1.1558, "step": 142 }, { "epoch": 0.04671103670736189, "grad_norm": 0.49022629857063293, "learning_rate": 9.976479673047363e-05, "loss": 1.1308, "step": 143 }, { "epoch": 0.047037687313707076, "grad_norm": 0.6336291432380676, "learning_rate": 9.975971547833674e-05, "loss": 1.4362, "step": 144 }, { "epoch": 0.04736433792005226, "grad_norm": 0.8383249640464783, "learning_rate": 9.975458005654314e-05, "loss": 1.5723, "step": 145 }, { "epoch": 0.04769098852639745, "grad_norm": 0.9654211401939392, "learning_rate": 9.974939047068337e-05, "loss": 1.4471, "step": 146 }, { "epoch": 0.04801763913274264, "grad_norm": 1.1386771202087402, "learning_rate": 9.974414672640693e-05, "loss": 1.5832, "step": 147 }, { "epoch": 0.04834428973908783, "grad_norm": 1.7021054029464722, "learning_rate": 9.973884882942232e-05, "loss": 2.0502, "step": 148 }, { "epoch": 0.04867094034543302, "grad_norm": 1.9076801538467407, "learning_rate": 9.973349678549692e-05, "loss": 2.0686, "step": 149 }, { "epoch": 0.048997590951778205, "grad_norm": 1.787797451019287, "learning_rate": 9.972809060045714e-05, "loss": 2.0867, "step": 150 }, { "epoch": 0.04932424155812339, "grad_norm": 0.1957893669605255, "learning_rate": 9.972263028018826e-05, "loss": 0.7799, "step": 151 }, { "epoch": 0.04965089216446858, "grad_norm": 0.2222263514995575, "learning_rate": 9.971711583063452e-05, "loss": 0.887, "step": 152 }, { "epoch": 0.04997754277081377, "grad_norm": 0.22523343563079834, "learning_rate": 9.97115472577991e-05, "loss": 0.941, "step": 153 }, { "epoch": 0.050304193377158954, "grad_norm": 0.23509541153907776, "learning_rate": 9.970592456774408e-05, "loss": 0.9283, "step": 154 }, { "epoch": 0.05063084398350414, "grad_norm": 0.23556609451770782, "learning_rate": 9.970024776659046e-05, "loss": 0.9119, "step": 155 }, { "epoch": 0.050957494589849335, "grad_norm": 0.2595606744289398, "learning_rate": 9.969451686051814e-05, "loss": 0.9602, "step": 156 }, { "epoch": 0.05128414519619452, "grad_norm": 0.2726079523563385, "learning_rate": 9.968873185576593e-05, "loss": 0.9543, "step": 157 }, { "epoch": 0.05161079580253971, "grad_norm": 0.27437275648117065, "learning_rate": 9.968289275863152e-05, "loss": 0.985, "step": 158 }, { "epoch": 0.051937446408884896, "grad_norm": 0.2603570222854614, "learning_rate": 9.967699957547152e-05, "loss": 0.8353, "step": 159 }, { "epoch": 0.052264097015230083, "grad_norm": 0.28538116812705994, "learning_rate": 9.967105231270137e-05, "loss": 0.982, "step": 160 }, { "epoch": 0.05259074762157527, "grad_norm": 0.28932714462280273, "learning_rate": 9.966505097679542e-05, "loss": 0.8386, "step": 161 }, { "epoch": 0.05291739822792046, "grad_norm": 0.31023868918418884, "learning_rate": 9.965899557428686e-05, "loss": 0.9743, "step": 162 }, { "epoch": 0.05324404883426565, "grad_norm": 0.3417147696018219, "learning_rate": 9.965288611176777e-05, "loss": 0.998, "step": 163 }, { "epoch": 0.05357069944061084, "grad_norm": 0.35479384660720825, "learning_rate": 9.964672259588905e-05, "loss": 1.0415, "step": 164 }, { "epoch": 0.053897350046956026, "grad_norm": 0.3958076238632202, "learning_rate": 9.964050503336047e-05, "loss": 0.9943, "step": 165 }, { "epoch": 0.05422400065330121, "grad_norm": 0.4142689108848572, "learning_rate": 9.96342334309506e-05, "loss": 1.1481, "step": 166 }, { "epoch": 0.0545506512596464, "grad_norm": 0.46137475967407227, "learning_rate": 9.962790779548688e-05, "loss": 1.2285, "step": 167 }, { "epoch": 0.05487730186599159, "grad_norm": 0.4913260340690613, "learning_rate": 9.962152813385554e-05, "loss": 1.2842, "step": 168 }, { "epoch": 0.055203952472336774, "grad_norm": 0.5377975106239319, "learning_rate": 9.961509445300163e-05, "loss": 1.1915, "step": 169 }, { "epoch": 0.05553060307868196, "grad_norm": 0.607846200466156, "learning_rate": 9.960860675992904e-05, "loss": 1.1763, "step": 170 }, { "epoch": 0.055857253685027156, "grad_norm": 0.6850741505622864, "learning_rate": 9.960206506170042e-05, "loss": 1.3445, "step": 171 }, { "epoch": 0.05618390429137234, "grad_norm": 0.849875807762146, "learning_rate": 9.959546936543722e-05, "loss": 1.5409, "step": 172 }, { "epoch": 0.05651055489771753, "grad_norm": 0.9521230459213257, "learning_rate": 9.95888196783197e-05, "loss": 1.4537, "step": 173 }, { "epoch": 0.05683720550406272, "grad_norm": 1.3079942464828491, "learning_rate": 9.958211600758683e-05, "loss": 1.6594, "step": 174 }, { "epoch": 0.057163856110407904, "grad_norm": 2.0800368785858154, "learning_rate": 9.957535836053644e-05, "loss": 2.1626, "step": 175 }, { "epoch": 0.05749050671675309, "grad_norm": 0.234994575381279, "learning_rate": 9.956854674452504e-05, "loss": 0.8112, "step": 176 }, { "epoch": 0.05781715732309828, "grad_norm": 0.2850241959095001, "learning_rate": 9.956168116696794e-05, "loss": 0.9069, "step": 177 }, { "epoch": 0.05814380792944347, "grad_norm": 0.2719287574291229, "learning_rate": 9.955476163533915e-05, "loss": 0.8626, "step": 178 }, { "epoch": 0.05847045853578866, "grad_norm": 0.2707035541534424, "learning_rate": 9.954778815717147e-05, "loss": 0.8741, "step": 179 }, { "epoch": 0.05879710914213385, "grad_norm": 0.2905394732952118, "learning_rate": 9.954076074005641e-05, "loss": 0.8914, "step": 180 }, { "epoch": 0.059123759748479034, "grad_norm": 0.29349029064178467, "learning_rate": 9.953367939164418e-05, "loss": 0.9079, "step": 181 }, { "epoch": 0.05945041035482422, "grad_norm": 0.26738202571868896, "learning_rate": 9.952654411964368e-05, "loss": 0.9479, "step": 182 }, { "epoch": 0.05977706096116941, "grad_norm": 0.2822587192058563, "learning_rate": 9.951935493182259e-05, "loss": 0.9442, "step": 183 }, { "epoch": 0.060103711567514595, "grad_norm": 0.2845873236656189, "learning_rate": 9.95121118360072e-05, "loss": 0.9895, "step": 184 }, { "epoch": 0.06043036217385978, "grad_norm": 0.2944308817386627, "learning_rate": 9.950481484008256e-05, "loss": 0.9662, "step": 185 }, { "epoch": 0.060757012780204976, "grad_norm": 0.3118637204170227, "learning_rate": 9.949746395199233e-05, "loss": 0.9222, "step": 186 }, { "epoch": 0.06108366338655016, "grad_norm": 0.32590070366859436, "learning_rate": 9.949005917973888e-05, "loss": 0.9805, "step": 187 }, { "epoch": 0.06141031399289535, "grad_norm": 0.32632124423980713, "learning_rate": 9.948260053138323e-05, "loss": 0.939, "step": 188 }, { "epoch": 0.06173696459924054, "grad_norm": 0.38124093413352966, "learning_rate": 9.947508801504503e-05, "loss": 1.0358, "step": 189 }, { "epoch": 0.062063615205585725, "grad_norm": 0.39029499888420105, "learning_rate": 9.946752163890263e-05, "loss": 1.0518, "step": 190 }, { "epoch": 0.06239026581193091, "grad_norm": 0.38209250569343567, "learning_rate": 9.945990141119295e-05, "loss": 1.0458, "step": 191 }, { "epoch": 0.0627169164182761, "grad_norm": 0.4655872583389282, "learning_rate": 9.945222734021154e-05, "loss": 1.1645, "step": 192 }, { "epoch": 0.06304356702462129, "grad_norm": 0.4615199863910675, "learning_rate": 9.944449943431262e-05, "loss": 1.0565, "step": 193 }, { "epoch": 0.06337021763096648, "grad_norm": 0.48887377977371216, "learning_rate": 9.943671770190896e-05, "loss": 1.1269, "step": 194 }, { "epoch": 0.06369686823731166, "grad_norm": 0.5362581014633179, "learning_rate": 9.942888215147193e-05, "loss": 1.2089, "step": 195 }, { "epoch": 0.06402351884365685, "grad_norm": 0.633625864982605, "learning_rate": 9.942099279153154e-05, "loss": 1.2551, "step": 196 }, { "epoch": 0.06435016945000205, "grad_norm": 0.7443326711654663, "learning_rate": 9.941304963067632e-05, "loss": 1.24, "step": 197 }, { "epoch": 0.06467682005634723, "grad_norm": 1.032091736793518, "learning_rate": 9.940505267755341e-05, "loss": 1.3817, "step": 198 }, { "epoch": 0.06500347066269242, "grad_norm": 1.4411945343017578, "learning_rate": 9.939700194086847e-05, "loss": 1.5045, "step": 199 }, { "epoch": 0.0653301212690376, "grad_norm": 2.1002960205078125, "learning_rate": 9.938889742938575e-05, "loss": 1.7348, "step": 200 }, { "epoch": 0.0656567718753828, "grad_norm": 0.2001526653766632, "learning_rate": 9.938073915192798e-05, "loss": 0.7617, "step": 201 }, { "epoch": 0.06598342248172798, "grad_norm": 0.22948387265205383, "learning_rate": 9.937252711737652e-05, "loss": 0.815, "step": 202 }, { "epoch": 0.06631007308807317, "grad_norm": 0.2272295504808426, "learning_rate": 9.936426133467115e-05, "loss": 0.8564, "step": 203 }, { "epoch": 0.06663672369441835, "grad_norm": 0.25728172063827515, "learning_rate": 9.935594181281022e-05, "loss": 0.9621, "step": 204 }, { "epoch": 0.06696337430076355, "grad_norm": 0.2590310275554657, "learning_rate": 9.934756856085059e-05, "loss": 0.9696, "step": 205 }, { "epoch": 0.06729002490710874, "grad_norm": 0.2755409777164459, "learning_rate": 9.933914158790756e-05, "loss": 0.9406, "step": 206 }, { "epoch": 0.06761667551345392, "grad_norm": 0.2778630256652832, "learning_rate": 9.933066090315494e-05, "loss": 0.9644, "step": 207 }, { "epoch": 0.06794332611979911, "grad_norm": 0.29143768548965454, "learning_rate": 9.932212651582502e-05, "loss": 0.9866, "step": 208 }, { "epoch": 0.0682699767261443, "grad_norm": 0.28746140003204346, "learning_rate": 9.931353843520856e-05, "loss": 0.9637, "step": 209 }, { "epoch": 0.06859662733248949, "grad_norm": 0.3338890075683594, "learning_rate": 9.930489667065474e-05, "loss": 0.9844, "step": 210 }, { "epoch": 0.06892327793883467, "grad_norm": 0.32576242089271545, "learning_rate": 9.929620123157121e-05, "loss": 1.0607, "step": 211 }, { "epoch": 0.06924992854517986, "grad_norm": 0.3407037556171417, "learning_rate": 9.928745212742403e-05, "loss": 1.0002, "step": 212 }, { "epoch": 0.06957657915152506, "grad_norm": 0.35422274470329285, "learning_rate": 9.927864936773769e-05, "loss": 0.962, "step": 213 }, { "epoch": 0.06990322975787024, "grad_norm": 0.42136797308921814, "learning_rate": 9.926979296209509e-05, "loss": 1.1253, "step": 214 }, { "epoch": 0.07022988036421543, "grad_norm": 0.4093579649925232, "learning_rate": 9.926088292013755e-05, "loss": 1.0692, "step": 215 }, { "epoch": 0.07055653097056061, "grad_norm": 0.48680227994918823, "learning_rate": 9.925191925156474e-05, "loss": 1.1906, "step": 216 }, { "epoch": 0.0708831815769058, "grad_norm": 0.4694482088088989, "learning_rate": 9.924290196613475e-05, "loss": 1.1077, "step": 217 }, { "epoch": 0.07120983218325098, "grad_norm": 0.485873818397522, "learning_rate": 9.923383107366402e-05, "loss": 1.1221, "step": 218 }, { "epoch": 0.07153648278959618, "grad_norm": 0.5874084830284119, "learning_rate": 9.922470658402731e-05, "loss": 1.2042, "step": 219 }, { "epoch": 0.07186313339594137, "grad_norm": 0.682409405708313, "learning_rate": 9.921552850715783e-05, "loss": 1.337, "step": 220 }, { "epoch": 0.07218978400228655, "grad_norm": 0.8160572052001953, "learning_rate": 9.920629685304701e-05, "loss": 1.5398, "step": 221 }, { "epoch": 0.07251643460863175, "grad_norm": 0.9582510590553284, "learning_rate": 9.919701163174466e-05, "loss": 1.3404, "step": 222 }, { "epoch": 0.07284308521497693, "grad_norm": 1.3406293392181396, "learning_rate": 9.918767285335892e-05, "loss": 1.6518, "step": 223 }, { "epoch": 0.07316973582132212, "grad_norm": 1.5214787721633911, "learning_rate": 9.917828052805622e-05, "loss": 1.8417, "step": 224 }, { "epoch": 0.0734963864276673, "grad_norm": 2.5116734504699707, "learning_rate": 9.916883466606127e-05, "loss": 2.2562, "step": 225 }, { "epoch": 0.0738230370340125, "grad_norm": 0.19113443791866302, "learning_rate": 9.915933527765707e-05, "loss": 0.8004, "step": 226 }, { "epoch": 0.07414968764035769, "grad_norm": 0.20771871507167816, "learning_rate": 9.914978237318487e-05, "loss": 0.8554, "step": 227 }, { "epoch": 0.07447633824670287, "grad_norm": 0.21872685849666595, "learning_rate": 9.914017596304421e-05, "loss": 0.9158, "step": 228 }, { "epoch": 0.07480298885304806, "grad_norm": 0.23601579666137695, "learning_rate": 9.913051605769288e-05, "loss": 0.8477, "step": 229 }, { "epoch": 0.07512963945939324, "grad_norm": 0.24787437915802002, "learning_rate": 9.912080266764687e-05, "loss": 0.8886, "step": 230 }, { "epoch": 0.07545629006573844, "grad_norm": 0.24604691565036774, "learning_rate": 9.911103580348044e-05, "loss": 0.9259, "step": 231 }, { "epoch": 0.07578294067208362, "grad_norm": 0.2649064362049103, "learning_rate": 9.910121547582601e-05, "loss": 0.9131, "step": 232 }, { "epoch": 0.07610959127842881, "grad_norm": 0.2932969033718109, "learning_rate": 9.909134169537426e-05, "loss": 0.8468, "step": 233 }, { "epoch": 0.07643624188477399, "grad_norm": 0.27797630429267883, "learning_rate": 9.908141447287403e-05, "loss": 0.9126, "step": 234 }, { "epoch": 0.07676289249111919, "grad_norm": 0.29813864827156067, "learning_rate": 9.907143381913231e-05, "loss": 0.8744, "step": 235 }, { "epoch": 0.07708954309746438, "grad_norm": 0.31166505813598633, "learning_rate": 9.906139974501432e-05, "loss": 0.9738, "step": 236 }, { "epoch": 0.07741619370380956, "grad_norm": 0.3513356149196625, "learning_rate": 9.905131226144337e-05, "loss": 0.9285, "step": 237 }, { "epoch": 0.07774284431015475, "grad_norm": 0.356487512588501, "learning_rate": 9.904117137940099e-05, "loss": 0.9841, "step": 238 }, { "epoch": 0.07806949491649993, "grad_norm": 0.3558920621871948, "learning_rate": 9.903097710992675e-05, "loss": 1.0901, "step": 239 }, { "epoch": 0.07839614552284513, "grad_norm": 0.4005250036716461, "learning_rate": 9.90207294641184e-05, "loss": 1.1738, "step": 240 }, { "epoch": 0.07872279612919031, "grad_norm": 0.4491129517555237, "learning_rate": 9.901042845313178e-05, "loss": 1.17, "step": 241 }, { "epoch": 0.0790494467355355, "grad_norm": 0.4596916139125824, "learning_rate": 9.900007408818082e-05, "loss": 1.1344, "step": 242 }, { "epoch": 0.0793760973418807, "grad_norm": 0.48606646060943604, "learning_rate": 9.898966638053755e-05, "loss": 1.0872, "step": 243 }, { "epoch": 0.07970274794822588, "grad_norm": 0.5776427388191223, "learning_rate": 9.897920534153207e-05, "loss": 1.2504, "step": 244 }, { "epoch": 0.08002939855457107, "grad_norm": 0.5997623801231384, "learning_rate": 9.896869098255249e-05, "loss": 1.1232, "step": 245 }, { "epoch": 0.08035604916091625, "grad_norm": 0.7988643646240234, "learning_rate": 9.895812331504502e-05, "loss": 1.4547, "step": 246 }, { "epoch": 0.08068269976726145, "grad_norm": 0.9789859056472778, "learning_rate": 9.894750235051389e-05, "loss": 1.4768, "step": 247 }, { "epoch": 0.08100935037360663, "grad_norm": 1.010258674621582, "learning_rate": 9.893682810052132e-05, "loss": 1.8305, "step": 248 }, { "epoch": 0.08133600097995182, "grad_norm": 1.1983157396316528, "learning_rate": 9.89261005766876e-05, "loss": 1.5781, "step": 249 }, { "epoch": 0.08166265158629701, "grad_norm": 1.4921436309814453, "learning_rate": 9.891531979069096e-05, "loss": 2.0553, "step": 250 }, { "epoch": 0.0819893021926422, "grad_norm": 0.21676473319530487, "learning_rate": 9.890448575426761e-05, "loss": 0.6872, "step": 251 }, { "epoch": 0.08231595279898739, "grad_norm": 0.22214944660663605, "learning_rate": 9.889359847921176e-05, "loss": 0.7853, "step": 252 }, { "epoch": 0.08264260340533257, "grad_norm": 0.2550159990787506, "learning_rate": 9.888265797737561e-05, "loss": 0.8511, "step": 253 }, { "epoch": 0.08296925401167776, "grad_norm": 0.2533634603023529, "learning_rate": 9.887166426066921e-05, "loss": 0.8282, "step": 254 }, { "epoch": 0.08329590461802294, "grad_norm": 0.29216915369033813, "learning_rate": 9.886061734106061e-05, "loss": 0.9312, "step": 255 }, { "epoch": 0.08362255522436814, "grad_norm": 0.2742856442928314, "learning_rate": 9.884951723057574e-05, "loss": 0.9006, "step": 256 }, { "epoch": 0.08394920583071332, "grad_norm": 0.29381704330444336, "learning_rate": 9.883836394129849e-05, "loss": 0.9064, "step": 257 }, { "epoch": 0.08427585643705851, "grad_norm": 0.2808193862438202, "learning_rate": 9.882715748537056e-05, "loss": 0.9602, "step": 258 }, { "epoch": 0.0846025070434037, "grad_norm": 0.3137573003768921, "learning_rate": 9.881589787499164e-05, "loss": 0.9881, "step": 259 }, { "epoch": 0.08492915764974888, "grad_norm": 0.32701578736305237, "learning_rate": 9.880458512241917e-05, "loss": 0.9211, "step": 260 }, { "epoch": 0.08525580825609408, "grad_norm": 0.34949010610580444, "learning_rate": 9.879321923996852e-05, "loss": 1.0025, "step": 261 }, { "epoch": 0.08558245886243926, "grad_norm": 0.3389429450035095, "learning_rate": 9.878180024001283e-05, "loss": 0.956, "step": 262 }, { "epoch": 0.08590910946878445, "grad_norm": 0.41225770115852356, "learning_rate": 9.877032813498315e-05, "loss": 1.1321, "step": 263 }, { "epoch": 0.08623576007512963, "grad_norm": 0.39792096614837646, "learning_rate": 9.875880293736828e-05, "loss": 1.0584, "step": 264 }, { "epoch": 0.08656241068147483, "grad_norm": 0.447539359331131, "learning_rate": 9.874722465971483e-05, "loss": 1.1351, "step": 265 }, { "epoch": 0.08688906128782002, "grad_norm": 0.4593818783760071, "learning_rate": 9.87355933146272e-05, "loss": 1.1266, "step": 266 }, { "epoch": 0.0872157118941652, "grad_norm": 0.554883599281311, "learning_rate": 9.872390891476757e-05, "loss": 1.1753, "step": 267 }, { "epoch": 0.0875423625005104, "grad_norm": 0.5333263874053955, "learning_rate": 9.871217147285588e-05, "loss": 1.1131, "step": 268 }, { "epoch": 0.08786901310685558, "grad_norm": 0.7088521122932434, "learning_rate": 9.870038100166973e-05, "loss": 1.3864, "step": 269 }, { "epoch": 0.08819566371320077, "grad_norm": 0.7951180934906006, "learning_rate": 9.868853751404461e-05, "loss": 1.3547, "step": 270 }, { "epoch": 0.08852231431954595, "grad_norm": 0.9721757173538208, "learning_rate": 9.867664102287359e-05, "loss": 1.6935, "step": 271 }, { "epoch": 0.08884896492589114, "grad_norm": 1.0832263231277466, "learning_rate": 9.866469154110748e-05, "loss": 1.3659, "step": 272 }, { "epoch": 0.08917561553223634, "grad_norm": 1.3910161256790161, "learning_rate": 9.86526890817548e-05, "loss": 1.5507, "step": 273 }, { "epoch": 0.08950226613858152, "grad_norm": 1.494617223739624, "learning_rate": 9.864063365788169e-05, "loss": 1.974, "step": 274 }, { "epoch": 0.08982891674492671, "grad_norm": 1.6420493125915527, "learning_rate": 9.862852528261202e-05, "loss": 1.6818, "step": 275 }, { "epoch": 0.09015556735127189, "grad_norm": 0.20619602501392365, "learning_rate": 9.861636396912724e-05, "loss": 0.7741, "step": 276 }, { "epoch": 0.09048221795761709, "grad_norm": 0.23640893399715424, "learning_rate": 9.860414973066647e-05, "loss": 0.8814, "step": 277 }, { "epoch": 0.09080886856396227, "grad_norm": 0.2566302716732025, "learning_rate": 9.859188258052644e-05, "loss": 0.9219, "step": 278 }, { "epoch": 0.09113551917030746, "grad_norm": 0.27351245284080505, "learning_rate": 9.857956253206144e-05, "loss": 0.9595, "step": 279 }, { "epoch": 0.09146216977665265, "grad_norm": 0.27393215894699097, "learning_rate": 9.856718959868343e-05, "loss": 0.8775, "step": 280 }, { "epoch": 0.09178882038299783, "grad_norm": 0.28952500224113464, "learning_rate": 9.855476379386186e-05, "loss": 0.907, "step": 281 }, { "epoch": 0.09211547098934303, "grad_norm": 0.27546462416648865, "learning_rate": 9.854228513112376e-05, "loss": 0.8872, "step": 282 }, { "epoch": 0.09244212159568821, "grad_norm": 0.30211374163627625, "learning_rate": 9.852975362405372e-05, "loss": 0.9283, "step": 283 }, { "epoch": 0.0927687722020334, "grad_norm": 0.32662448287010193, "learning_rate": 9.851716928629386e-05, "loss": 0.9332, "step": 284 }, { "epoch": 0.09309542280837858, "grad_norm": 0.32408496737480164, "learning_rate": 9.85045321315438e-05, "loss": 0.8625, "step": 285 }, { "epoch": 0.09342207341472378, "grad_norm": 0.31861940026283264, "learning_rate": 9.849184217356064e-05, "loss": 0.8849, "step": 286 }, { "epoch": 0.09374872402106896, "grad_norm": 0.34367239475250244, "learning_rate": 9.8479099426159e-05, "loss": 0.922, "step": 287 }, { "epoch": 0.09407537462741415, "grad_norm": 0.3816602826118469, "learning_rate": 9.846630390321095e-05, "loss": 1.0733, "step": 288 }, { "epoch": 0.09440202523375935, "grad_norm": 0.39302805066108704, "learning_rate": 9.845345561864599e-05, "loss": 1.0519, "step": 289 }, { "epoch": 0.09472867584010453, "grad_norm": 0.3958548903465271, "learning_rate": 9.844055458645109e-05, "loss": 1.1534, "step": 290 }, { "epoch": 0.09505532644644972, "grad_norm": 0.4945371150970459, "learning_rate": 9.842760082067067e-05, "loss": 1.0721, "step": 291 }, { "epoch": 0.0953819770527949, "grad_norm": 0.44846147298812866, "learning_rate": 9.841459433540646e-05, "loss": 0.9707, "step": 292 }, { "epoch": 0.0957086276591401, "grad_norm": 0.5135351419448853, "learning_rate": 9.84015351448177e-05, "loss": 1.0283, "step": 293 }, { "epoch": 0.09603527826548527, "grad_norm": 0.5842195153236389, "learning_rate": 9.838842326312089e-05, "loss": 1.0416, "step": 294 }, { "epoch": 0.09636192887183047, "grad_norm": 0.6268463730812073, "learning_rate": 9.837525870459e-05, "loss": 1.2132, "step": 295 }, { "epoch": 0.09668857947817566, "grad_norm": 0.7157014012336731, "learning_rate": 9.836204148355625e-05, "loss": 1.1914, "step": 296 }, { "epoch": 0.09701523008452084, "grad_norm": 1.0113928318023682, "learning_rate": 9.834877161440825e-05, "loss": 1.5362, "step": 297 }, { "epoch": 0.09734188069086604, "grad_norm": 1.2888816595077515, "learning_rate": 9.833544911159194e-05, "loss": 1.7309, "step": 298 }, { "epoch": 0.09766853129721122, "grad_norm": 1.4757226705551147, "learning_rate": 9.832207398961047e-05, "loss": 1.6857, "step": 299 }, { "epoch": 0.09799518190355641, "grad_norm": 3.058228015899658, "learning_rate": 9.830864626302439e-05, "loss": 1.9936, "step": 300 }, { "epoch": 0.09832183250990159, "grad_norm": 0.20743629336357117, "learning_rate": 9.82951659464514e-05, "loss": 0.8756, "step": 301 }, { "epoch": 0.09864848311624679, "grad_norm": 0.21614302694797516, "learning_rate": 9.828163305456652e-05, "loss": 0.8775, "step": 302 }, { "epoch": 0.09897513372259198, "grad_norm": 0.21997962892055511, "learning_rate": 9.826804760210202e-05, "loss": 0.8532, "step": 303 }, { "epoch": 0.09930178432893716, "grad_norm": 0.244362011551857, "learning_rate": 9.825440960384733e-05, "loss": 0.9222, "step": 304 }, { "epoch": 0.09962843493528235, "grad_norm": 0.23313583433628082, "learning_rate": 9.824071907464912e-05, "loss": 0.8339, "step": 305 }, { "epoch": 0.09995508554162753, "grad_norm": 0.2632003128528595, "learning_rate": 9.822697602941123e-05, "loss": 0.9296, "step": 306 }, { "epoch": 0.10028173614797273, "grad_norm": 0.2504887580871582, "learning_rate": 9.821318048309469e-05, "loss": 0.9384, "step": 307 }, { "epoch": 0.10060838675431791, "grad_norm": 0.2540770471096039, "learning_rate": 9.819933245071768e-05, "loss": 0.9079, "step": 308 }, { "epoch": 0.1009350373606631, "grad_norm": 0.25880515575408936, "learning_rate": 9.81854319473555e-05, "loss": 0.8531, "step": 309 }, { "epoch": 0.10126168796700828, "grad_norm": 0.29444649815559387, "learning_rate": 9.817147898814059e-05, "loss": 0.9213, "step": 310 }, { "epoch": 0.10158833857335348, "grad_norm": 0.31520259380340576, "learning_rate": 9.815747358826247e-05, "loss": 0.9258, "step": 311 }, { "epoch": 0.10191498917969867, "grad_norm": 0.3317732810974121, "learning_rate": 9.814341576296777e-05, "loss": 1.0338, "step": 312 }, { "epoch": 0.10224163978604385, "grad_norm": 0.3525193929672241, "learning_rate": 9.812930552756018e-05, "loss": 0.9796, "step": 313 }, { "epoch": 0.10256829039238904, "grad_norm": 0.3526162803173065, "learning_rate": 9.811514289740047e-05, "loss": 0.9216, "step": 314 }, { "epoch": 0.10289494099873422, "grad_norm": 0.38859978318214417, "learning_rate": 9.810092788790643e-05, "loss": 0.9836, "step": 315 }, { "epoch": 0.10322159160507942, "grad_norm": 0.4253045618534088, "learning_rate": 9.808666051455287e-05, "loss": 0.9547, "step": 316 }, { "epoch": 0.1035482422114246, "grad_norm": 0.46034926176071167, "learning_rate": 9.807234079287158e-05, "loss": 1.2128, "step": 317 }, { "epoch": 0.10387489281776979, "grad_norm": 0.5099804401397705, "learning_rate": 9.80579687384514e-05, "loss": 1.1683, "step": 318 }, { "epoch": 0.10420154342411499, "grad_norm": 0.5876259207725525, "learning_rate": 9.804354436693805e-05, "loss": 1.3587, "step": 319 }, { "epoch": 0.10452819403046017, "grad_norm": 0.596943199634552, "learning_rate": 9.80290676940343e-05, "loss": 1.2699, "step": 320 }, { "epoch": 0.10485484463680536, "grad_norm": 0.6765027046203613, "learning_rate": 9.801453873549983e-05, "loss": 1.3848, "step": 321 }, { "epoch": 0.10518149524315054, "grad_norm": 0.8787732124328613, "learning_rate": 9.799995750715118e-05, "loss": 1.3471, "step": 322 }, { "epoch": 0.10550814584949574, "grad_norm": 0.9832908511161804, "learning_rate": 9.798532402486186e-05, "loss": 1.2418, "step": 323 }, { "epoch": 0.10583479645584092, "grad_norm": 1.10990309715271, "learning_rate": 9.797063830456224e-05, "loss": 1.2758, "step": 324 }, { "epoch": 0.10616144706218611, "grad_norm": 1.856461763381958, "learning_rate": 9.795590036223955e-05, "loss": 2.2217, "step": 325 }, { "epoch": 0.1064880976685313, "grad_norm": 0.22293882071971893, "learning_rate": 9.794111021393789e-05, "loss": 0.7823, "step": 326 }, { "epoch": 0.10681474827487648, "grad_norm": 0.2518823444843292, "learning_rate": 9.792626787575817e-05, "loss": 0.8259, "step": 327 }, { "epoch": 0.10714139888122168, "grad_norm": 0.23686620593070984, "learning_rate": 9.791137336385812e-05, "loss": 0.8839, "step": 328 }, { "epoch": 0.10746804948756686, "grad_norm": 0.28266048431396484, "learning_rate": 9.789642669445227e-05, "loss": 0.9046, "step": 329 }, { "epoch": 0.10779470009391205, "grad_norm": 0.2630056142807007, "learning_rate": 9.788142788381197e-05, "loss": 0.9035, "step": 330 }, { "epoch": 0.10812135070025723, "grad_norm": 0.29435524344444275, "learning_rate": 9.786637694826527e-05, "loss": 0.9977, "step": 331 }, { "epoch": 0.10844800130660243, "grad_norm": 0.2852599322795868, "learning_rate": 9.7851273904197e-05, "loss": 0.9155, "step": 332 }, { "epoch": 0.10877465191294762, "grad_norm": 0.34938427805900574, "learning_rate": 9.783611876804869e-05, "loss": 1.0045, "step": 333 }, { "epoch": 0.1091013025192928, "grad_norm": 0.3286244869232178, "learning_rate": 9.782091155631862e-05, "loss": 0.9817, "step": 334 }, { "epoch": 0.109427953125638, "grad_norm": 0.3370409309864044, "learning_rate": 9.780565228556171e-05, "loss": 0.9247, "step": 335 }, { "epoch": 0.10975460373198317, "grad_norm": 0.32208555936813354, "learning_rate": 9.77903409723896e-05, "loss": 0.9641, "step": 336 }, { "epoch": 0.11008125433832837, "grad_norm": 0.31277501583099365, "learning_rate": 9.777497763347056e-05, "loss": 1.0156, "step": 337 }, { "epoch": 0.11040790494467355, "grad_norm": 0.34272870421409607, "learning_rate": 9.775956228552951e-05, "loss": 1.0397, "step": 338 }, { "epoch": 0.11073455555101874, "grad_norm": 0.3709607422351837, "learning_rate": 9.774409494534795e-05, "loss": 1.0335, "step": 339 }, { "epoch": 0.11106120615736392, "grad_norm": 0.3993772864341736, "learning_rate": 9.772857562976403e-05, "loss": 1.119, "step": 340 }, { "epoch": 0.11138785676370912, "grad_norm": 0.43857407569885254, "learning_rate": 9.771300435567246e-05, "loss": 1.0039, "step": 341 }, { "epoch": 0.11171450737005431, "grad_norm": 0.44964396953582764, "learning_rate": 9.769738114002451e-05, "loss": 1.2138, "step": 342 }, { "epoch": 0.11204115797639949, "grad_norm": 0.5546076893806458, "learning_rate": 9.7681705999828e-05, "loss": 1.187, "step": 343 }, { "epoch": 0.11236780858274469, "grad_norm": 0.565098762512207, "learning_rate": 9.766597895214729e-05, "loss": 1.2735, "step": 344 }, { "epoch": 0.11269445918908987, "grad_norm": 0.6920917630195618, "learning_rate": 9.76502000141032e-05, "loss": 1.3793, "step": 345 }, { "epoch": 0.11302110979543506, "grad_norm": 0.9214739799499512, "learning_rate": 9.76343692028731e-05, "loss": 1.3204, "step": 346 }, { "epoch": 0.11334776040178024, "grad_norm": 1.2229256629943848, "learning_rate": 9.761848653569078e-05, "loss": 1.8793, "step": 347 }, { "epoch": 0.11367441100812543, "grad_norm": 1.0612778663635254, "learning_rate": 9.760255202984652e-05, "loss": 1.3009, "step": 348 }, { "epoch": 0.11400106161447063, "grad_norm": 1.46675705909729, "learning_rate": 9.758656570268703e-05, "loss": 1.5488, "step": 349 }, { "epoch": 0.11432771222081581, "grad_norm": 1.9769386053085327, "learning_rate": 9.75705275716154e-05, "loss": 1.8757, "step": 350 }, { "epoch": 0.114654362827161, "grad_norm": 0.1892956793308258, "learning_rate": 9.755443765409113e-05, "loss": 0.6978, "step": 351 }, { "epoch": 0.11498101343350618, "grad_norm": 0.2098046839237213, "learning_rate": 9.753829596763012e-05, "loss": 0.9027, "step": 352 }, { "epoch": 0.11530766403985138, "grad_norm": 0.24827317893505096, "learning_rate": 9.75221025298046e-05, "loss": 0.854, "step": 353 }, { "epoch": 0.11563431464619656, "grad_norm": 0.24063189327716827, "learning_rate": 9.750585735824315e-05, "loss": 0.8822, "step": 354 }, { "epoch": 0.11596096525254175, "grad_norm": 0.2549203038215637, "learning_rate": 9.748956047063067e-05, "loss": 0.9234, "step": 355 }, { "epoch": 0.11628761585888694, "grad_norm": 0.3004131019115448, "learning_rate": 9.747321188470835e-05, "loss": 0.9409, "step": 356 }, { "epoch": 0.11661426646523212, "grad_norm": 0.3164829611778259, "learning_rate": 9.745681161827367e-05, "loss": 0.9505, "step": 357 }, { "epoch": 0.11694091707157732, "grad_norm": 0.2902483344078064, "learning_rate": 9.744035968918035e-05, "loss": 0.8985, "step": 358 }, { "epoch": 0.1172675676779225, "grad_norm": 0.3035587668418884, "learning_rate": 9.742385611533838e-05, "loss": 0.9599, "step": 359 }, { "epoch": 0.1175942182842677, "grad_norm": 0.34695568680763245, "learning_rate": 9.740730091471395e-05, "loss": 1.0561, "step": 360 }, { "epoch": 0.11792086889061287, "grad_norm": 0.3543941080570221, "learning_rate": 9.739069410532949e-05, "loss": 0.8749, "step": 361 }, { "epoch": 0.11824751949695807, "grad_norm": 0.3373425006866455, "learning_rate": 9.737403570526353e-05, "loss": 0.9717, "step": 362 }, { "epoch": 0.11857417010330325, "grad_norm": 0.37677595019340515, "learning_rate": 9.735732573265086e-05, "loss": 1.0545, "step": 363 }, { "epoch": 0.11890082070964844, "grad_norm": 0.4091393053531647, "learning_rate": 9.734056420568236e-05, "loss": 0.9264, "step": 364 }, { "epoch": 0.11922747131599364, "grad_norm": 0.4432515799999237, "learning_rate": 9.732375114260503e-05, "loss": 1.0886, "step": 365 }, { "epoch": 0.11955412192233882, "grad_norm": 0.4816678762435913, "learning_rate": 9.7306886561722e-05, "loss": 1.0472, "step": 366 }, { "epoch": 0.11988077252868401, "grad_norm": 0.5049456357955933, "learning_rate": 9.728997048139246e-05, "loss": 1.0874, "step": 367 }, { "epoch": 0.12020742313502919, "grad_norm": 0.5210486650466919, "learning_rate": 9.727300292003168e-05, "loss": 1.2298, "step": 368 }, { "epoch": 0.12053407374137438, "grad_norm": 0.5923225283622742, "learning_rate": 9.725598389611095e-05, "loss": 1.1843, "step": 369 }, { "epoch": 0.12086072434771956, "grad_norm": 0.6272051334381104, "learning_rate": 9.723891342815764e-05, "loss": 1.2197, "step": 370 }, { "epoch": 0.12118737495406476, "grad_norm": 0.8260605931282043, "learning_rate": 9.722179153475504e-05, "loss": 1.4288, "step": 371 }, { "epoch": 0.12151402556040995, "grad_norm": 0.9338024854660034, "learning_rate": 9.720461823454248e-05, "loss": 1.544, "step": 372 }, { "epoch": 0.12184067616675513, "grad_norm": 1.0130958557128906, "learning_rate": 9.718739354621527e-05, "loss": 1.4905, "step": 373 }, { "epoch": 0.12216732677310033, "grad_norm": 1.6334174871444702, "learning_rate": 9.717011748852459e-05, "loss": 1.7708, "step": 374 }, { "epoch": 0.1224939773794455, "grad_norm": 1.7457410097122192, "learning_rate": 9.715279008027759e-05, "loss": 2.0156, "step": 375 }, { "epoch": 0.1228206279857907, "grad_norm": 0.1906844973564148, "learning_rate": 9.713541134033733e-05, "loss": 0.7623, "step": 376 }, { "epoch": 0.12314727859213588, "grad_norm": 0.20964844524860382, "learning_rate": 9.711798128762273e-05, "loss": 0.7831, "step": 377 }, { "epoch": 0.12347392919848107, "grad_norm": 0.21881312131881714, "learning_rate": 9.710049994110859e-05, "loss": 0.8922, "step": 378 }, { "epoch": 0.12380057980482627, "grad_norm": 0.22225263714790344, "learning_rate": 9.708296731982551e-05, "loss": 0.8802, "step": 379 }, { "epoch": 0.12412723041117145, "grad_norm": 0.2540784478187561, "learning_rate": 9.706538344285996e-05, "loss": 0.9593, "step": 380 }, { "epoch": 0.12445388101751664, "grad_norm": 0.26809781789779663, "learning_rate": 9.704774832935415e-05, "loss": 0.8706, "step": 381 }, { "epoch": 0.12478053162386182, "grad_norm": 0.25871115922927856, "learning_rate": 9.703006199850614e-05, "loss": 0.8731, "step": 382 }, { "epoch": 0.125107182230207, "grad_norm": 0.295626699924469, "learning_rate": 9.701232446956969e-05, "loss": 0.9022, "step": 383 }, { "epoch": 0.1254338328365522, "grad_norm": 0.27636104822158813, "learning_rate": 9.699453576185429e-05, "loss": 0.8455, "step": 384 }, { "epoch": 0.1257604834428974, "grad_norm": 0.30843478441238403, "learning_rate": 9.697669589472521e-05, "loss": 0.9653, "step": 385 }, { "epoch": 0.12608713404924257, "grad_norm": 0.321384996175766, "learning_rate": 9.695880488760333e-05, "loss": 0.9124, "step": 386 }, { "epoch": 0.12641378465558778, "grad_norm": 0.33451342582702637, "learning_rate": 9.69408627599653e-05, "loss": 1.0844, "step": 387 }, { "epoch": 0.12674043526193296, "grad_norm": 0.3522025942802429, "learning_rate": 9.692286953134328e-05, "loss": 1.0568, "step": 388 }, { "epoch": 0.12706708586827814, "grad_norm": 0.37441399693489075, "learning_rate": 9.690482522132523e-05, "loss": 0.989, "step": 389 }, { "epoch": 0.12739373647462332, "grad_norm": 0.39532291889190674, "learning_rate": 9.688672984955455e-05, "loss": 1.1478, "step": 390 }, { "epoch": 0.12772038708096853, "grad_norm": 0.39523276686668396, "learning_rate": 9.686858343573037e-05, "loss": 0.9581, "step": 391 }, { "epoch": 0.1280470376873137, "grad_norm": 0.4399612247943878, "learning_rate": 9.685038599960731e-05, "loss": 1.0984, "step": 392 }, { "epoch": 0.1283736882936589, "grad_norm": 0.493168443441391, "learning_rate": 9.683213756099555e-05, "loss": 1.1655, "step": 393 }, { "epoch": 0.1287003389000041, "grad_norm": 0.5002815127372742, "learning_rate": 9.681383813976077e-05, "loss": 1.1861, "step": 394 }, { "epoch": 0.12902698950634928, "grad_norm": 0.6230116486549377, "learning_rate": 9.679548775582421e-05, "loss": 1.1512, "step": 395 }, { "epoch": 0.12935364011269446, "grad_norm": 0.660778284072876, "learning_rate": 9.67770864291625e-05, "loss": 1.3088, "step": 396 }, { "epoch": 0.12968029071903964, "grad_norm": 0.8624020218849182, "learning_rate": 9.675863417980784e-05, "loss": 1.4296, "step": 397 }, { "epoch": 0.13000694132538484, "grad_norm": 1.1575559377670288, "learning_rate": 9.674013102784776e-05, "loss": 1.5907, "step": 398 }, { "epoch": 0.13033359193173003, "grad_norm": 1.721889615058899, "learning_rate": 9.672157699342526e-05, "loss": 1.3368, "step": 399 }, { "epoch": 0.1306602425380752, "grad_norm": 1.968307614326477, "learning_rate": 9.670297209673871e-05, "loss": 1.9616, "step": 400 }, { "epoch": 0.1309868931444204, "grad_norm": 0.19404010474681854, "learning_rate": 9.668431635804189e-05, "loss": 0.7653, "step": 401 }, { "epoch": 0.1313135437507656, "grad_norm": 0.20838405191898346, "learning_rate": 9.66656097976439e-05, "loss": 0.8618, "step": 402 }, { "epoch": 0.13164019435711077, "grad_norm": 0.24252377450466156, "learning_rate": 9.664685243590911e-05, "loss": 0.9215, "step": 403 }, { "epoch": 0.13196684496345595, "grad_norm": 0.24695122241973877, "learning_rate": 9.662804429325732e-05, "loss": 0.8518, "step": 404 }, { "epoch": 0.13229349556980116, "grad_norm": 0.24052157998085022, "learning_rate": 9.660918539016348e-05, "loss": 0.8623, "step": 405 }, { "epoch": 0.13262014617614634, "grad_norm": 0.25183114409446716, "learning_rate": 9.659027574715789e-05, "loss": 0.8884, "step": 406 }, { "epoch": 0.13294679678249152, "grad_norm": 0.26628735661506653, "learning_rate": 9.657131538482605e-05, "loss": 0.8609, "step": 407 }, { "epoch": 0.1332734473888367, "grad_norm": 0.2659852206707001, "learning_rate": 9.655230432380869e-05, "loss": 0.949, "step": 408 }, { "epoch": 0.1336000979951819, "grad_norm": 0.2931317687034607, "learning_rate": 9.653324258480167e-05, "loss": 0.8961, "step": 409 }, { "epoch": 0.1339267486015271, "grad_norm": 0.29204240441322327, "learning_rate": 9.651413018855613e-05, "loss": 0.9064, "step": 410 }, { "epoch": 0.13425339920787227, "grad_norm": 0.30939286947250366, "learning_rate": 9.649496715587828e-05, "loss": 0.8333, "step": 411 }, { "epoch": 0.13458004981421748, "grad_norm": 0.34340524673461914, "learning_rate": 9.647575350762946e-05, "loss": 0.9585, "step": 412 }, { "epoch": 0.13490670042056266, "grad_norm": 0.3608761429786682, "learning_rate": 9.645648926472612e-05, "loss": 0.9619, "step": 413 }, { "epoch": 0.13523335102690784, "grad_norm": 0.37178945541381836, "learning_rate": 9.643717444813982e-05, "loss": 1.0787, "step": 414 }, { "epoch": 0.13556000163325302, "grad_norm": 0.38620254397392273, "learning_rate": 9.641780907889712e-05, "loss": 1.1331, "step": 415 }, { "epoch": 0.13588665223959823, "grad_norm": 0.38686099648475647, "learning_rate": 9.639839317807963e-05, "loss": 0.9718, "step": 416 }, { "epoch": 0.1362133028459434, "grad_norm": 0.4547607898712158, "learning_rate": 9.637892676682403e-05, "loss": 1.2155, "step": 417 }, { "epoch": 0.1365399534522886, "grad_norm": 0.45713773369789124, "learning_rate": 9.635940986632188e-05, "loss": 0.9978, "step": 418 }, { "epoch": 0.1368666040586338, "grad_norm": 0.572322428226471, "learning_rate": 9.633984249781977e-05, "loss": 1.3199, "step": 419 }, { "epoch": 0.13719325466497898, "grad_norm": 0.5989283919334412, "learning_rate": 9.632022468261927e-05, "loss": 1.2575, "step": 420 }, { "epoch": 0.13751990527132416, "grad_norm": 0.6520776152610779, "learning_rate": 9.630055644207677e-05, "loss": 1.119, "step": 421 }, { "epoch": 0.13784655587766934, "grad_norm": 0.9217725992202759, "learning_rate": 9.628083779760361e-05, "loss": 1.3642, "step": 422 }, { "epoch": 0.13817320648401454, "grad_norm": 1.1913374662399292, "learning_rate": 9.6261068770666e-05, "loss": 1.3737, "step": 423 }, { "epoch": 0.13849985709035972, "grad_norm": 1.2535861730575562, "learning_rate": 9.6241249382785e-05, "loss": 1.5402, "step": 424 }, { "epoch": 0.1388265076967049, "grad_norm": 2.2817628383636475, "learning_rate": 9.622137965553647e-05, "loss": 1.8676, "step": 425 }, { "epoch": 0.1391531583030501, "grad_norm": 0.17584745585918427, "learning_rate": 9.62014596105511e-05, "loss": 0.7117, "step": 426 }, { "epoch": 0.1394798089093953, "grad_norm": 0.22812600433826447, "learning_rate": 9.618148926951434e-05, "loss": 0.8282, "step": 427 }, { "epoch": 0.13980645951574047, "grad_norm": 0.24293436110019684, "learning_rate": 9.616146865416638e-05, "loss": 0.8739, "step": 428 }, { "epoch": 0.14013311012208565, "grad_norm": 0.24836447834968567, "learning_rate": 9.614139778630219e-05, "loss": 0.7628, "step": 429 }, { "epoch": 0.14045976072843086, "grad_norm": 0.2706006169319153, "learning_rate": 9.612127668777139e-05, "loss": 0.8577, "step": 430 }, { "epoch": 0.14078641133477604, "grad_norm": 0.2763189673423767, "learning_rate": 9.61011053804783e-05, "loss": 0.9749, "step": 431 }, { "epoch": 0.14111306194112122, "grad_norm": 0.2643950581550598, "learning_rate": 9.608088388638193e-05, "loss": 0.8468, "step": 432 }, { "epoch": 0.14143971254746643, "grad_norm": 0.2801801562309265, "learning_rate": 9.606061222749587e-05, "loss": 0.8929, "step": 433 }, { "epoch": 0.1417663631538116, "grad_norm": 0.3056126832962036, "learning_rate": 9.604029042588838e-05, "loss": 0.8787, "step": 434 }, { "epoch": 0.1420930137601568, "grad_norm": 0.3168674409389496, "learning_rate": 9.601991850368224e-05, "loss": 0.933, "step": 435 }, { "epoch": 0.14241966436650197, "grad_norm": 0.31766176223754883, "learning_rate": 9.599949648305486e-05, "loss": 1.0011, "step": 436 }, { "epoch": 0.14274631497284718, "grad_norm": 0.35657021403312683, "learning_rate": 9.597902438623814e-05, "loss": 1.0058, "step": 437 }, { "epoch": 0.14307296557919236, "grad_norm": 0.4108402132987976, "learning_rate": 9.59585022355185e-05, "loss": 1.2417, "step": 438 }, { "epoch": 0.14339961618553754, "grad_norm": 0.4023765027523041, "learning_rate": 9.593793005323689e-05, "loss": 1.031, "step": 439 }, { "epoch": 0.14372626679188275, "grad_norm": 0.3895339369773865, "learning_rate": 9.591730786178866e-05, "loss": 1.0337, "step": 440 }, { "epoch": 0.14405291739822793, "grad_norm": 0.42880305647850037, "learning_rate": 9.589663568362368e-05, "loss": 1.0338, "step": 441 }, { "epoch": 0.1443795680045731, "grad_norm": 0.4699881076812744, "learning_rate": 9.587591354124616e-05, "loss": 1.0881, "step": 442 }, { "epoch": 0.14470621861091829, "grad_norm": 0.5102139711380005, "learning_rate": 9.585514145721475e-05, "loss": 1.0901, "step": 443 }, { "epoch": 0.1450328692172635, "grad_norm": 0.5716970562934875, "learning_rate": 9.583431945414245e-05, "loss": 1.1766, "step": 444 }, { "epoch": 0.14535951982360867, "grad_norm": 0.6934454441070557, "learning_rate": 9.581344755469663e-05, "loss": 1.2836, "step": 445 }, { "epoch": 0.14568617042995385, "grad_norm": 0.8125836849212646, "learning_rate": 9.579252578159892e-05, "loss": 1.2623, "step": 446 }, { "epoch": 0.14601282103629906, "grad_norm": 1.278208613395691, "learning_rate": 9.57715541576253e-05, "loss": 1.5706, "step": 447 }, { "epoch": 0.14633947164264424, "grad_norm": 1.3957326412200928, "learning_rate": 9.575053270560598e-05, "loss": 1.675, "step": 448 }, { "epoch": 0.14666612224898942, "grad_norm": 1.5048333406448364, "learning_rate": 9.572946144842547e-05, "loss": 1.4975, "step": 449 }, { "epoch": 0.1469927728553346, "grad_norm": 2.2100048065185547, "learning_rate": 9.570834040902243e-05, "loss": 2.1886, "step": 450 }, { "epoch": 0.1473194234616798, "grad_norm": 0.1943780481815338, "learning_rate": 9.568716961038977e-05, "loss": 0.7153, "step": 451 }, { "epoch": 0.147646074068025, "grad_norm": 0.20413154363632202, "learning_rate": 9.566594907557452e-05, "loss": 0.8099, "step": 452 }, { "epoch": 0.14797272467437017, "grad_norm": 0.24154390394687653, "learning_rate": 9.564467882767787e-05, "loss": 0.9133, "step": 453 }, { "epoch": 0.14829937528071538, "grad_norm": 0.2548098564147949, "learning_rate": 9.562335888985516e-05, "loss": 0.8946, "step": 454 }, { "epoch": 0.14862602588706056, "grad_norm": 0.2680473029613495, "learning_rate": 9.560198928531581e-05, "loss": 0.8944, "step": 455 }, { "epoch": 0.14895267649340574, "grad_norm": 0.2628840506076813, "learning_rate": 9.55805700373233e-05, "loss": 0.8787, "step": 456 }, { "epoch": 0.14927932709975092, "grad_norm": 0.2785063087940216, "learning_rate": 9.55591011691951e-05, "loss": 0.8681, "step": 457 }, { "epoch": 0.14960597770609613, "grad_norm": 0.28061676025390625, "learning_rate": 9.553758270430284e-05, "loss": 0.8655, "step": 458 }, { "epoch": 0.1499326283124413, "grad_norm": 0.29894521832466125, "learning_rate": 9.551601466607197e-05, "loss": 0.9433, "step": 459 }, { "epoch": 0.1502592789187865, "grad_norm": 0.292904794216156, "learning_rate": 9.549439707798203e-05, "loss": 0.8323, "step": 460 }, { "epoch": 0.15058592952513167, "grad_norm": 0.3242059648036957, "learning_rate": 9.547272996356646e-05, "loss": 0.9685, "step": 461 }, { "epoch": 0.15091258013147688, "grad_norm": 0.334773987531662, "learning_rate": 9.545101334641262e-05, "loss": 0.946, "step": 462 }, { "epoch": 0.15123923073782206, "grad_norm": 0.3734418749809265, "learning_rate": 9.542924725016173e-05, "loss": 1.0552, "step": 463 }, { "epoch": 0.15156588134416724, "grad_norm": 0.41196638345718384, "learning_rate": 9.540743169850893e-05, "loss": 1.1502, "step": 464 }, { "epoch": 0.15189253195051244, "grad_norm": 0.4083957076072693, "learning_rate": 9.538556671520316e-05, "loss": 1.0984, "step": 465 }, { "epoch": 0.15221918255685762, "grad_norm": 0.4450596570968628, "learning_rate": 9.536365232404718e-05, "loss": 1.1297, "step": 466 }, { "epoch": 0.1525458331632028, "grad_norm": 0.4478522837162018, "learning_rate": 9.534168854889754e-05, "loss": 1.125, "step": 467 }, { "epoch": 0.15287248376954798, "grad_norm": 0.5305338501930237, "learning_rate": 9.531967541366452e-05, "loss": 1.1453, "step": 468 }, { "epoch": 0.1531991343758932, "grad_norm": 0.5751011967658997, "learning_rate": 9.529761294231221e-05, "loss": 1.2348, "step": 469 }, { "epoch": 0.15352578498223837, "grad_norm": 0.6348747611045837, "learning_rate": 9.527550115885833e-05, "loss": 1.2891, "step": 470 }, { "epoch": 0.15385243558858355, "grad_norm": 0.7538950443267822, "learning_rate": 9.525334008737435e-05, "loss": 1.4452, "step": 471 }, { "epoch": 0.15417908619492876, "grad_norm": 0.9422009587287903, "learning_rate": 9.523112975198532e-05, "loss": 1.3917, "step": 472 }, { "epoch": 0.15450573680127394, "grad_norm": 1.307346224784851, "learning_rate": 9.520887017686997e-05, "loss": 1.5785, "step": 473 }, { "epoch": 0.15483238740761912, "grad_norm": 1.255781650543213, "learning_rate": 9.518656138626063e-05, "loss": 1.7596, "step": 474 }, { "epoch": 0.1551590380139643, "grad_norm": 1.9935503005981445, "learning_rate": 9.51642034044432e-05, "loss": 2.3361, "step": 475 }, { "epoch": 0.1554856886203095, "grad_norm": 0.23217955231666565, "learning_rate": 9.514179625575715e-05, "loss": 0.7319, "step": 476 }, { "epoch": 0.1558123392266547, "grad_norm": 0.23310193419456482, "learning_rate": 9.511933996459544e-05, "loss": 0.8677, "step": 477 }, { "epoch": 0.15613898983299987, "grad_norm": 0.22472940385341644, "learning_rate": 9.509683455540452e-05, "loss": 0.8419, "step": 478 }, { "epoch": 0.15646564043934508, "grad_norm": 0.23470456898212433, "learning_rate": 9.507428005268438e-05, "loss": 0.921, "step": 479 }, { "epoch": 0.15679229104569026, "grad_norm": 0.2441803216934204, "learning_rate": 9.505167648098837e-05, "loss": 0.9128, "step": 480 }, { "epoch": 0.15711894165203544, "grad_norm": 0.2495880424976349, "learning_rate": 9.502902386492332e-05, "loss": 0.8631, "step": 481 }, { "epoch": 0.15744559225838062, "grad_norm": 0.2750225365161896, "learning_rate": 9.500632222914943e-05, "loss": 0.8532, "step": 482 }, { "epoch": 0.15777224286472583, "grad_norm": 0.29554665088653564, "learning_rate": 9.498357159838025e-05, "loss": 0.9093, "step": 483 }, { "epoch": 0.158098893471071, "grad_norm": 0.26975587010383606, "learning_rate": 9.496077199738267e-05, "loss": 0.8197, "step": 484 }, { "epoch": 0.15842554407741619, "grad_norm": 0.31354406476020813, "learning_rate": 9.493792345097693e-05, "loss": 0.8923, "step": 485 }, { "epoch": 0.1587521946837614, "grad_norm": 0.29501739144325256, "learning_rate": 9.49150259840365e-05, "loss": 0.8918, "step": 486 }, { "epoch": 0.15907884529010657, "grad_norm": 0.3456759452819824, "learning_rate": 9.489207962148814e-05, "loss": 0.9493, "step": 487 }, { "epoch": 0.15940549589645175, "grad_norm": 0.3523378372192383, "learning_rate": 9.486908438831181e-05, "loss": 0.9785, "step": 488 }, { "epoch": 0.15973214650279693, "grad_norm": 0.38017013669013977, "learning_rate": 9.484604030954072e-05, "loss": 1.0719, "step": 489 }, { "epoch": 0.16005879710914214, "grad_norm": 0.39783334732055664, "learning_rate": 9.482294741026119e-05, "loss": 0.9972, "step": 490 }, { "epoch": 0.16038544771548732, "grad_norm": 0.4237908124923706, "learning_rate": 9.479980571561274e-05, "loss": 1.0702, "step": 491 }, { "epoch": 0.1607120983218325, "grad_norm": 0.4826822876930237, "learning_rate": 9.4776615250788e-05, "loss": 1.0767, "step": 492 }, { "epoch": 0.1610387489281777, "grad_norm": 0.47704166173934937, "learning_rate": 9.475337604103266e-05, "loss": 0.9948, "step": 493 }, { "epoch": 0.1613653995345229, "grad_norm": 0.520698070526123, "learning_rate": 9.47300881116455e-05, "loss": 1.0778, "step": 494 }, { "epoch": 0.16169205014086807, "grad_norm": 0.5984883308410645, "learning_rate": 9.470675148797836e-05, "loss": 1.2363, "step": 495 }, { "epoch": 0.16201870074721325, "grad_norm": 0.788203775882721, "learning_rate": 9.468336619543605e-05, "loss": 1.3552, "step": 496 }, { "epoch": 0.16234535135355846, "grad_norm": 0.9296058416366577, "learning_rate": 9.465993225947638e-05, "loss": 1.3718, "step": 497 }, { "epoch": 0.16267200195990364, "grad_norm": 1.005128264427185, "learning_rate": 9.463644970561009e-05, "loss": 1.2159, "step": 498 }, { "epoch": 0.16299865256624882, "grad_norm": 1.1713541746139526, "learning_rate": 9.461291855940091e-05, "loss": 1.4884, "step": 499 }, { "epoch": 0.16332530317259403, "grad_norm": 1.967857003211975, "learning_rate": 9.458933884646541e-05, "loss": 1.4611, "step": 500 }, { "epoch": 0.1636519537789392, "grad_norm": 0.18524925410747528, "learning_rate": 9.456571059247303e-05, "loss": 0.7829, "step": 501 }, { "epoch": 0.1639786043852844, "grad_norm": 0.2153564691543579, "learning_rate": 9.45420338231461e-05, "loss": 0.8495, "step": 502 }, { "epoch": 0.16430525499162957, "grad_norm": 0.2383943796157837, "learning_rate": 9.451830856425973e-05, "loss": 0.7869, "step": 503 }, { "epoch": 0.16463190559797478, "grad_norm": 0.24494485557079315, "learning_rate": 9.449453484164181e-05, "loss": 0.8712, "step": 504 }, { "epoch": 0.16495855620431996, "grad_norm": 0.2590121626853943, "learning_rate": 9.4470712681173e-05, "loss": 0.8634, "step": 505 }, { "epoch": 0.16528520681066514, "grad_norm": 0.2706097662448883, "learning_rate": 9.444684210878671e-05, "loss": 0.8342, "step": 506 }, { "epoch": 0.16561185741701034, "grad_norm": 0.2868553698062897, "learning_rate": 9.442292315046903e-05, "loss": 0.9175, "step": 507 }, { "epoch": 0.16593850802335552, "grad_norm": 0.29883742332458496, "learning_rate": 9.439895583225873e-05, "loss": 0.9143, "step": 508 }, { "epoch": 0.1662651586297007, "grad_norm": 0.31189242005348206, "learning_rate": 9.437494018024721e-05, "loss": 0.9101, "step": 509 }, { "epoch": 0.16659180923604588, "grad_norm": 0.2974110245704651, "learning_rate": 9.435087622057855e-05, "loss": 0.8765, "step": 510 }, { "epoch": 0.1669184598423911, "grad_norm": 0.3233146071434021, "learning_rate": 9.43267639794493e-05, "loss": 0.9817, "step": 511 }, { "epoch": 0.16724511044873627, "grad_norm": 0.33368226885795593, "learning_rate": 9.430260348310869e-05, "loss": 0.968, "step": 512 }, { "epoch": 0.16757176105508145, "grad_norm": 0.37103214859962463, "learning_rate": 9.427839475785844e-05, "loss": 0.9822, "step": 513 }, { "epoch": 0.16789841166142663, "grad_norm": 0.36908629536628723, "learning_rate": 9.425413783005272e-05, "loss": 1.0336, "step": 514 }, { "epoch": 0.16822506226777184, "grad_norm": 0.41780969500541687, "learning_rate": 9.422983272609828e-05, "loss": 0.9956, "step": 515 }, { "epoch": 0.16855171287411702, "grad_norm": 0.4576914310455322, "learning_rate": 9.420547947245422e-05, "loss": 0.9278, "step": 516 }, { "epoch": 0.1688783634804622, "grad_norm": 0.49422821402549744, "learning_rate": 9.418107809563208e-05, "loss": 1.1263, "step": 517 }, { "epoch": 0.1692050140868074, "grad_norm": 0.5176439881324768, "learning_rate": 9.415662862219585e-05, "loss": 1.1614, "step": 518 }, { "epoch": 0.1695316646931526, "grad_norm": 0.48677173256874084, "learning_rate": 9.41321310787618e-05, "loss": 0.9933, "step": 519 }, { "epoch": 0.16985831529949777, "grad_norm": 0.6720970273017883, "learning_rate": 9.410758549199856e-05, "loss": 1.3328, "step": 520 }, { "epoch": 0.17018496590584295, "grad_norm": 0.7612361311912537, "learning_rate": 9.408299188862709e-05, "loss": 1.2837, "step": 521 }, { "epoch": 0.17051161651218816, "grad_norm": 0.8276899456977844, "learning_rate": 9.405835029542055e-05, "loss": 1.5145, "step": 522 }, { "epoch": 0.17083826711853334, "grad_norm": 1.2721513509750366, "learning_rate": 9.403366073920442e-05, "loss": 1.6956, "step": 523 }, { "epoch": 0.17116491772487852, "grad_norm": 1.5282410383224487, "learning_rate": 9.400892324685636e-05, "loss": 2.0885, "step": 524 }, { "epoch": 0.17149156833122373, "grad_norm": 1.656018853187561, "learning_rate": 9.398413784530621e-05, "loss": 1.6356, "step": 525 }, { "epoch": 0.1718182189375689, "grad_norm": 0.18846455216407776, "learning_rate": 9.395930456153597e-05, "loss": 0.867, "step": 526 }, { "epoch": 0.1721448695439141, "grad_norm": 0.21775482594966888, "learning_rate": 9.393442342257977e-05, "loss": 0.7604, "step": 527 }, { "epoch": 0.17247152015025927, "grad_norm": 0.22779767215251923, "learning_rate": 9.390949445552383e-05, "loss": 0.7736, "step": 528 }, { "epoch": 0.17279817075660447, "grad_norm": 0.23663291335105896, "learning_rate": 9.388451768750644e-05, "loss": 0.8514, "step": 529 }, { "epoch": 0.17312482136294965, "grad_norm": 0.25589585304260254, "learning_rate": 9.385949314571792e-05, "loss": 0.8874, "step": 530 }, { "epoch": 0.17345147196929483, "grad_norm": 0.24930959939956665, "learning_rate": 9.383442085740062e-05, "loss": 0.8109, "step": 531 }, { "epoch": 0.17377812257564004, "grad_norm": 0.2678419053554535, "learning_rate": 9.380930084984884e-05, "loss": 0.8583, "step": 532 }, { "epoch": 0.17410477318198522, "grad_norm": 0.2837614417076111, "learning_rate": 9.378413315040887e-05, "loss": 0.9681, "step": 533 }, { "epoch": 0.1744314237883304, "grad_norm": 0.2679899036884308, "learning_rate": 9.375891778647885e-05, "loss": 0.8959, "step": 534 }, { "epoch": 0.17475807439467558, "grad_norm": 0.27939561009407043, "learning_rate": 9.373365478550886e-05, "loss": 0.9481, "step": 535 }, { "epoch": 0.1750847250010208, "grad_norm": 0.28748729825019836, "learning_rate": 9.370834417500085e-05, "loss": 0.8571, "step": 536 }, { "epoch": 0.17541137560736597, "grad_norm": 0.3106549084186554, "learning_rate": 9.368298598250856e-05, "loss": 0.878, "step": 537 }, { "epoch": 0.17573802621371115, "grad_norm": 0.34394335746765137, "learning_rate": 9.365758023563753e-05, "loss": 0.9116, "step": 538 }, { "epoch": 0.17606467682005636, "grad_norm": 0.3183876872062683, "learning_rate": 9.363212696204511e-05, "loss": 0.8952, "step": 539 }, { "epoch": 0.17639132742640154, "grad_norm": 0.39337795972824097, "learning_rate": 9.360662618944033e-05, "loss": 1.0152, "step": 540 }, { "epoch": 0.17671797803274672, "grad_norm": 0.4016577899456024, "learning_rate": 9.358107794558401e-05, "loss": 0.9301, "step": 541 }, { "epoch": 0.1770446286390919, "grad_norm": 0.42000070214271545, "learning_rate": 9.355548225828858e-05, "loss": 1.111, "step": 542 }, { "epoch": 0.1773712792454371, "grad_norm": 0.5311472415924072, "learning_rate": 9.352983915541813e-05, "loss": 1.1984, "step": 543 }, { "epoch": 0.1776979298517823, "grad_norm": 0.5248878598213196, "learning_rate": 9.350414866488837e-05, "loss": 1.1448, "step": 544 }, { "epoch": 0.17802458045812747, "grad_norm": 0.7028717994689941, "learning_rate": 9.347841081466662e-05, "loss": 1.1947, "step": 545 }, { "epoch": 0.17835123106447268, "grad_norm": 0.7405298948287964, "learning_rate": 9.345262563277173e-05, "loss": 1.3281, "step": 546 }, { "epoch": 0.17867788167081786, "grad_norm": 1.0811318159103394, "learning_rate": 9.342679314727408e-05, "loss": 1.2947, "step": 547 }, { "epoch": 0.17900453227716304, "grad_norm": 1.2324838638305664, "learning_rate": 9.340091338629556e-05, "loss": 1.4522, "step": 548 }, { "epoch": 0.17933118288350822, "grad_norm": 1.280505657196045, "learning_rate": 9.337498637800952e-05, "loss": 1.4853, "step": 549 }, { "epoch": 0.17965783348985342, "grad_norm": 1.5319606065750122, "learning_rate": 9.334901215064075e-05, "loss": 1.8064, "step": 550 }, { "epoch": 0.1799844840961986, "grad_norm": 0.19149908423423767, "learning_rate": 9.332299073246543e-05, "loss": 0.6917, "step": 551 }, { "epoch": 0.18031113470254378, "grad_norm": 0.24123403429985046, "learning_rate": 9.329692215181111e-05, "loss": 0.8778, "step": 552 }, { "epoch": 0.180637785308889, "grad_norm": 0.2261694222688675, "learning_rate": 9.32708064370567e-05, "loss": 0.7859, "step": 553 }, { "epoch": 0.18096443591523417, "grad_norm": 0.2447603940963745, "learning_rate": 9.32446436166324e-05, "loss": 0.8404, "step": 554 }, { "epoch": 0.18129108652157935, "grad_norm": 0.25092342495918274, "learning_rate": 9.321843371901975e-05, "loss": 0.8508, "step": 555 }, { "epoch": 0.18161773712792453, "grad_norm": 0.2693803906440735, "learning_rate": 9.319217677275142e-05, "loss": 0.8603, "step": 556 }, { "epoch": 0.18194438773426974, "grad_norm": 0.26989585161209106, "learning_rate": 9.316587280641142e-05, "loss": 0.819, "step": 557 }, { "epoch": 0.18227103834061492, "grad_norm": 0.3000803589820862, "learning_rate": 9.313952184863489e-05, "loss": 0.8996, "step": 558 }, { "epoch": 0.1825976889469601, "grad_norm": 0.27644068002700806, "learning_rate": 9.311312392810813e-05, "loss": 0.9319, "step": 559 }, { "epoch": 0.1829243395533053, "grad_norm": 0.31071117520332336, "learning_rate": 9.308667907356856e-05, "loss": 0.9464, "step": 560 }, { "epoch": 0.1832509901596505, "grad_norm": 0.3157282769680023, "learning_rate": 9.306018731380472e-05, "loss": 0.9281, "step": 561 }, { "epoch": 0.18357764076599567, "grad_norm": 0.33144116401672363, "learning_rate": 9.303364867765619e-05, "loss": 0.8753, "step": 562 }, { "epoch": 0.18390429137234085, "grad_norm": 0.3958095610141754, "learning_rate": 9.300706319401358e-05, "loss": 1.0352, "step": 563 }, { "epoch": 0.18423094197868606, "grad_norm": 0.4302027225494385, "learning_rate": 9.298043089181852e-05, "loss": 0.9228, "step": 564 }, { "epoch": 0.18455759258503124, "grad_norm": 0.39587950706481934, "learning_rate": 9.295375180006356e-05, "loss": 0.9858, "step": 565 }, { "epoch": 0.18488424319137642, "grad_norm": 0.486945778131485, "learning_rate": 9.292702594779224e-05, "loss": 1.0143, "step": 566 }, { "epoch": 0.1852108937977216, "grad_norm": 0.44961902499198914, "learning_rate": 9.2900253364099e-05, "loss": 1.0699, "step": 567 }, { "epoch": 0.1855375444040668, "grad_norm": 0.4762088656425476, "learning_rate": 9.287343407812909e-05, "loss": 1.0856, "step": 568 }, { "epoch": 0.185864195010412, "grad_norm": 0.5508722066879272, "learning_rate": 9.28465681190787e-05, "loss": 1.2314, "step": 569 }, { "epoch": 0.18619084561675717, "grad_norm": 0.6167343258857727, "learning_rate": 9.281965551619476e-05, "loss": 1.1919, "step": 570 }, { "epoch": 0.18651749622310237, "grad_norm": 0.7416098713874817, "learning_rate": 9.279269629877497e-05, "loss": 1.2134, "step": 571 }, { "epoch": 0.18684414682944755, "grad_norm": 0.9034223556518555, "learning_rate": 9.276569049616784e-05, "loss": 1.2163, "step": 572 }, { "epoch": 0.18717079743579274, "grad_norm": 1.2114628553390503, "learning_rate": 9.273863813777253e-05, "loss": 1.2241, "step": 573 }, { "epoch": 0.18749744804213792, "grad_norm": 1.2960182428359985, "learning_rate": 9.27115392530389e-05, "loss": 1.1737, "step": 574 }, { "epoch": 0.18782409864848312, "grad_norm": 1.766401767730713, "learning_rate": 9.268439387146747e-05, "loss": 1.6614, "step": 575 }, { "epoch": 0.1881507492548283, "grad_norm": 0.1993979662656784, "learning_rate": 9.26572020226094e-05, "loss": 0.8511, "step": 576 }, { "epoch": 0.18847739986117348, "grad_norm": 0.23298919200897217, "learning_rate": 9.262996373606638e-05, "loss": 0.8701, "step": 577 }, { "epoch": 0.1888040504675187, "grad_norm": 0.2348179668188095, "learning_rate": 9.26026790414907e-05, "loss": 0.8383, "step": 578 }, { "epoch": 0.18913070107386387, "grad_norm": 0.258207768201828, "learning_rate": 9.257534796858514e-05, "loss": 0.8137, "step": 579 }, { "epoch": 0.18945735168020905, "grad_norm": 0.2487855851650238, "learning_rate": 9.2547970547103e-05, "loss": 0.8072, "step": 580 }, { "epoch": 0.18978400228655423, "grad_norm": 0.2482251226902008, "learning_rate": 9.252054680684799e-05, "loss": 0.8812, "step": 581 }, { "epoch": 0.19011065289289944, "grad_norm": 0.264870285987854, "learning_rate": 9.249307677767429e-05, "loss": 0.9027, "step": 582 }, { "epoch": 0.19043730349924462, "grad_norm": 0.27695828676223755, "learning_rate": 9.246556048948645e-05, "loss": 0.9643, "step": 583 }, { "epoch": 0.1907639541055898, "grad_norm": 0.2882271409034729, "learning_rate": 9.243799797223938e-05, "loss": 0.9431, "step": 584 }, { "epoch": 0.191090604711935, "grad_norm": 0.2983972132205963, "learning_rate": 9.241038925593832e-05, "loss": 0.8474, "step": 585 }, { "epoch": 0.1914172553182802, "grad_norm": 0.2943267524242401, "learning_rate": 9.23827343706388e-05, "loss": 0.8948, "step": 586 }, { "epoch": 0.19174390592462537, "grad_norm": 0.3304755985736847, "learning_rate": 9.235503334644662e-05, "loss": 0.98, "step": 587 }, { "epoch": 0.19207055653097055, "grad_norm": 0.3549445867538452, "learning_rate": 9.232728621351778e-05, "loss": 0.9946, "step": 588 }, { "epoch": 0.19239720713731576, "grad_norm": 0.3509342670440674, "learning_rate": 9.229949300205852e-05, "loss": 0.9771, "step": 589 }, { "epoch": 0.19272385774366094, "grad_norm": 0.37002110481262207, "learning_rate": 9.22716537423252e-05, "loss": 1.0676, "step": 590 }, { "epoch": 0.19305050835000612, "grad_norm": 0.3887923061847687, "learning_rate": 9.224376846462434e-05, "loss": 1.0657, "step": 591 }, { "epoch": 0.19337715895635132, "grad_norm": 0.41990533471107483, "learning_rate": 9.221583719931253e-05, "loss": 1.0131, "step": 592 }, { "epoch": 0.1937038095626965, "grad_norm": 0.46272900700569153, "learning_rate": 9.218785997679643e-05, "loss": 1.1946, "step": 593 }, { "epoch": 0.19403046016904169, "grad_norm": 0.5167956352233887, "learning_rate": 9.215983682753275e-05, "loss": 0.9902, "step": 594 }, { "epoch": 0.19435711077538687, "grad_norm": 0.5987634658813477, "learning_rate": 9.213176778202818e-05, "loss": 1.1352, "step": 595 }, { "epoch": 0.19468376138173207, "grad_norm": 0.6184087991714478, "learning_rate": 9.210365287083939e-05, "loss": 1.3175, "step": 596 }, { "epoch": 0.19501041198807725, "grad_norm": 0.7726343870162964, "learning_rate": 9.207549212457293e-05, "loss": 1.1563, "step": 597 }, { "epoch": 0.19533706259442243, "grad_norm": 0.941210925579071, "learning_rate": 9.204728557388535e-05, "loss": 1.3595, "step": 598 }, { "epoch": 0.19566371320076764, "grad_norm": 1.330126166343689, "learning_rate": 9.201903324948292e-05, "loss": 1.7922, "step": 599 }, { "epoch": 0.19599036380711282, "grad_norm": 2.1613612174987793, "learning_rate": 9.199073518212186e-05, "loss": 1.4775, "step": 600 }, { "epoch": 0.196317014413458, "grad_norm": 0.19565562903881073, "learning_rate": 9.196239140260816e-05, "loss": 0.7829, "step": 601 }, { "epoch": 0.19664366501980318, "grad_norm": 0.22353625297546387, "learning_rate": 9.193400194179753e-05, "loss": 0.8601, "step": 602 }, { "epoch": 0.1969703156261484, "grad_norm": 0.23643703758716583, "learning_rate": 9.190556683059546e-05, "loss": 0.8285, "step": 603 }, { "epoch": 0.19729696623249357, "grad_norm": 0.24481813609600067, "learning_rate": 9.187708609995711e-05, "loss": 0.942, "step": 604 }, { "epoch": 0.19762361683883875, "grad_norm": 0.2547847032546997, "learning_rate": 9.184855978088729e-05, "loss": 0.7936, "step": 605 }, { "epoch": 0.19795026744518396, "grad_norm": 0.2569129765033722, "learning_rate": 9.181998790444047e-05, "loss": 0.8767, "step": 606 }, { "epoch": 0.19827691805152914, "grad_norm": 0.26046594977378845, "learning_rate": 9.179137050172071e-05, "loss": 0.9215, "step": 607 }, { "epoch": 0.19860356865787432, "grad_norm": 0.2738451063632965, "learning_rate": 9.176270760388161e-05, "loss": 0.9197, "step": 608 }, { "epoch": 0.1989302192642195, "grad_norm": 0.2807084619998932, "learning_rate": 9.173399924212631e-05, "loss": 0.8967, "step": 609 }, { "epoch": 0.1992568698705647, "grad_norm": 0.29269683361053467, "learning_rate": 9.170524544770745e-05, "loss": 0.8018, "step": 610 }, { "epoch": 0.1995835204769099, "grad_norm": 0.30087584257125854, "learning_rate": 9.167644625192713e-05, "loss": 0.919, "step": 611 }, { "epoch": 0.19991017108325507, "grad_norm": 0.33221113681793213, "learning_rate": 9.164760168613683e-05, "loss": 0.997, "step": 612 }, { "epoch": 0.20023682168960028, "grad_norm": 0.3531966805458069, "learning_rate": 9.161871178173749e-05, "loss": 0.9249, "step": 613 }, { "epoch": 0.20056347229594546, "grad_norm": 0.40778499841690063, "learning_rate": 9.158977657017937e-05, "loss": 0.9746, "step": 614 }, { "epoch": 0.20089012290229064, "grad_norm": 0.40673455595970154, "learning_rate": 9.156079608296204e-05, "loss": 0.9772, "step": 615 }, { "epoch": 0.20121677350863582, "grad_norm": 0.4674896001815796, "learning_rate": 9.15317703516344e-05, "loss": 0.9516, "step": 616 }, { "epoch": 0.20154342411498102, "grad_norm": 0.4718310236930847, "learning_rate": 9.150269940779457e-05, "loss": 1.0437, "step": 617 }, { "epoch": 0.2018700747213262, "grad_norm": 0.5518301725387573, "learning_rate": 9.147358328308987e-05, "loss": 1.2963, "step": 618 }, { "epoch": 0.20219672532767138, "grad_norm": 0.5814355611801147, "learning_rate": 9.144442200921688e-05, "loss": 1.0964, "step": 619 }, { "epoch": 0.20252337593401656, "grad_norm": 0.6938658952713013, "learning_rate": 9.141521561792127e-05, "loss": 1.3407, "step": 620 }, { "epoch": 0.20285002654036177, "grad_norm": 0.7887697815895081, "learning_rate": 9.138596414099781e-05, "loss": 1.4162, "step": 621 }, { "epoch": 0.20317667714670695, "grad_norm": 0.8191277384757996, "learning_rate": 9.135666761029043e-05, "loss": 1.3763, "step": 622 }, { "epoch": 0.20350332775305213, "grad_norm": 1.0939558744430542, "learning_rate": 9.1327326057692e-05, "loss": 1.2384, "step": 623 }, { "epoch": 0.20382997835939734, "grad_norm": 1.284587025642395, "learning_rate": 9.129793951514449e-05, "loss": 1.7821, "step": 624 }, { "epoch": 0.20415662896574252, "grad_norm": 1.8239585161209106, "learning_rate": 9.126850801463884e-05, "loss": 2.5487, "step": 625 }, { "epoch": 0.2044832795720877, "grad_norm": 0.1805727183818817, "learning_rate": 9.123903158821487e-05, "loss": 0.7256, "step": 626 }, { "epoch": 0.20480993017843288, "grad_norm": 0.2200499176979065, "learning_rate": 9.120951026796138e-05, "loss": 0.796, "step": 627 }, { "epoch": 0.2051365807847781, "grad_norm": 0.23434124886989594, "learning_rate": 9.117994408601598e-05, "loss": 0.9145, "step": 628 }, { "epoch": 0.20546323139112327, "grad_norm": 0.23503413796424866, "learning_rate": 9.115033307456515e-05, "loss": 0.8078, "step": 629 }, { "epoch": 0.20578988199746845, "grad_norm": 0.2527485489845276, "learning_rate": 9.112067726584419e-05, "loss": 0.8819, "step": 630 }, { "epoch": 0.20611653260381366, "grad_norm": 0.27487868070602417, "learning_rate": 9.109097669213713e-05, "loss": 0.8779, "step": 631 }, { "epoch": 0.20644318321015884, "grad_norm": 0.2560602128505707, "learning_rate": 9.106123138577675e-05, "loss": 0.8185, "step": 632 }, { "epoch": 0.20676983381650402, "grad_norm": 0.2872486114501953, "learning_rate": 9.103144137914454e-05, "loss": 0.8721, "step": 633 }, { "epoch": 0.2070964844228492, "grad_norm": 0.298370361328125, "learning_rate": 9.100160670467064e-05, "loss": 0.8647, "step": 634 }, { "epoch": 0.2074231350291944, "grad_norm": 0.3106897473335266, "learning_rate": 9.097172739483379e-05, "loss": 0.968, "step": 635 }, { "epoch": 0.20774978563553959, "grad_norm": 0.32926785945892334, "learning_rate": 9.094180348216135e-05, "loss": 0.9056, "step": 636 }, { "epoch": 0.20807643624188477, "grad_norm": 0.35057681798934937, "learning_rate": 9.091183499922924e-05, "loss": 0.9649, "step": 637 }, { "epoch": 0.20840308684822997, "grad_norm": 0.35323870182037354, "learning_rate": 9.088182197866189e-05, "loss": 0.867, "step": 638 }, { "epoch": 0.20872973745457515, "grad_norm": 0.40710511803627014, "learning_rate": 9.085176445313223e-05, "loss": 0.9675, "step": 639 }, { "epoch": 0.20905638806092033, "grad_norm": 0.38960814476013184, "learning_rate": 9.08216624553616e-05, "loss": 0.9669, "step": 640 }, { "epoch": 0.20938303866726551, "grad_norm": 0.4638998806476593, "learning_rate": 9.079151601811979e-05, "loss": 1.136, "step": 641 }, { "epoch": 0.20970968927361072, "grad_norm": 0.4785967171192169, "learning_rate": 9.076132517422497e-05, "loss": 1.1302, "step": 642 }, { "epoch": 0.2100363398799559, "grad_norm": 0.5356094837188721, "learning_rate": 9.073108995654362e-05, "loss": 1.0974, "step": 643 }, { "epoch": 0.21036299048630108, "grad_norm": 0.5519333481788635, "learning_rate": 9.070081039799056e-05, "loss": 1.0546, "step": 644 }, { "epoch": 0.2106896410926463, "grad_norm": 0.6569791436195374, "learning_rate": 9.067048653152885e-05, "loss": 1.3072, "step": 645 }, { "epoch": 0.21101629169899147, "grad_norm": 0.8130210638046265, "learning_rate": 9.064011839016982e-05, "loss": 1.3312, "step": 646 }, { "epoch": 0.21134294230533665, "grad_norm": 1.0476133823394775, "learning_rate": 9.060970600697296e-05, "loss": 1.389, "step": 647 }, { "epoch": 0.21166959291168183, "grad_norm": 1.3284598588943481, "learning_rate": 9.057924941504596e-05, "loss": 1.8065, "step": 648 }, { "epoch": 0.21199624351802704, "grad_norm": 1.56967031955719, "learning_rate": 9.05487486475446e-05, "loss": 1.9164, "step": 649 }, { "epoch": 0.21232289412437222, "grad_norm": 1.61257803440094, "learning_rate": 9.05182037376728e-05, "loss": 1.6794, "step": 650 }, { "epoch": 0.2126495447307174, "grad_norm": 0.19036370515823364, "learning_rate": 9.048761471868248e-05, "loss": 0.7986, "step": 651 }, { "epoch": 0.2129761953370626, "grad_norm": 0.2137875109910965, "learning_rate": 9.04569816238736e-05, "loss": 0.7879, "step": 652 }, { "epoch": 0.2133028459434078, "grad_norm": 0.21933026611804962, "learning_rate": 9.042630448659413e-05, "loss": 0.8792, "step": 653 }, { "epoch": 0.21362949654975297, "grad_norm": 0.24495473504066467, "learning_rate": 9.039558334023991e-05, "loss": 0.8169, "step": 654 }, { "epoch": 0.21395614715609815, "grad_norm": 0.2623686194419861, "learning_rate": 9.03648182182548e-05, "loss": 0.8705, "step": 655 }, { "epoch": 0.21428279776244336, "grad_norm": 0.26370468735694885, "learning_rate": 9.033400915413044e-05, "loss": 0.9156, "step": 656 }, { "epoch": 0.21460944836878854, "grad_norm": 0.2694458067417145, "learning_rate": 9.030315618140634e-05, "loss": 0.9068, "step": 657 }, { "epoch": 0.21493609897513372, "grad_norm": 0.2986109256744385, "learning_rate": 9.027225933366982e-05, "loss": 0.9385, "step": 658 }, { "epoch": 0.21526274958147892, "grad_norm": 0.28405866026878357, "learning_rate": 9.024131864455594e-05, "loss": 0.8657, "step": 659 }, { "epoch": 0.2155894001878241, "grad_norm": 0.31199824810028076, "learning_rate": 9.02103341477475e-05, "loss": 0.9322, "step": 660 }, { "epoch": 0.21591605079416928, "grad_norm": 0.32993587851524353, "learning_rate": 9.017930587697501e-05, "loss": 0.9344, "step": 661 }, { "epoch": 0.21624270140051446, "grad_norm": 0.3460935652256012, "learning_rate": 9.014823386601658e-05, "loss": 0.9538, "step": 662 }, { "epoch": 0.21656935200685967, "grad_norm": 0.3695278763771057, "learning_rate": 9.011711814869798e-05, "loss": 0.9617, "step": 663 }, { "epoch": 0.21689600261320485, "grad_norm": 0.38950344920158386, "learning_rate": 9.008595875889258e-05, "loss": 0.8396, "step": 664 }, { "epoch": 0.21722265321955003, "grad_norm": 0.4560457468032837, "learning_rate": 9.005475573052123e-05, "loss": 1.1105, "step": 665 }, { "epoch": 0.21754930382589524, "grad_norm": 0.4376985728740692, "learning_rate": 9.002350909755231e-05, "loss": 0.9999, "step": 666 }, { "epoch": 0.21787595443224042, "grad_norm": 0.49906688928604126, "learning_rate": 8.999221889400171e-05, "loss": 1.1028, "step": 667 }, { "epoch": 0.2182026050385856, "grad_norm": 0.5753675699234009, "learning_rate": 8.996088515393268e-05, "loss": 1.1046, "step": 668 }, { "epoch": 0.21852925564493078, "grad_norm": 0.6120446920394897, "learning_rate": 8.992950791145596e-05, "loss": 1.1665, "step": 669 }, { "epoch": 0.218855906251276, "grad_norm": 0.8368681073188782, "learning_rate": 8.989808720072955e-05, "loss": 1.4047, "step": 670 }, { "epoch": 0.21918255685762117, "grad_norm": 0.9521639347076416, "learning_rate": 8.98666230559588e-05, "loss": 1.4259, "step": 671 }, { "epoch": 0.21950920746396635, "grad_norm": 1.2514474391937256, "learning_rate": 8.983511551139641e-05, "loss": 1.5017, "step": 672 }, { "epoch": 0.21983585807031153, "grad_norm": 1.1896357536315918, "learning_rate": 8.980356460134222e-05, "loss": 1.3464, "step": 673 }, { "epoch": 0.22016250867665674, "grad_norm": 1.2468774318695068, "learning_rate": 8.977197036014336e-05, "loss": 1.3893, "step": 674 }, { "epoch": 0.22048915928300192, "grad_norm": 2.550205707550049, "learning_rate": 8.974033282219407e-05, "loss": 2.4879, "step": 675 }, { "epoch": 0.2208158098893471, "grad_norm": 0.22337143123149872, "learning_rate": 8.970865202193581e-05, "loss": 0.8812, "step": 676 }, { "epoch": 0.2211424604956923, "grad_norm": 0.23587287962436676, "learning_rate": 8.967692799385702e-05, "loss": 0.9307, "step": 677 }, { "epoch": 0.22146911110203749, "grad_norm": 0.23437225818634033, "learning_rate": 8.964516077249331e-05, "loss": 0.858, "step": 678 }, { "epoch": 0.22179576170838267, "grad_norm": 0.24456767737865448, "learning_rate": 8.961335039242727e-05, "loss": 0.8753, "step": 679 }, { "epoch": 0.22212241231472785, "grad_norm": 0.2503317892551422, "learning_rate": 8.95814968882884e-05, "loss": 0.8468, "step": 680 }, { "epoch": 0.22244906292107305, "grad_norm": 0.24607540667057037, "learning_rate": 8.954960029475328e-05, "loss": 0.8255, "step": 681 }, { "epoch": 0.22277571352741823, "grad_norm": 0.28483763337135315, "learning_rate": 8.95176606465453e-05, "loss": 0.8317, "step": 682 }, { "epoch": 0.22310236413376341, "grad_norm": 0.2800973355770111, "learning_rate": 8.948567797843476e-05, "loss": 0.8759, "step": 683 }, { "epoch": 0.22342901474010862, "grad_norm": 0.2901217043399811, "learning_rate": 8.945365232523877e-05, "loss": 0.9152, "step": 684 }, { "epoch": 0.2237556653464538, "grad_norm": 0.2840677499771118, "learning_rate": 8.942158372182126e-05, "loss": 0.8851, "step": 685 }, { "epoch": 0.22408231595279898, "grad_norm": 0.29865285754203796, "learning_rate": 8.93894722030929e-05, "loss": 0.894, "step": 686 }, { "epoch": 0.22440896655914416, "grad_norm": 0.3416735827922821, "learning_rate": 8.935731780401109e-05, "loss": 0.9313, "step": 687 }, { "epoch": 0.22473561716548937, "grad_norm": 0.35249149799346924, "learning_rate": 8.93251205595799e-05, "loss": 0.9602, "step": 688 }, { "epoch": 0.22506226777183455, "grad_norm": 0.3721778094768524, "learning_rate": 8.929288050485005e-05, "loss": 0.9332, "step": 689 }, { "epoch": 0.22538891837817973, "grad_norm": 0.39727091789245605, "learning_rate": 8.926059767491884e-05, "loss": 1.0504, "step": 690 }, { "epoch": 0.22571556898452494, "grad_norm": 0.43423396348953247, "learning_rate": 8.922827210493019e-05, "loss": 1.1031, "step": 691 }, { "epoch": 0.22604221959087012, "grad_norm": 0.4453451335430145, "learning_rate": 8.919590383007448e-05, "loss": 0.9969, "step": 692 }, { "epoch": 0.2263688701972153, "grad_norm": 0.5197416543960571, "learning_rate": 8.916349288558865e-05, "loss": 1.1138, "step": 693 }, { "epoch": 0.22669552080356048, "grad_norm": 0.539627194404602, "learning_rate": 8.913103930675602e-05, "loss": 1.1913, "step": 694 }, { "epoch": 0.2270221714099057, "grad_norm": 0.670895516872406, "learning_rate": 8.90985431289064e-05, "loss": 1.2253, "step": 695 }, { "epoch": 0.22734882201625087, "grad_norm": 0.8996394872665405, "learning_rate": 8.906600438741589e-05, "loss": 1.415, "step": 696 }, { "epoch": 0.22767547262259605, "grad_norm": 0.964454174041748, "learning_rate": 8.9033423117707e-05, "loss": 1.4633, "step": 697 }, { "epoch": 0.22800212322894126, "grad_norm": 1.2644301652908325, "learning_rate": 8.900079935524849e-05, "loss": 1.4658, "step": 698 }, { "epoch": 0.22832877383528644, "grad_norm": 1.2433687448501587, "learning_rate": 8.896813313555539e-05, "loss": 1.7506, "step": 699 }, { "epoch": 0.22865542444163162, "grad_norm": 1.4691505432128906, "learning_rate": 8.893542449418897e-05, "loss": 1.9687, "step": 700 }, { "epoch": 0.2289820750479768, "grad_norm": 0.18912772834300995, "learning_rate": 8.890267346675666e-05, "loss": 0.811, "step": 701 }, { "epoch": 0.229308725654322, "grad_norm": 0.23038050532341003, "learning_rate": 8.886988008891205e-05, "loss": 0.8602, "step": 702 }, { "epoch": 0.22963537626066718, "grad_norm": 0.2359895408153534, "learning_rate": 8.883704439635479e-05, "loss": 0.8598, "step": 703 }, { "epoch": 0.22996202686701236, "grad_norm": 0.23904529213905334, "learning_rate": 8.880416642483063e-05, "loss": 0.813, "step": 704 }, { "epoch": 0.23028867747335757, "grad_norm": 0.25027212500572205, "learning_rate": 8.877124621013139e-05, "loss": 0.8003, "step": 705 }, { "epoch": 0.23061532807970275, "grad_norm": 0.27593323588371277, "learning_rate": 8.873828378809479e-05, "loss": 0.9057, "step": 706 }, { "epoch": 0.23094197868604793, "grad_norm": 0.2732234597206116, "learning_rate": 8.870527919460454e-05, "loss": 0.8575, "step": 707 }, { "epoch": 0.2312686292923931, "grad_norm": 0.28355199098587036, "learning_rate": 8.867223246559027e-05, "loss": 0.8595, "step": 708 }, { "epoch": 0.23159527989873832, "grad_norm": 0.2759125232696533, "learning_rate": 8.863914363702746e-05, "loss": 0.8502, "step": 709 }, { "epoch": 0.2319219305050835, "grad_norm": 0.2850385010242462, "learning_rate": 8.86060127449374e-05, "loss": 0.9327, "step": 710 }, { "epoch": 0.23224858111142868, "grad_norm": 0.29679635167121887, "learning_rate": 8.857283982538727e-05, "loss": 0.886, "step": 711 }, { "epoch": 0.2325752317177739, "grad_norm": 0.32602760195732117, "learning_rate": 8.853962491448985e-05, "loss": 0.847, "step": 712 }, { "epoch": 0.23290188232411907, "grad_norm": 0.35561874508857727, "learning_rate": 8.850636804840375e-05, "loss": 0.8877, "step": 713 }, { "epoch": 0.23322853293046425, "grad_norm": 0.38955962657928467, "learning_rate": 8.847306926333323e-05, "loss": 1.0865, "step": 714 }, { "epoch": 0.23355518353680943, "grad_norm": 0.4370516538619995, "learning_rate": 8.843972859552816e-05, "loss": 1.0979, "step": 715 }, { "epoch": 0.23388183414315464, "grad_norm": 0.476929247379303, "learning_rate": 8.8406346081284e-05, "loss": 1.1663, "step": 716 }, { "epoch": 0.23420848474949982, "grad_norm": 0.45586854219436646, "learning_rate": 8.837292175694178e-05, "loss": 1.1494, "step": 717 }, { "epoch": 0.234535135355845, "grad_norm": 0.49512070417404175, "learning_rate": 8.833945565888809e-05, "loss": 1.0131, "step": 718 }, { "epoch": 0.2348617859621902, "grad_norm": 0.5380144715309143, "learning_rate": 8.830594782355489e-05, "loss": 1.1249, "step": 719 }, { "epoch": 0.2351884365685354, "grad_norm": 0.5472076535224915, "learning_rate": 8.827239828741969e-05, "loss": 1.0563, "step": 720 }, { "epoch": 0.23551508717488057, "grad_norm": 0.7166975140571594, "learning_rate": 8.82388070870053e-05, "loss": 1.0809, "step": 721 }, { "epoch": 0.23584173778122575, "grad_norm": 0.8227059245109558, "learning_rate": 8.820517425887998e-05, "loss": 1.6083, "step": 722 }, { "epoch": 0.23616838838757095, "grad_norm": 1.0604398250579834, "learning_rate": 8.81714998396572e-05, "loss": 1.4259, "step": 723 }, { "epoch": 0.23649503899391613, "grad_norm": 1.3100907802581787, "learning_rate": 8.813778386599582e-05, "loss": 1.5418, "step": 724 }, { "epoch": 0.23682168960026131, "grad_norm": 1.9565879106521606, "learning_rate": 8.810402637459987e-05, "loss": 1.8868, "step": 725 }, { "epoch": 0.2371483402066065, "grad_norm": 0.19288013875484467, "learning_rate": 8.807022740221856e-05, "loss": 0.7707, "step": 726 }, { "epoch": 0.2374749908129517, "grad_norm": 0.2201738804578781, "learning_rate": 8.803638698564631e-05, "loss": 0.8566, "step": 727 }, { "epoch": 0.23780164141929688, "grad_norm": 0.24070677161216736, "learning_rate": 8.800250516172264e-05, "loss": 0.8972, "step": 728 }, { "epoch": 0.23812829202564206, "grad_norm": 0.25289085507392883, "learning_rate": 8.796858196733214e-05, "loss": 0.8939, "step": 729 }, { "epoch": 0.23845494263198727, "grad_norm": 0.25093331933021545, "learning_rate": 8.793461743940442e-05, "loss": 0.8377, "step": 730 }, { "epoch": 0.23878159323833245, "grad_norm": 0.2646547853946686, "learning_rate": 8.790061161491409e-05, "loss": 0.8811, "step": 731 }, { "epoch": 0.23910824384467763, "grad_norm": 0.27221933007240295, "learning_rate": 8.78665645308808e-05, "loss": 0.8527, "step": 732 }, { "epoch": 0.2394348944510228, "grad_norm": 0.2808360159397125, "learning_rate": 8.783247622436899e-05, "loss": 0.8598, "step": 733 }, { "epoch": 0.23976154505736802, "grad_norm": 0.2827470302581787, "learning_rate": 8.779834673248803e-05, "loss": 0.8815, "step": 734 }, { "epoch": 0.2400881956637132, "grad_norm": 0.3063890039920807, "learning_rate": 8.776417609239218e-05, "loss": 0.9354, "step": 735 }, { "epoch": 0.24041484627005838, "grad_norm": 0.3075849711894989, "learning_rate": 8.772996434128039e-05, "loss": 0.8594, "step": 736 }, { "epoch": 0.2407414968764036, "grad_norm": 0.3473495543003082, "learning_rate": 8.769571151639644e-05, "loss": 0.9897, "step": 737 }, { "epoch": 0.24106814748274877, "grad_norm": 0.33075860142707825, "learning_rate": 8.766141765502882e-05, "loss": 1.0453, "step": 738 }, { "epoch": 0.24139479808909395, "grad_norm": 0.40573230385780334, "learning_rate": 8.762708279451063e-05, "loss": 0.98, "step": 739 }, { "epoch": 0.24172144869543913, "grad_norm": 0.36625126004219055, "learning_rate": 8.759270697221971e-05, "loss": 0.9599, "step": 740 }, { "epoch": 0.24204809930178434, "grad_norm": 0.3789199888706207, "learning_rate": 8.755829022557837e-05, "loss": 0.9305, "step": 741 }, { "epoch": 0.24237474990812952, "grad_norm": 0.41750094294548035, "learning_rate": 8.752383259205355e-05, "loss": 0.8474, "step": 742 }, { "epoch": 0.2427014005144747, "grad_norm": 0.5106528401374817, "learning_rate": 8.748933410915671e-05, "loss": 1.1046, "step": 743 }, { "epoch": 0.2430280511208199, "grad_norm": 0.5671947002410889, "learning_rate": 8.745479481444372e-05, "loss": 1.2206, "step": 744 }, { "epoch": 0.24335470172716508, "grad_norm": 0.6028645634651184, "learning_rate": 8.742021474551492e-05, "loss": 1.2882, "step": 745 }, { "epoch": 0.24368135233351026, "grad_norm": 0.7777064442634583, "learning_rate": 8.738559394001503e-05, "loss": 1.3427, "step": 746 }, { "epoch": 0.24400800293985545, "grad_norm": 0.9058758616447449, "learning_rate": 8.735093243563311e-05, "loss": 1.0836, "step": 747 }, { "epoch": 0.24433465354620065, "grad_norm": 1.1198101043701172, "learning_rate": 8.731623027010254e-05, "loss": 1.7559, "step": 748 }, { "epoch": 0.24466130415254583, "grad_norm": 1.198984980583191, "learning_rate": 8.728148748120095e-05, "loss": 1.4122, "step": 749 }, { "epoch": 0.244987954758891, "grad_norm": 1.8164176940917969, "learning_rate": 8.72467041067502e-05, "loss": 1.6278, "step": 750 }, { "epoch": 0.24531460536523622, "grad_norm": 0.2089819461107254, "learning_rate": 8.721188018461633e-05, "loss": 0.7842, "step": 751 }, { "epoch": 0.2456412559715814, "grad_norm": 0.2276405692100525, "learning_rate": 8.717701575270953e-05, "loss": 0.8506, "step": 752 }, { "epoch": 0.24596790657792658, "grad_norm": 0.23214949667453766, "learning_rate": 8.714211084898407e-05, "loss": 0.7915, "step": 753 }, { "epoch": 0.24629455718427176, "grad_norm": 0.2502540051937103, "learning_rate": 8.71071655114383e-05, "loss": 0.8058, "step": 754 }, { "epoch": 0.24662120779061697, "grad_norm": 0.25934338569641113, "learning_rate": 8.707217977811456e-05, "loss": 0.9029, "step": 755 }, { "epoch": 0.24694785839696215, "grad_norm": 0.2814617455005646, "learning_rate": 8.703715368709922e-05, "loss": 0.87, "step": 756 }, { "epoch": 0.24727450900330733, "grad_norm": 0.27127784490585327, "learning_rate": 8.700208727652252e-05, "loss": 0.8441, "step": 757 }, { "epoch": 0.24760115960965254, "grad_norm": 0.2742736041545868, "learning_rate": 8.696698058455863e-05, "loss": 0.8699, "step": 758 }, { "epoch": 0.24792781021599772, "grad_norm": 0.2857213020324707, "learning_rate": 8.693183364942556e-05, "loss": 0.9757, "step": 759 }, { "epoch": 0.2482544608223429, "grad_norm": 0.30193600058555603, "learning_rate": 8.689664650938516e-05, "loss": 0.9146, "step": 760 }, { "epoch": 0.24858111142868808, "grad_norm": 0.33248287439346313, "learning_rate": 8.686141920274297e-05, "loss": 0.9358, "step": 761 }, { "epoch": 0.2489077620350333, "grad_norm": 0.3253381848335266, "learning_rate": 8.682615176784835e-05, "loss": 0.8875, "step": 762 }, { "epoch": 0.24923441264137847, "grad_norm": 0.35137176513671875, "learning_rate": 8.679084424309428e-05, "loss": 0.9901, "step": 763 }, { "epoch": 0.24956106324772365, "grad_norm": 0.38631191849708557, "learning_rate": 8.675549666691742e-05, "loss": 0.9687, "step": 764 }, { "epoch": 0.24988771385406885, "grad_norm": 0.3780824840068817, "learning_rate": 8.672010907779799e-05, "loss": 0.9143, "step": 765 }, { "epoch": 0.250214364460414, "grad_norm": 0.439392626285553, "learning_rate": 8.668468151425982e-05, "loss": 1.0443, "step": 766 }, { "epoch": 0.250214364460414, "eval_loss": 1.0732765197753906, "eval_runtime": 499.2422, "eval_samples_per_second": 5.164, "eval_steps_per_second": 2.582, "step": 766 }, { "epoch": 0.25054101506675924, "grad_norm": 0.4362111985683441, "learning_rate": 8.664921401487023e-05, "loss": 1.0782, "step": 767 }, { "epoch": 0.2508676656731044, "grad_norm": 0.4964749813079834, "learning_rate": 8.661370661824e-05, "loss": 1.1945, "step": 768 }, { "epoch": 0.2511943162794496, "grad_norm": 0.5608397722244263, "learning_rate": 8.657815936302337e-05, "loss": 1.2661, "step": 769 }, { "epoch": 0.2515209668857948, "grad_norm": 0.5862159132957458, "learning_rate": 8.654257228791795e-05, "loss": 1.0976, "step": 770 }, { "epoch": 0.25184761749213996, "grad_norm": 0.7066771984100342, "learning_rate": 8.650694543166475e-05, "loss": 1.274, "step": 771 }, { "epoch": 0.25217426809848514, "grad_norm": 0.7534899711608887, "learning_rate": 8.647127883304799e-05, "loss": 1.1669, "step": 772 }, { "epoch": 0.2525009187048303, "grad_norm": 0.9842721819877625, "learning_rate": 8.643557253089525e-05, "loss": 1.3533, "step": 773 }, { "epoch": 0.25282756931117556, "grad_norm": 1.294683814048767, "learning_rate": 8.639982656407729e-05, "loss": 1.7016, "step": 774 }, { "epoch": 0.25315421991752074, "grad_norm": 1.7022143602371216, "learning_rate": 8.636404097150802e-05, "loss": 1.9253, "step": 775 }, { "epoch": 0.2534808705238659, "grad_norm": 0.19106809794902802, "learning_rate": 8.632821579214456e-05, "loss": 0.7292, "step": 776 }, { "epoch": 0.2538075211302111, "grad_norm": 0.2291499525308609, "learning_rate": 8.629235106498708e-05, "loss": 0.8147, "step": 777 }, { "epoch": 0.2541341717365563, "grad_norm": 0.23071865737438202, "learning_rate": 8.625644682907879e-05, "loss": 0.7612, "step": 778 }, { "epoch": 0.25446082234290146, "grad_norm": 0.24651503562927246, "learning_rate": 8.622050312350594e-05, "loss": 0.874, "step": 779 }, { "epoch": 0.25478747294924664, "grad_norm": 0.2605278789997101, "learning_rate": 8.618451998739774e-05, "loss": 0.8279, "step": 780 }, { "epoch": 0.2551141235555919, "grad_norm": 0.2556253969669342, "learning_rate": 8.614849745992632e-05, "loss": 0.8052, "step": 781 }, { "epoch": 0.25544077416193706, "grad_norm": 0.28385311365127563, "learning_rate": 8.611243558030668e-05, "loss": 0.8039, "step": 782 }, { "epoch": 0.25576742476828224, "grad_norm": 0.303835928440094, "learning_rate": 8.60763343877967e-05, "loss": 0.9079, "step": 783 }, { "epoch": 0.2560940753746274, "grad_norm": 0.31251901388168335, "learning_rate": 8.604019392169702e-05, "loss": 0.8428, "step": 784 }, { "epoch": 0.2564207259809726, "grad_norm": 0.335475891828537, "learning_rate": 8.600401422135104e-05, "loss": 0.9193, "step": 785 }, { "epoch": 0.2567473765873178, "grad_norm": 0.3252756595611572, "learning_rate": 8.596779532614488e-05, "loss": 0.8837, "step": 786 }, { "epoch": 0.25707402719366296, "grad_norm": 0.3637792766094208, "learning_rate": 8.593153727550732e-05, "loss": 0.9312, "step": 787 }, { "epoch": 0.2574006778000082, "grad_norm": 0.4200628399848938, "learning_rate": 8.589524010890977e-05, "loss": 1.0147, "step": 788 }, { "epoch": 0.2577273284063534, "grad_norm": 0.3719130754470825, "learning_rate": 8.585890386586623e-05, "loss": 0.9036, "step": 789 }, { "epoch": 0.25805397901269855, "grad_norm": 0.4389430284500122, "learning_rate": 8.582252858593324e-05, "loss": 1.0327, "step": 790 }, { "epoch": 0.25838062961904373, "grad_norm": 0.445206880569458, "learning_rate": 8.578611430870979e-05, "loss": 1.1391, "step": 791 }, { "epoch": 0.2587072802253889, "grad_norm": 0.5291475057601929, "learning_rate": 8.574966107383744e-05, "loss": 1.2331, "step": 792 }, { "epoch": 0.2590339308317341, "grad_norm": 0.5588846802711487, "learning_rate": 8.5713168921e-05, "loss": 1.0723, "step": 793 }, { "epoch": 0.2593605814380793, "grad_norm": 0.6296544075012207, "learning_rate": 8.567663788992377e-05, "loss": 1.2545, "step": 794 }, { "epoch": 0.2596872320444245, "grad_norm": 0.6767513155937195, "learning_rate": 8.564006802037734e-05, "loss": 1.1453, "step": 795 }, { "epoch": 0.2600138826507697, "grad_norm": 0.7685205340385437, "learning_rate": 8.560345935217155e-05, "loss": 1.169, "step": 796 }, { "epoch": 0.26034053325711487, "grad_norm": 0.975924015045166, "learning_rate": 8.556681192515952e-05, "loss": 1.3188, "step": 797 }, { "epoch": 0.26066718386346005, "grad_norm": 1.1853704452514648, "learning_rate": 8.553012577923653e-05, "loss": 1.5078, "step": 798 }, { "epoch": 0.26099383446980523, "grad_norm": 1.6453309059143066, "learning_rate": 8.549340095434006e-05, "loss": 1.5953, "step": 799 }, { "epoch": 0.2613204850761504, "grad_norm": 1.7937952280044556, "learning_rate": 8.54566374904496e-05, "loss": 2.4296, "step": 800 }, { "epoch": 0.2616471356824956, "grad_norm": 0.190406933426857, "learning_rate": 8.541983542758685e-05, "loss": 0.7342, "step": 801 }, { "epoch": 0.2619737862888408, "grad_norm": 0.22838981449604034, "learning_rate": 8.538299480581538e-05, "loss": 0.7493, "step": 802 }, { "epoch": 0.262300436895186, "grad_norm": 0.2559290826320648, "learning_rate": 8.53461156652408e-05, "loss": 0.8504, "step": 803 }, { "epoch": 0.2626270875015312, "grad_norm": 0.2588362693786621, "learning_rate": 8.53091980460107e-05, "loss": 0.9, "step": 804 }, { "epoch": 0.26295373810787637, "grad_norm": 0.2569828927516937, "learning_rate": 8.527224198831447e-05, "loss": 0.813, "step": 805 }, { "epoch": 0.26328038871422155, "grad_norm": 0.2703791558742523, "learning_rate": 8.523524753238342e-05, "loss": 0.8616, "step": 806 }, { "epoch": 0.2636070393205667, "grad_norm": 0.27567189931869507, "learning_rate": 8.519821471849061e-05, "loss": 0.9035, "step": 807 }, { "epoch": 0.2639336899269119, "grad_norm": 0.2831965386867523, "learning_rate": 8.516114358695089e-05, "loss": 0.94, "step": 808 }, { "epoch": 0.2642603405332571, "grad_norm": 0.28172117471694946, "learning_rate": 8.51240341781208e-05, "loss": 0.8232, "step": 809 }, { "epoch": 0.2645869911396023, "grad_norm": 0.3024875521659851, "learning_rate": 8.508688653239858e-05, "loss": 0.8993, "step": 810 }, { "epoch": 0.2649136417459475, "grad_norm": 0.3354199528694153, "learning_rate": 8.504970069022404e-05, "loss": 0.89, "step": 811 }, { "epoch": 0.2652402923522927, "grad_norm": 0.34784260392189026, "learning_rate": 8.501247669207864e-05, "loss": 1.0391, "step": 812 }, { "epoch": 0.26556694295863786, "grad_norm": 0.3697127103805542, "learning_rate": 8.497521457848532e-05, "loss": 1.0011, "step": 813 }, { "epoch": 0.26589359356498304, "grad_norm": 0.37952542304992676, "learning_rate": 8.493791439000855e-05, "loss": 0.8449, "step": 814 }, { "epoch": 0.2662202441713282, "grad_norm": 0.42014080286026, "learning_rate": 8.490057616725424e-05, "loss": 0.9275, "step": 815 }, { "epoch": 0.2665468947776734, "grad_norm": 0.46746087074279785, "learning_rate": 8.48631999508697e-05, "loss": 1.2106, "step": 816 }, { "epoch": 0.26687354538401864, "grad_norm": 0.5447984933853149, "learning_rate": 8.482578578154361e-05, "loss": 1.1964, "step": 817 }, { "epoch": 0.2672001959903638, "grad_norm": 0.5636278390884399, "learning_rate": 8.478833370000594e-05, "loss": 1.1554, "step": 818 }, { "epoch": 0.267526846596709, "grad_norm": 0.6589205861091614, "learning_rate": 8.475084374702797e-05, "loss": 1.2816, "step": 819 }, { "epoch": 0.2678534972030542, "grad_norm": 0.716610312461853, "learning_rate": 8.47133159634222e-05, "loss": 1.2345, "step": 820 }, { "epoch": 0.26818014780939936, "grad_norm": 0.866495668888092, "learning_rate": 8.467575039004227e-05, "loss": 1.2124, "step": 821 }, { "epoch": 0.26850679841574454, "grad_norm": 1.089608073234558, "learning_rate": 8.463814706778304e-05, "loss": 1.69, "step": 822 }, { "epoch": 0.2688334490220897, "grad_norm": 1.1608694791793823, "learning_rate": 8.460050603758035e-05, "loss": 1.732, "step": 823 }, { "epoch": 0.26916009962843496, "grad_norm": 1.2007168531417847, "learning_rate": 8.456282734041121e-05, "loss": 1.83, "step": 824 }, { "epoch": 0.26948675023478014, "grad_norm": 1.6301062107086182, "learning_rate": 8.452511101729357e-05, "loss": 1.664, "step": 825 }, { "epoch": 0.2698134008411253, "grad_norm": 0.1741432547569275, "learning_rate": 8.448735710928635e-05, "loss": 0.6838, "step": 826 }, { "epoch": 0.2701400514474705, "grad_norm": 0.21807898581027985, "learning_rate": 8.444956565748937e-05, "loss": 0.8771, "step": 827 }, { "epoch": 0.2704667020538157, "grad_norm": 0.23946182429790497, "learning_rate": 8.441173670304337e-05, "loss": 0.7937, "step": 828 }, { "epoch": 0.27079335266016086, "grad_norm": 0.24787762761116028, "learning_rate": 8.437387028712984e-05, "loss": 0.8956, "step": 829 }, { "epoch": 0.27112000326650604, "grad_norm": 0.26188749074935913, "learning_rate": 8.433596645097114e-05, "loss": 0.8621, "step": 830 }, { "epoch": 0.2714466538728513, "grad_norm": 0.28136762976646423, "learning_rate": 8.429802523583032e-05, "loss": 0.8664, "step": 831 }, { "epoch": 0.27177330447919645, "grad_norm": 0.2663547992706299, "learning_rate": 8.42600466830111e-05, "loss": 0.8265, "step": 832 }, { "epoch": 0.27209995508554163, "grad_norm": 0.2984652817249298, "learning_rate": 8.422203083385791e-05, "loss": 0.9242, "step": 833 }, { "epoch": 0.2724266056918868, "grad_norm": 0.30643805861473083, "learning_rate": 8.418397772975571e-05, "loss": 0.914, "step": 834 }, { "epoch": 0.272753256298232, "grad_norm": 0.33440685272216797, "learning_rate": 8.414588741213004e-05, "loss": 0.8974, "step": 835 }, { "epoch": 0.2730799069045772, "grad_norm": 0.34194639325141907, "learning_rate": 8.410775992244699e-05, "loss": 0.9481, "step": 836 }, { "epoch": 0.27340655751092235, "grad_norm": 0.3455904722213745, "learning_rate": 8.406959530221308e-05, "loss": 0.8916, "step": 837 }, { "epoch": 0.2737332081172676, "grad_norm": 0.38675469160079956, "learning_rate": 8.403139359297526e-05, "loss": 0.981, "step": 838 }, { "epoch": 0.27405985872361277, "grad_norm": 0.38662129640579224, "learning_rate": 8.399315483632087e-05, "loss": 0.9426, "step": 839 }, { "epoch": 0.27438650932995795, "grad_norm": 0.4323444366455078, "learning_rate": 8.395487907387751e-05, "loss": 0.9693, "step": 840 }, { "epoch": 0.27471315993630313, "grad_norm": 0.4435561001300812, "learning_rate": 8.391656634731319e-05, "loss": 0.9983, "step": 841 }, { "epoch": 0.2750398105426483, "grad_norm": 0.5139126777648926, "learning_rate": 8.387821669833606e-05, "loss": 1.198, "step": 842 }, { "epoch": 0.2753664611489935, "grad_norm": 0.5524410605430603, "learning_rate": 8.383983016869448e-05, "loss": 1.18, "step": 843 }, { "epoch": 0.27569311175533867, "grad_norm": 0.6392261981964111, "learning_rate": 8.380140680017703e-05, "loss": 1.2321, "step": 844 }, { "epoch": 0.2760197623616839, "grad_norm": 0.6585685610771179, "learning_rate": 8.376294663461227e-05, "loss": 1.2603, "step": 845 }, { "epoch": 0.2763464129680291, "grad_norm": 0.8117931485176086, "learning_rate": 8.372444971386894e-05, "loss": 1.3741, "step": 846 }, { "epoch": 0.27667306357437427, "grad_norm": 0.9951660633087158, "learning_rate": 8.368591607985571e-05, "loss": 1.3947, "step": 847 }, { "epoch": 0.27699971418071945, "grad_norm": 1.1902111768722534, "learning_rate": 8.364734577452127e-05, "loss": 1.5964, "step": 848 }, { "epoch": 0.2773263647870646, "grad_norm": 1.2848105430603027, "learning_rate": 8.360873883985418e-05, "loss": 1.3661, "step": 849 }, { "epoch": 0.2776530153934098, "grad_norm": 1.5601998567581177, "learning_rate": 8.357009531788293e-05, "loss": 1.5122, "step": 850 }, { "epoch": 0.277979665999755, "grad_norm": 0.17106448113918304, "learning_rate": 8.353141525067579e-05, "loss": 0.6374, "step": 851 }, { "epoch": 0.2783063166061002, "grad_norm": 0.21366915106773376, "learning_rate": 8.349269868034087e-05, "loss": 0.8114, "step": 852 }, { "epoch": 0.2786329672124454, "grad_norm": 0.2161540687084198, "learning_rate": 8.345394564902594e-05, "loss": 0.8004, "step": 853 }, { "epoch": 0.2789596178187906, "grad_norm": 0.24288657307624817, "learning_rate": 8.341515619891856e-05, "loss": 0.8151, "step": 854 }, { "epoch": 0.27928626842513576, "grad_norm": 0.25571781396865845, "learning_rate": 8.337633037224583e-05, "loss": 0.8574, "step": 855 }, { "epoch": 0.27961291903148094, "grad_norm": 0.30678462982177734, "learning_rate": 8.333746821127455e-05, "loss": 0.8622, "step": 856 }, { "epoch": 0.2799395696378261, "grad_norm": 0.2756674289703369, "learning_rate": 8.329856975831103e-05, "loss": 0.8822, "step": 857 }, { "epoch": 0.2802662202441713, "grad_norm": 0.2979314923286438, "learning_rate": 8.325963505570104e-05, "loss": 0.8659, "step": 858 }, { "epoch": 0.28059287085051654, "grad_norm": 0.2941073775291443, "learning_rate": 8.322066414582992e-05, "loss": 0.8474, "step": 859 }, { "epoch": 0.2809195214568617, "grad_norm": 0.32220232486724854, "learning_rate": 8.318165707112233e-05, "loss": 0.865, "step": 860 }, { "epoch": 0.2812461720632069, "grad_norm": 0.33228668570518494, "learning_rate": 8.314261387404234e-05, "loss": 0.9357, "step": 861 }, { "epoch": 0.2815728226695521, "grad_norm": 0.3545335829257965, "learning_rate": 8.310353459709333e-05, "loss": 0.9402, "step": 862 }, { "epoch": 0.28189947327589726, "grad_norm": 0.3884226679801941, "learning_rate": 8.306441928281798e-05, "loss": 1.0393, "step": 863 }, { "epoch": 0.28222612388224244, "grad_norm": 0.40320566296577454, "learning_rate": 8.302526797379822e-05, "loss": 0.961, "step": 864 }, { "epoch": 0.2825527744885876, "grad_norm": 0.4174101948738098, "learning_rate": 8.298608071265507e-05, "loss": 0.9835, "step": 865 }, { "epoch": 0.28287942509493286, "grad_norm": 0.4447181820869446, "learning_rate": 8.29468575420488e-05, "loss": 1.0212, "step": 866 }, { "epoch": 0.28320607570127804, "grad_norm": 0.46445220708847046, "learning_rate": 8.29075985046787e-05, "loss": 1.0336, "step": 867 }, { "epoch": 0.2835327263076232, "grad_norm": 0.5036008358001709, "learning_rate": 8.286830364328314e-05, "loss": 1.0846, "step": 868 }, { "epoch": 0.2838593769139684, "grad_norm": 0.5870608687400818, "learning_rate": 8.282897300063946e-05, "loss": 1.051, "step": 869 }, { "epoch": 0.2841860275203136, "grad_norm": 0.6841760277748108, "learning_rate": 8.278960661956401e-05, "loss": 1.366, "step": 870 }, { "epoch": 0.28451267812665876, "grad_norm": 0.799982488155365, "learning_rate": 8.275020454291195e-05, "loss": 1.2628, "step": 871 }, { "epoch": 0.28483932873300394, "grad_norm": 1.000924825668335, "learning_rate": 8.271076681357741e-05, "loss": 1.4612, "step": 872 }, { "epoch": 0.2851659793393492, "grad_norm": 1.1383004188537598, "learning_rate": 8.267129347449322e-05, "loss": 1.6271, "step": 873 }, { "epoch": 0.28549262994569435, "grad_norm": 1.2377307415008545, "learning_rate": 8.26317845686311e-05, "loss": 1.2689, "step": 874 }, { "epoch": 0.28581928055203953, "grad_norm": 1.6466963291168213, "learning_rate": 8.259224013900137e-05, "loss": 1.6594, "step": 875 }, { "epoch": 0.2861459311583847, "grad_norm": 0.2105908840894699, "learning_rate": 8.255266022865309e-05, "loss": 0.7822, "step": 876 }, { "epoch": 0.2864725817647299, "grad_norm": 0.22548696398735046, "learning_rate": 8.251304488067393e-05, "loss": 0.7938, "step": 877 }, { "epoch": 0.2867992323710751, "grad_norm": 0.24404418468475342, "learning_rate": 8.247339413819015e-05, "loss": 0.7887, "step": 878 }, { "epoch": 0.28712588297742025, "grad_norm": 0.25363683700561523, "learning_rate": 8.243370804436649e-05, "loss": 0.9139, "step": 879 }, { "epoch": 0.2874525335837655, "grad_norm": 0.25927162170410156, "learning_rate": 8.239398664240627e-05, "loss": 0.8177, "step": 880 }, { "epoch": 0.28777918419011067, "grad_norm": 0.2730761170387268, "learning_rate": 8.235422997555114e-05, "loss": 0.9055, "step": 881 }, { "epoch": 0.28810583479645585, "grad_norm": 0.2785848081111908, "learning_rate": 8.231443808708122e-05, "loss": 0.9572, "step": 882 }, { "epoch": 0.28843248540280103, "grad_norm": 0.27791181206703186, "learning_rate": 8.227461102031493e-05, "loss": 0.803, "step": 883 }, { "epoch": 0.2887591360091462, "grad_norm": 0.28716158866882324, "learning_rate": 8.2234748818609e-05, "loss": 0.8582, "step": 884 }, { "epoch": 0.2890857866154914, "grad_norm": 0.3149946630001068, "learning_rate": 8.21948515253584e-05, "loss": 0.9189, "step": 885 }, { "epoch": 0.28941243722183657, "grad_norm": 0.3081023395061493, "learning_rate": 8.215491918399633e-05, "loss": 0.8939, "step": 886 }, { "epoch": 0.2897390878281818, "grad_norm": 0.31537654995918274, "learning_rate": 8.211495183799413e-05, "loss": 0.8625, "step": 887 }, { "epoch": 0.290065738434527, "grad_norm": 0.3406592905521393, "learning_rate": 8.20749495308612e-05, "loss": 0.9604, "step": 888 }, { "epoch": 0.29039238904087217, "grad_norm": 0.3376745581626892, "learning_rate": 8.20349123061451e-05, "loss": 0.8982, "step": 889 }, { "epoch": 0.29071903964721735, "grad_norm": 0.3791499435901642, "learning_rate": 8.19948402074313e-05, "loss": 0.9649, "step": 890 }, { "epoch": 0.29104569025356253, "grad_norm": 0.4459807872772217, "learning_rate": 8.195473327834329e-05, "loss": 1.1565, "step": 891 }, { "epoch": 0.2913723408599077, "grad_norm": 0.45107483863830566, "learning_rate": 8.191459156254247e-05, "loss": 0.9899, "step": 892 }, { "epoch": 0.2916989914662529, "grad_norm": 0.50025475025177, "learning_rate": 8.187441510372808e-05, "loss": 1.1841, "step": 893 }, { "epoch": 0.2920256420725981, "grad_norm": 0.5542461276054382, "learning_rate": 8.183420394563724e-05, "loss": 1.183, "step": 894 }, { "epoch": 0.2923522926789433, "grad_norm": 0.6245063543319702, "learning_rate": 8.179395813204477e-05, "loss": 1.3295, "step": 895 }, { "epoch": 0.2926789432852885, "grad_norm": 0.7166218757629395, "learning_rate": 8.17536777067633e-05, "loss": 1.3525, "step": 896 }, { "epoch": 0.29300559389163366, "grad_norm": 0.9630147814750671, "learning_rate": 8.171336271364308e-05, "loss": 1.4074, "step": 897 }, { "epoch": 0.29333224449797884, "grad_norm": 1.0214118957519531, "learning_rate": 8.167301319657201e-05, "loss": 1.4875, "step": 898 }, { "epoch": 0.293658895104324, "grad_norm": 1.444045901298523, "learning_rate": 8.163262919947557e-05, "loss": 1.6154, "step": 899 }, { "epoch": 0.2939855457106692, "grad_norm": 1.9283764362335205, "learning_rate": 8.159221076631678e-05, "loss": 2.093, "step": 900 }, { "epoch": 0.29431219631701444, "grad_norm": 0.2009824812412262, "learning_rate": 8.155175794109614e-05, "loss": 0.8172, "step": 901 }, { "epoch": 0.2946388469233596, "grad_norm": 0.21653543412685394, "learning_rate": 8.15112707678516e-05, "loss": 0.8646, "step": 902 }, { "epoch": 0.2949654975297048, "grad_norm": 0.26212337613105774, "learning_rate": 8.14707492906585e-05, "loss": 0.9042, "step": 903 }, { "epoch": 0.29529214813605, "grad_norm": 0.26977843046188354, "learning_rate": 8.143019355362952e-05, "loss": 0.8568, "step": 904 }, { "epoch": 0.29561879874239516, "grad_norm": 0.2698366045951843, "learning_rate": 8.138960360091463e-05, "loss": 0.8116, "step": 905 }, { "epoch": 0.29594544934874034, "grad_norm": 0.29477614164352417, "learning_rate": 8.134897947670108e-05, "loss": 0.8601, "step": 906 }, { "epoch": 0.2962720999550855, "grad_norm": 0.2830169200897217, "learning_rate": 8.130832122521327e-05, "loss": 0.8487, "step": 907 }, { "epoch": 0.29659875056143076, "grad_norm": 0.3099175691604614, "learning_rate": 8.12676288907128e-05, "loss": 0.9419, "step": 908 }, { "epoch": 0.29692540116777594, "grad_norm": 0.3113427758216858, "learning_rate": 8.122690251749834e-05, "loss": 0.94, "step": 909 }, { "epoch": 0.2972520517741211, "grad_norm": 0.317848265171051, "learning_rate": 8.118614214990561e-05, "loss": 0.9457, "step": 910 }, { "epoch": 0.2975787023804663, "grad_norm": 0.32310110330581665, "learning_rate": 8.114534783230739e-05, "loss": 0.93, "step": 911 }, { "epoch": 0.2979053529868115, "grad_norm": 0.3455216884613037, "learning_rate": 8.110451960911333e-05, "loss": 0.9539, "step": 912 }, { "epoch": 0.29823200359315666, "grad_norm": 0.3817412853240967, "learning_rate": 8.106365752477012e-05, "loss": 1.0052, "step": 913 }, { "epoch": 0.29855865419950184, "grad_norm": 0.39544934034347534, "learning_rate": 8.102276162376117e-05, "loss": 0.9988, "step": 914 }, { "epoch": 0.298885304805847, "grad_norm": 0.43939855694770813, "learning_rate": 8.09818319506068e-05, "loss": 1.1415, "step": 915 }, { "epoch": 0.29921195541219225, "grad_norm": 0.442859947681427, "learning_rate": 8.094086854986405e-05, "loss": 1.0522, "step": 916 }, { "epoch": 0.29953860601853743, "grad_norm": 0.4725445508956909, "learning_rate": 8.089987146612669e-05, "loss": 1.0607, "step": 917 }, { "epoch": 0.2998652566248826, "grad_norm": 0.5423445701599121, "learning_rate": 8.085884074402518e-05, "loss": 1.1992, "step": 918 }, { "epoch": 0.3001919072312278, "grad_norm": 0.5503812432289124, "learning_rate": 8.081777642822657e-05, "loss": 1.1793, "step": 919 }, { "epoch": 0.300518557837573, "grad_norm": 0.5893210172653198, "learning_rate": 8.077667856343449e-05, "loss": 1.0644, "step": 920 }, { "epoch": 0.30084520844391816, "grad_norm": 0.7819690108299255, "learning_rate": 8.073554719438908e-05, "loss": 1.4015, "step": 921 }, { "epoch": 0.30117185905026334, "grad_norm": 0.8868228793144226, "learning_rate": 8.069438236586695e-05, "loss": 1.3257, "step": 922 }, { "epoch": 0.30149850965660857, "grad_norm": 1.201609492301941, "learning_rate": 8.065318412268119e-05, "loss": 1.3868, "step": 923 }, { "epoch": 0.30182516026295375, "grad_norm": 1.1145105361938477, "learning_rate": 8.061195250968121e-05, "loss": 1.4812, "step": 924 }, { "epoch": 0.30215181086929893, "grad_norm": 1.567140817642212, "learning_rate": 8.057068757175276e-05, "loss": 1.737, "step": 925 }, { "epoch": 0.3024784614756441, "grad_norm": 0.21669495105743408, "learning_rate": 8.052938935381786e-05, "loss": 0.7445, "step": 926 }, { "epoch": 0.3028051120819893, "grad_norm": 0.22615468502044678, "learning_rate": 8.048805790083481e-05, "loss": 0.8108, "step": 927 }, { "epoch": 0.30313176268833447, "grad_norm": 0.2774372100830078, "learning_rate": 8.0446693257798e-05, "loss": 0.8657, "step": 928 }, { "epoch": 0.30345841329467965, "grad_norm": 0.2653120756149292, "learning_rate": 8.040529546973805e-05, "loss": 0.8441, "step": 929 }, { "epoch": 0.3037850639010249, "grad_norm": 0.303362637758255, "learning_rate": 8.036386458172161e-05, "loss": 0.869, "step": 930 }, { "epoch": 0.30411171450737007, "grad_norm": 0.29433003067970276, "learning_rate": 8.032240063885133e-05, "loss": 0.8341, "step": 931 }, { "epoch": 0.30443836511371525, "grad_norm": 0.301315575838089, "learning_rate": 8.028090368626591e-05, "loss": 0.8746, "step": 932 }, { "epoch": 0.30476501572006043, "grad_norm": 0.30583563446998596, "learning_rate": 8.023937376913996e-05, "loss": 0.8948, "step": 933 }, { "epoch": 0.3050916663264056, "grad_norm": 0.3299638330936432, "learning_rate": 8.019781093268396e-05, "loss": 0.8029, "step": 934 }, { "epoch": 0.3054183169327508, "grad_norm": 0.3463474214076996, "learning_rate": 8.015621522214429e-05, "loss": 0.8575, "step": 935 }, { "epoch": 0.30574496753909597, "grad_norm": 0.3461097776889801, "learning_rate": 8.0114586682803e-05, "loss": 0.9876, "step": 936 }, { "epoch": 0.3060716181454412, "grad_norm": 0.354095458984375, "learning_rate": 8.007292535997799e-05, "loss": 0.9575, "step": 937 }, { "epoch": 0.3063982687517864, "grad_norm": 0.3798564076423645, "learning_rate": 8.00312312990228e-05, "loss": 1.0164, "step": 938 }, { "epoch": 0.30672491935813156, "grad_norm": 0.3784736692905426, "learning_rate": 7.998950454532662e-05, "loss": 0.97, "step": 939 }, { "epoch": 0.30705156996447674, "grad_norm": 0.40412405133247375, "learning_rate": 7.99477451443142e-05, "loss": 0.9975, "step": 940 }, { "epoch": 0.3073782205708219, "grad_norm": 0.4350655674934387, "learning_rate": 7.990595314144587e-05, "loss": 1.005, "step": 941 }, { "epoch": 0.3077048711771671, "grad_norm": 0.4765323996543884, "learning_rate": 7.986412858221746e-05, "loss": 1.0305, "step": 942 }, { "epoch": 0.3080315217835123, "grad_norm": 0.5018781423568726, "learning_rate": 7.982227151216019e-05, "loss": 1.0618, "step": 943 }, { "epoch": 0.3083581723898575, "grad_norm": 0.574617862701416, "learning_rate": 7.978038197684073e-05, "loss": 1.1572, "step": 944 }, { "epoch": 0.3086848229962027, "grad_norm": 0.6548877358436584, "learning_rate": 7.973846002186103e-05, "loss": 1.2569, "step": 945 }, { "epoch": 0.3090114736025479, "grad_norm": 0.7577599883079529, "learning_rate": 7.969650569285839e-05, "loss": 1.3086, "step": 946 }, { "epoch": 0.30933812420889306, "grad_norm": 0.8888007998466492, "learning_rate": 7.965451903550531e-05, "loss": 1.1316, "step": 947 }, { "epoch": 0.30966477481523824, "grad_norm": 1.0013303756713867, "learning_rate": 7.961250009550953e-05, "loss": 1.3751, "step": 948 }, { "epoch": 0.3099914254215834, "grad_norm": 1.2923164367675781, "learning_rate": 7.95704489186139e-05, "loss": 1.5979, "step": 949 }, { "epoch": 0.3103180760279286, "grad_norm": 1.940639853477478, "learning_rate": 7.952836555059635e-05, "loss": 2.0671, "step": 950 }, { "epoch": 0.31064472663427384, "grad_norm": 0.20804435014724731, "learning_rate": 7.94862500372699e-05, "loss": 0.7342, "step": 951 }, { "epoch": 0.310971377240619, "grad_norm": 0.22356468439102173, "learning_rate": 7.944410242448253e-05, "loss": 0.8373, "step": 952 }, { "epoch": 0.3112980278469642, "grad_norm": 0.22745850682258606, "learning_rate": 7.940192275811717e-05, "loss": 0.8846, "step": 953 }, { "epoch": 0.3116246784533094, "grad_norm": 0.22750817239284515, "learning_rate": 7.935971108409166e-05, "loss": 0.8154, "step": 954 }, { "epoch": 0.31195132905965456, "grad_norm": 0.26259639859199524, "learning_rate": 7.931746744835865e-05, "loss": 0.8305, "step": 955 }, { "epoch": 0.31227797966599974, "grad_norm": 0.26343998312950134, "learning_rate": 7.927519189690562e-05, "loss": 0.9104, "step": 956 }, { "epoch": 0.3126046302723449, "grad_norm": 0.28506144881248474, "learning_rate": 7.923288447575479e-05, "loss": 0.8731, "step": 957 }, { "epoch": 0.31293128087869015, "grad_norm": 0.2877998948097229, "learning_rate": 7.919054523096306e-05, "loss": 0.9688, "step": 958 }, { "epoch": 0.31325793148503533, "grad_norm": 0.2984800636768341, "learning_rate": 7.914817420862196e-05, "loss": 0.8833, "step": 959 }, { "epoch": 0.3135845820913805, "grad_norm": 0.3453596234321594, "learning_rate": 7.910577145485765e-05, "loss": 0.8775, "step": 960 }, { "epoch": 0.3139112326977257, "grad_norm": 0.3432406187057495, "learning_rate": 7.906333701583082e-05, "loss": 0.8728, "step": 961 }, { "epoch": 0.3142378833040709, "grad_norm": 0.3565903604030609, "learning_rate": 7.902087093773663e-05, "loss": 0.9814, "step": 962 }, { "epoch": 0.31456453391041606, "grad_norm": 0.3908556401729584, "learning_rate": 7.897837326680473e-05, "loss": 1.0056, "step": 963 }, { "epoch": 0.31489118451676124, "grad_norm": 0.392490029335022, "learning_rate": 7.89358440492991e-05, "loss": 1.0565, "step": 964 }, { "epoch": 0.31521783512310647, "grad_norm": 0.4333183765411377, "learning_rate": 7.889328333151814e-05, "loss": 1.0835, "step": 965 }, { "epoch": 0.31554448572945165, "grad_norm": 0.5302640199661255, "learning_rate": 7.885069115979447e-05, "loss": 1.112, "step": 966 }, { "epoch": 0.31587113633579683, "grad_norm": 0.5419654846191406, "learning_rate": 7.880806758049499e-05, "loss": 1.166, "step": 967 }, { "epoch": 0.316197786942142, "grad_norm": 0.5007505416870117, "learning_rate": 7.876541264002078e-05, "loss": 0.9182, "step": 968 }, { "epoch": 0.3165244375484872, "grad_norm": 0.6056339144706726, "learning_rate": 7.872272638480706e-05, "loss": 1.2492, "step": 969 }, { "epoch": 0.31685108815483237, "grad_norm": 0.7134028673171997, "learning_rate": 7.868000886132316e-05, "loss": 1.3831, "step": 970 }, { "epoch": 0.31717773876117755, "grad_norm": 0.7521294355392456, "learning_rate": 7.863726011607243e-05, "loss": 1.4182, "step": 971 }, { "epoch": 0.3175043893675228, "grad_norm": 0.9576146602630615, "learning_rate": 7.859448019559217e-05, "loss": 1.386, "step": 972 }, { "epoch": 0.31783103997386797, "grad_norm": 1.1564505100250244, "learning_rate": 7.855166914645372e-05, "loss": 1.4906, "step": 973 }, { "epoch": 0.31815769058021315, "grad_norm": 1.5212230682373047, "learning_rate": 7.850882701526218e-05, "loss": 1.6107, "step": 974 }, { "epoch": 0.31848434118655833, "grad_norm": 2.16402268409729, "learning_rate": 7.846595384865662e-05, "loss": 2.8417, "step": 975 }, { "epoch": 0.3188109917929035, "grad_norm": 0.20599240064620972, "learning_rate": 7.84230496933098e-05, "loss": 0.794, "step": 976 }, { "epoch": 0.3191376423992487, "grad_norm": 0.21791720390319824, "learning_rate": 7.838011459592824e-05, "loss": 0.8069, "step": 977 }, { "epoch": 0.31946429300559387, "grad_norm": 0.24189670383930206, "learning_rate": 7.833714860325215e-05, "loss": 0.8253, "step": 978 }, { "epoch": 0.3197909436119391, "grad_norm": 0.23616375029087067, "learning_rate": 7.829415176205539e-05, "loss": 0.8416, "step": 979 }, { "epoch": 0.3201175942182843, "grad_norm": 0.2511419951915741, "learning_rate": 7.825112411914535e-05, "loss": 0.8737, "step": 980 }, { "epoch": 0.32044424482462947, "grad_norm": 0.24740338325500488, "learning_rate": 7.820806572136301e-05, "loss": 0.6864, "step": 981 }, { "epoch": 0.32077089543097465, "grad_norm": 0.27360159158706665, "learning_rate": 7.81649766155828e-05, "loss": 0.9015, "step": 982 }, { "epoch": 0.3210975460373198, "grad_norm": 0.27167263627052307, "learning_rate": 7.812185684871261e-05, "loss": 0.8641, "step": 983 }, { "epoch": 0.321424196643665, "grad_norm": 0.27509331703186035, "learning_rate": 7.807870646769364e-05, "loss": 0.8864, "step": 984 }, { "epoch": 0.3217508472500102, "grad_norm": 0.2927476167678833, "learning_rate": 7.80355255195005e-05, "loss": 0.8999, "step": 985 }, { "epoch": 0.3220774978563554, "grad_norm": 0.30333781242370605, "learning_rate": 7.799231405114102e-05, "loss": 0.8984, "step": 986 }, { "epoch": 0.3224041484627006, "grad_norm": 0.29891061782836914, "learning_rate": 7.794907210965627e-05, "loss": 0.8042, "step": 987 }, { "epoch": 0.3227307990690458, "grad_norm": 0.31980669498443604, "learning_rate": 7.790579974212052e-05, "loss": 0.9125, "step": 988 }, { "epoch": 0.32305744967539096, "grad_norm": 0.3497137427330017, "learning_rate": 7.78624969956411e-05, "loss": 1.0623, "step": 989 }, { "epoch": 0.32338410028173614, "grad_norm": 0.3825185298919678, "learning_rate": 7.781916391735847e-05, "loss": 0.9571, "step": 990 }, { "epoch": 0.3237107508880813, "grad_norm": 0.44200360774993896, "learning_rate": 7.77758005544461e-05, "loss": 0.9618, "step": 991 }, { "epoch": 0.3240374014944265, "grad_norm": 0.4567243754863739, "learning_rate": 7.773240695411042e-05, "loss": 1.0149, "step": 992 }, { "epoch": 0.32436405210077174, "grad_norm": 0.5161774158477783, "learning_rate": 7.768898316359076e-05, "loss": 1.1654, "step": 993 }, { "epoch": 0.3246907027071169, "grad_norm": 0.5306985974311829, "learning_rate": 7.764552923015935e-05, "loss": 1.1412, "step": 994 }, { "epoch": 0.3250173533134621, "grad_norm": 0.6168984174728394, "learning_rate": 7.76020452011212e-05, "loss": 1.158, "step": 995 }, { "epoch": 0.3253440039198073, "grad_norm": 0.707940936088562, "learning_rate": 7.755853112381411e-05, "loss": 1.3226, "step": 996 }, { "epoch": 0.32567065452615246, "grad_norm": 0.8782923817634583, "learning_rate": 7.751498704560858e-05, "loss": 1.3097, "step": 997 }, { "epoch": 0.32599730513249764, "grad_norm": 1.1435363292694092, "learning_rate": 7.747141301390777e-05, "loss": 1.5753, "step": 998 }, { "epoch": 0.3263239557388428, "grad_norm": 1.199203372001648, "learning_rate": 7.742780907614742e-05, "loss": 1.2035, "step": 999 }, { "epoch": 0.32665060634518805, "grad_norm": 1.617131233215332, "learning_rate": 7.73841752797959e-05, "loss": 1.5258, "step": 1000 }, { "epoch": 0.32697725695153323, "grad_norm": 0.21024766564369202, "learning_rate": 7.734051167235404e-05, "loss": 0.8054, "step": 1001 }, { "epoch": 0.3273039075578784, "grad_norm": 0.23412029445171356, "learning_rate": 7.729681830135506e-05, "loss": 0.8206, "step": 1002 }, { "epoch": 0.3276305581642236, "grad_norm": 0.26261237263679504, "learning_rate": 7.725309521436473e-05, "loss": 0.8588, "step": 1003 }, { "epoch": 0.3279572087705688, "grad_norm": 0.2516871690750122, "learning_rate": 7.720934245898101e-05, "loss": 0.842, "step": 1004 }, { "epoch": 0.32828385937691396, "grad_norm": 0.268623411655426, "learning_rate": 7.716556008283428e-05, "loss": 0.7826, "step": 1005 }, { "epoch": 0.32861050998325914, "grad_norm": 0.282501220703125, "learning_rate": 7.712174813358709e-05, "loss": 0.8606, "step": 1006 }, { "epoch": 0.32893716058960437, "grad_norm": 0.2929759621620178, "learning_rate": 7.707790665893422e-05, "loss": 0.9098, "step": 1007 }, { "epoch": 0.32926381119594955, "grad_norm": 0.30565446615219116, "learning_rate": 7.703403570660259e-05, "loss": 0.905, "step": 1008 }, { "epoch": 0.32959046180229473, "grad_norm": 0.3160889446735382, "learning_rate": 7.699013532435119e-05, "loss": 0.9065, "step": 1009 }, { "epoch": 0.3299171124086399, "grad_norm": 0.34107425808906555, "learning_rate": 7.694620555997107e-05, "loss": 0.9131, "step": 1010 }, { "epoch": 0.3302437630149851, "grad_norm": 0.3465249538421631, "learning_rate": 7.690224646128526e-05, "loss": 0.8906, "step": 1011 }, { "epoch": 0.3305704136213303, "grad_norm": 0.36077815294265747, "learning_rate": 7.685825807614872e-05, "loss": 0.9187, "step": 1012 }, { "epoch": 0.33089706422767545, "grad_norm": 0.37028247117996216, "learning_rate": 7.681424045244829e-05, "loss": 0.912, "step": 1013 }, { "epoch": 0.3312237148340207, "grad_norm": 0.3931543827056885, "learning_rate": 7.677019363810268e-05, "loss": 1.0148, "step": 1014 }, { "epoch": 0.33155036544036587, "grad_norm": 0.4445292055606842, "learning_rate": 7.672611768106227e-05, "loss": 1.015, "step": 1015 }, { "epoch": 0.33187701604671105, "grad_norm": 0.49006712436676025, "learning_rate": 7.668201262930927e-05, "loss": 0.9695, "step": 1016 }, { "epoch": 0.33220366665305623, "grad_norm": 0.5494130253791809, "learning_rate": 7.663787853085755e-05, "loss": 1.2143, "step": 1017 }, { "epoch": 0.3325303172594014, "grad_norm": 0.5494057536125183, "learning_rate": 7.659371543375258e-05, "loss": 1.1659, "step": 1018 }, { "epoch": 0.3328569678657466, "grad_norm": 0.5971184372901917, "learning_rate": 7.654952338607137e-05, "loss": 1.1892, "step": 1019 }, { "epoch": 0.33318361847209177, "grad_norm": 0.7538458704948425, "learning_rate": 7.650530243592248e-05, "loss": 1.1604, "step": 1020 }, { "epoch": 0.33351026907843695, "grad_norm": 0.9583027362823486, "learning_rate": 7.646105263144595e-05, "loss": 1.3371, "step": 1021 }, { "epoch": 0.3338369196847822, "grad_norm": 1.0476845502853394, "learning_rate": 7.64167740208132e-05, "loss": 1.3216, "step": 1022 }, { "epoch": 0.33416357029112737, "grad_norm": 1.0443453788757324, "learning_rate": 7.637246665222704e-05, "loss": 1.225, "step": 1023 }, { "epoch": 0.33449022089747255, "grad_norm": 1.3026036024093628, "learning_rate": 7.632813057392151e-05, "loss": 1.4609, "step": 1024 }, { "epoch": 0.3348168715038177, "grad_norm": 1.555985450744629, "learning_rate": 7.628376583416204e-05, "loss": 1.7155, "step": 1025 }, { "epoch": 0.3351435221101629, "grad_norm": 0.19100616872310638, "learning_rate": 7.623937248124513e-05, "loss": 0.7413, "step": 1026 }, { "epoch": 0.3354701727165081, "grad_norm": 0.22931453585624695, "learning_rate": 7.619495056349849e-05, "loss": 0.894, "step": 1027 }, { "epoch": 0.33579682332285327, "grad_norm": 0.2343945950269699, "learning_rate": 7.615050012928092e-05, "loss": 0.8266, "step": 1028 }, { "epoch": 0.3361234739291985, "grad_norm": 0.27221325039863586, "learning_rate": 7.610602122698227e-05, "loss": 0.8433, "step": 1029 }, { "epoch": 0.3364501245355437, "grad_norm": 0.2684449553489685, "learning_rate": 7.606151390502337e-05, "loss": 0.7658, "step": 1030 }, { "epoch": 0.33677677514188886, "grad_norm": 0.2658660411834717, "learning_rate": 7.6016978211856e-05, "loss": 0.8912, "step": 1031 }, { "epoch": 0.33710342574823404, "grad_norm": 0.27399852871894836, "learning_rate": 7.597241419596279e-05, "loss": 0.8434, "step": 1032 }, { "epoch": 0.3374300763545792, "grad_norm": 0.27935051918029785, "learning_rate": 7.592782190585725e-05, "loss": 0.8468, "step": 1033 }, { "epoch": 0.3377567269609244, "grad_norm": 0.3147827684879303, "learning_rate": 7.588320139008365e-05, "loss": 0.8781, "step": 1034 }, { "epoch": 0.3380833775672696, "grad_norm": 0.31882667541503906, "learning_rate": 7.583855269721697e-05, "loss": 0.9354, "step": 1035 }, { "epoch": 0.3384100281736148, "grad_norm": 0.3448106348514557, "learning_rate": 7.579387587586292e-05, "loss": 0.8532, "step": 1036 }, { "epoch": 0.33873667877996, "grad_norm": 0.3815080225467682, "learning_rate": 7.574917097465774e-05, "loss": 0.945, "step": 1037 }, { "epoch": 0.3390633293863052, "grad_norm": 0.3913522958755493, "learning_rate": 7.570443804226833e-05, "loss": 0.8925, "step": 1038 }, { "epoch": 0.33938997999265036, "grad_norm": 0.42671769857406616, "learning_rate": 7.565967712739205e-05, "loss": 1.0106, "step": 1039 }, { "epoch": 0.33971663059899554, "grad_norm": 0.4863613247871399, "learning_rate": 7.561488827875675e-05, "loss": 1.0284, "step": 1040 }, { "epoch": 0.3400432812053407, "grad_norm": 0.507326066493988, "learning_rate": 7.557007154512065e-05, "loss": 1.0476, "step": 1041 }, { "epoch": 0.3403699318116859, "grad_norm": 0.5292708277702332, "learning_rate": 7.55252269752724e-05, "loss": 1.0151, "step": 1042 }, { "epoch": 0.34069658241803114, "grad_norm": 0.5429087281227112, "learning_rate": 7.548035461803087e-05, "loss": 1.0862, "step": 1043 }, { "epoch": 0.3410232330243763, "grad_norm": 0.6534260511398315, "learning_rate": 7.543545452224523e-05, "loss": 1.1288, "step": 1044 }, { "epoch": 0.3413498836307215, "grad_norm": 0.8366156816482544, "learning_rate": 7.539052673679483e-05, "loss": 1.5115, "step": 1045 }, { "epoch": 0.3416765342370667, "grad_norm": 0.8968381881713867, "learning_rate": 7.534557131058917e-05, "loss": 1.1607, "step": 1046 }, { "epoch": 0.34200318484341186, "grad_norm": 1.1521880626678467, "learning_rate": 7.530058829256785e-05, "loss": 1.2325, "step": 1047 }, { "epoch": 0.34232983544975704, "grad_norm": 1.234046459197998, "learning_rate": 7.525557773170048e-05, "loss": 1.6835, "step": 1048 }, { "epoch": 0.3426564860561022, "grad_norm": 2.0295400619506836, "learning_rate": 7.521053967698669e-05, "loss": 2.3934, "step": 1049 }, { "epoch": 0.34298313666244745, "grad_norm": 2.4439151287078857, "learning_rate": 7.516547417745598e-05, "loss": 2.534, "step": 1050 }, { "epoch": 0.34330978726879263, "grad_norm": 0.20437824726104736, "learning_rate": 7.512038128216782e-05, "loss": 0.8084, "step": 1051 }, { "epoch": 0.3436364378751378, "grad_norm": 0.22019585967063904, "learning_rate": 7.507526104021141e-05, "loss": 0.7823, "step": 1052 }, { "epoch": 0.343963088481483, "grad_norm": 0.24627773463726044, "learning_rate": 7.503011350070579e-05, "loss": 0.87, "step": 1053 }, { "epoch": 0.3442897390878282, "grad_norm": 0.2620931565761566, "learning_rate": 7.498493871279967e-05, "loss": 0.8776, "step": 1054 }, { "epoch": 0.34461638969417335, "grad_norm": 0.2705698609352112, "learning_rate": 7.493973672567144e-05, "loss": 0.8141, "step": 1055 }, { "epoch": 0.34494304030051853, "grad_norm": 0.2702180743217468, "learning_rate": 7.48945075885291e-05, "loss": 0.871, "step": 1056 }, { "epoch": 0.34526969090686377, "grad_norm": 0.29457515478134155, "learning_rate": 7.484925135061022e-05, "loss": 0.9953, "step": 1057 }, { "epoch": 0.34559634151320895, "grad_norm": 0.27578428387641907, "learning_rate": 7.480396806118186e-05, "loss": 0.8458, "step": 1058 }, { "epoch": 0.34592299211955413, "grad_norm": 0.3459113836288452, "learning_rate": 7.475865776954051e-05, "loss": 0.9171, "step": 1059 }, { "epoch": 0.3462496427258993, "grad_norm": 0.300814151763916, "learning_rate": 7.47133205250121e-05, "loss": 0.9346, "step": 1060 }, { "epoch": 0.3465762933322445, "grad_norm": 0.31863105297088623, "learning_rate": 7.466795637695184e-05, "loss": 0.9133, "step": 1061 }, { "epoch": 0.34690294393858967, "grad_norm": 0.3595605790615082, "learning_rate": 7.462256537474429e-05, "loss": 0.9152, "step": 1062 }, { "epoch": 0.34722959454493485, "grad_norm": 0.3383408784866333, "learning_rate": 7.457714756780322e-05, "loss": 0.8437, "step": 1063 }, { "epoch": 0.3475562451512801, "grad_norm": 0.3689708709716797, "learning_rate": 7.453170300557156e-05, "loss": 1.0964, "step": 1064 }, { "epoch": 0.34788289575762527, "grad_norm": 0.3869151175022125, "learning_rate": 7.448623173752139e-05, "loss": 1.0209, "step": 1065 }, { "epoch": 0.34820954636397045, "grad_norm": 0.4113345146179199, "learning_rate": 7.444073381315388e-05, "loss": 1.0003, "step": 1066 }, { "epoch": 0.3485361969703156, "grad_norm": 0.4470776915550232, "learning_rate": 7.439520928199917e-05, "loss": 1.0222, "step": 1067 }, { "epoch": 0.3488628475766608, "grad_norm": 0.4541364908218384, "learning_rate": 7.434965819361638e-05, "loss": 0.9672, "step": 1068 }, { "epoch": 0.349189498183006, "grad_norm": 0.4776630103588104, "learning_rate": 7.430408059759357e-05, "loss": 1.1079, "step": 1069 }, { "epoch": 0.34951614878935117, "grad_norm": 0.5205119252204895, "learning_rate": 7.425847654354764e-05, "loss": 0.9712, "step": 1070 }, { "epoch": 0.3498427993956964, "grad_norm": 0.58984375, "learning_rate": 7.421284608112431e-05, "loss": 1.2611, "step": 1071 }, { "epoch": 0.3501694500020416, "grad_norm": 0.8614017367362976, "learning_rate": 7.416718925999797e-05, "loss": 1.2577, "step": 1072 }, { "epoch": 0.35049610060838676, "grad_norm": 0.9795945286750793, "learning_rate": 7.412150612987182e-05, "loss": 1.2588, "step": 1073 }, { "epoch": 0.35082275121473194, "grad_norm": 1.1379746198654175, "learning_rate": 7.407579674047763e-05, "loss": 1.1972, "step": 1074 }, { "epoch": 0.3511494018210771, "grad_norm": 1.3634546995162964, "learning_rate": 7.403006114157575e-05, "loss": 1.9137, "step": 1075 }, { "epoch": 0.3514760524274223, "grad_norm": 0.19136269390583038, "learning_rate": 7.398429938295511e-05, "loss": 0.6746, "step": 1076 }, { "epoch": 0.3518027030337675, "grad_norm": 0.25908541679382324, "learning_rate": 7.393851151443307e-05, "loss": 0.8532, "step": 1077 }, { "epoch": 0.3521293536401127, "grad_norm": 0.2595674991607666, "learning_rate": 7.389269758585546e-05, "loss": 0.8023, "step": 1078 }, { "epoch": 0.3524560042464579, "grad_norm": 0.27125805616378784, "learning_rate": 7.384685764709645e-05, "loss": 0.8122, "step": 1079 }, { "epoch": 0.3527826548528031, "grad_norm": 0.282625675201416, "learning_rate": 7.380099174805852e-05, "loss": 0.878, "step": 1080 }, { "epoch": 0.35310930545914826, "grad_norm": 0.28404882550239563, "learning_rate": 7.375509993867242e-05, "loss": 0.7957, "step": 1081 }, { "epoch": 0.35343595606549344, "grad_norm": 0.3119284212589264, "learning_rate": 7.370918226889713e-05, "loss": 0.8864, "step": 1082 }, { "epoch": 0.3537626066718386, "grad_norm": 0.3108453154563904, "learning_rate": 7.366323878871973e-05, "loss": 0.9002, "step": 1083 }, { "epoch": 0.3540892572781838, "grad_norm": 0.3398928642272949, "learning_rate": 7.361726954815547e-05, "loss": 0.8979, "step": 1084 }, { "epoch": 0.35441590788452904, "grad_norm": 0.33085235953330994, "learning_rate": 7.357127459724755e-05, "loss": 0.9091, "step": 1085 }, { "epoch": 0.3547425584908742, "grad_norm": 0.39169177412986755, "learning_rate": 7.352525398606724e-05, "loss": 0.8724, "step": 1086 }, { "epoch": 0.3550692090972194, "grad_norm": 0.3578439950942993, "learning_rate": 7.347920776471374e-05, "loss": 0.8364, "step": 1087 }, { "epoch": 0.3553958597035646, "grad_norm": 0.34978634119033813, "learning_rate": 7.343313598331406e-05, "loss": 0.8644, "step": 1088 }, { "epoch": 0.35572251030990976, "grad_norm": 0.40582436323165894, "learning_rate": 7.33870386920231e-05, "loss": 1.0534, "step": 1089 }, { "epoch": 0.35604916091625494, "grad_norm": 0.4203055202960968, "learning_rate": 7.33409159410235e-05, "loss": 0.9638, "step": 1090 }, { "epoch": 0.3563758115226001, "grad_norm": 0.45389387011528015, "learning_rate": 7.329476778052565e-05, "loss": 1.0034, "step": 1091 }, { "epoch": 0.35670246212894535, "grad_norm": 0.47722360491752625, "learning_rate": 7.324859426076756e-05, "loss": 0.9551, "step": 1092 }, { "epoch": 0.35702911273529053, "grad_norm": 0.4822474718093872, "learning_rate": 7.320239543201489e-05, "loss": 1.0724, "step": 1093 }, { "epoch": 0.3573557633416357, "grad_norm": 0.559536337852478, "learning_rate": 7.315617134456079e-05, "loss": 1.2312, "step": 1094 }, { "epoch": 0.3576824139479809, "grad_norm": 0.5810602307319641, "learning_rate": 7.310992204872595e-05, "loss": 0.9705, "step": 1095 }, { "epoch": 0.3580090645543261, "grad_norm": 0.7775752544403076, "learning_rate": 7.306364759485853e-05, "loss": 1.3561, "step": 1096 }, { "epoch": 0.35833571516067125, "grad_norm": 0.8379966020584106, "learning_rate": 7.301734803333403e-05, "loss": 1.2233, "step": 1097 }, { "epoch": 0.35866236576701643, "grad_norm": 0.927143931388855, "learning_rate": 7.297102341455528e-05, "loss": 1.3993, "step": 1098 }, { "epoch": 0.35898901637336167, "grad_norm": 1.2338497638702393, "learning_rate": 7.292467378895243e-05, "loss": 1.2431, "step": 1099 }, { "epoch": 0.35931566697970685, "grad_norm": 1.6832832098007202, "learning_rate": 7.28782992069828e-05, "loss": 1.5705, "step": 1100 }, { "epoch": 0.35964231758605203, "grad_norm": 0.2085774838924408, "learning_rate": 7.283189971913094e-05, "loss": 0.8405, "step": 1101 }, { "epoch": 0.3599689681923972, "grad_norm": 0.23439809679985046, "learning_rate": 7.278547537590845e-05, "loss": 0.7714, "step": 1102 }, { "epoch": 0.3602956187987424, "grad_norm": 0.25842559337615967, "learning_rate": 7.273902622785405e-05, "loss": 0.8394, "step": 1103 }, { "epoch": 0.36062226940508757, "grad_norm": 0.2644081115722656, "learning_rate": 7.269255232553339e-05, "loss": 0.8266, "step": 1104 }, { "epoch": 0.36094892001143275, "grad_norm": 0.26839661598205566, "learning_rate": 7.264605371953915e-05, "loss": 0.8877, "step": 1105 }, { "epoch": 0.361275570617778, "grad_norm": 0.2822781503200531, "learning_rate": 7.259953046049084e-05, "loss": 0.9052, "step": 1106 }, { "epoch": 0.36160222122412317, "grad_norm": 0.30244433879852295, "learning_rate": 7.255298259903482e-05, "loss": 0.9289, "step": 1107 }, { "epoch": 0.36192887183046835, "grad_norm": 0.29979464411735535, "learning_rate": 7.250641018584428e-05, "loss": 0.9039, "step": 1108 }, { "epoch": 0.3622555224368135, "grad_norm": 0.31162193417549133, "learning_rate": 7.245981327161905e-05, "loss": 0.968, "step": 1109 }, { "epoch": 0.3625821730431587, "grad_norm": 0.32680559158325195, "learning_rate": 7.241319190708575e-05, "loss": 0.8592, "step": 1110 }, { "epoch": 0.3629088236495039, "grad_norm": 0.332279235124588, "learning_rate": 7.236654614299748e-05, "loss": 0.8753, "step": 1111 }, { "epoch": 0.36323547425584907, "grad_norm": 0.36247286200523376, "learning_rate": 7.231987603013401e-05, "loss": 0.9475, "step": 1112 }, { "epoch": 0.3635621248621943, "grad_norm": 0.3807973563671112, "learning_rate": 7.227318161930157e-05, "loss": 0.9499, "step": 1113 }, { "epoch": 0.3638887754685395, "grad_norm": 0.40924564003944397, "learning_rate": 7.222646296133287e-05, "loss": 0.9879, "step": 1114 }, { "epoch": 0.36421542607488466, "grad_norm": 0.43880122900009155, "learning_rate": 7.217972010708696e-05, "loss": 0.9948, "step": 1115 }, { "epoch": 0.36454207668122984, "grad_norm": 0.4344307780265808, "learning_rate": 7.213295310744928e-05, "loss": 1.0174, "step": 1116 }, { "epoch": 0.364868727287575, "grad_norm": 0.4451688528060913, "learning_rate": 7.208616201333156e-05, "loss": 0.9336, "step": 1117 }, { "epoch": 0.3651953778939202, "grad_norm": 0.5209891200065613, "learning_rate": 7.203934687567173e-05, "loss": 1.0403, "step": 1118 }, { "epoch": 0.3655220285002654, "grad_norm": 0.5560318231582642, "learning_rate": 7.199250774543391e-05, "loss": 1.0855, "step": 1119 }, { "epoch": 0.3658486791066106, "grad_norm": 0.6434129476547241, "learning_rate": 7.194564467360834e-05, "loss": 1.1184, "step": 1120 }, { "epoch": 0.3661753297129558, "grad_norm": 0.8035168051719666, "learning_rate": 7.189875771121129e-05, "loss": 1.1999, "step": 1121 }, { "epoch": 0.366501980319301, "grad_norm": 0.9151061177253723, "learning_rate": 7.18518469092851e-05, "loss": 1.2509, "step": 1122 }, { "epoch": 0.36682863092564616, "grad_norm": 1.119385004043579, "learning_rate": 7.180491231889802e-05, "loss": 1.6826, "step": 1123 }, { "epoch": 0.36715528153199134, "grad_norm": 1.2659269571304321, "learning_rate": 7.17579539911442e-05, "loss": 1.6055, "step": 1124 }, { "epoch": 0.3674819321383365, "grad_norm": 1.693357229232788, "learning_rate": 7.171097197714363e-05, "loss": 1.8833, "step": 1125 }, { "epoch": 0.3678085827446817, "grad_norm": 0.19237908720970154, "learning_rate": 7.166396632804212e-05, "loss": 0.646, "step": 1126 }, { "epoch": 0.3681352333510269, "grad_norm": 0.25181078910827637, "learning_rate": 7.161693709501114e-05, "loss": 0.8216, "step": 1127 }, { "epoch": 0.3684618839573721, "grad_norm": 0.26569420099258423, "learning_rate": 7.156988432924791e-05, "loss": 0.8657, "step": 1128 }, { "epoch": 0.3687885345637173, "grad_norm": 0.2794082462787628, "learning_rate": 7.152280808197522e-05, "loss": 0.7595, "step": 1129 }, { "epoch": 0.3691151851700625, "grad_norm": 0.2683536112308502, "learning_rate": 7.147570840444145e-05, "loss": 0.8327, "step": 1130 }, { "epoch": 0.36944183577640766, "grad_norm": 0.27695196866989136, "learning_rate": 7.142858534792045e-05, "loss": 0.8344, "step": 1131 }, { "epoch": 0.36976848638275284, "grad_norm": 0.28606536984443665, "learning_rate": 7.138143896371157e-05, "loss": 0.8636, "step": 1132 }, { "epoch": 0.370095136989098, "grad_norm": 0.2818695902824402, "learning_rate": 7.133426930313951e-05, "loss": 0.8219, "step": 1133 }, { "epoch": 0.3704217875954432, "grad_norm": 0.3215191960334778, "learning_rate": 7.128707641755434e-05, "loss": 0.9497, "step": 1134 }, { "epoch": 0.37074843820178843, "grad_norm": 0.3110451400279999, "learning_rate": 7.123986035833141e-05, "loss": 0.8942, "step": 1135 }, { "epoch": 0.3710750888081336, "grad_norm": 0.31207698583602905, "learning_rate": 7.119262117687127e-05, "loss": 0.9575, "step": 1136 }, { "epoch": 0.3714017394144788, "grad_norm": 0.314654678106308, "learning_rate": 7.114535892459967e-05, "loss": 0.8195, "step": 1137 }, { "epoch": 0.371728390020824, "grad_norm": 0.34628891944885254, "learning_rate": 7.109807365296748e-05, "loss": 0.8786, "step": 1138 }, { "epoch": 0.37205504062716915, "grad_norm": 0.3640112578868866, "learning_rate": 7.105076541345058e-05, "loss": 0.9262, "step": 1139 }, { "epoch": 0.37238169123351433, "grad_norm": 0.4010256230831146, "learning_rate": 7.100343425754993e-05, "loss": 0.9402, "step": 1140 }, { "epoch": 0.3727083418398595, "grad_norm": 0.4014125466346741, "learning_rate": 7.095608023679138e-05, "loss": 1.0435, "step": 1141 }, { "epoch": 0.37303499244620475, "grad_norm": 0.46626025438308716, "learning_rate": 7.090870340272568e-05, "loss": 0.992, "step": 1142 }, { "epoch": 0.37336164305254993, "grad_norm": 0.5319213271141052, "learning_rate": 7.086130380692841e-05, "loss": 1.1192, "step": 1143 }, { "epoch": 0.3736882936588951, "grad_norm": 0.5756850838661194, "learning_rate": 7.081388150099999e-05, "loss": 1.1887, "step": 1144 }, { "epoch": 0.3740149442652403, "grad_norm": 0.6586177349090576, "learning_rate": 7.076643653656549e-05, "loss": 1.2207, "step": 1145 }, { "epoch": 0.37434159487158547, "grad_norm": 0.6644164323806763, "learning_rate": 7.071896896527464e-05, "loss": 1.194, "step": 1146 }, { "epoch": 0.37466824547793065, "grad_norm": 0.9249393939971924, "learning_rate": 7.067147883880185e-05, "loss": 1.3419, "step": 1147 }, { "epoch": 0.37499489608427583, "grad_norm": 1.1207913160324097, "learning_rate": 7.062396620884605e-05, "loss": 1.5119, "step": 1148 }, { "epoch": 0.37532154669062107, "grad_norm": 1.3481462001800537, "learning_rate": 7.057643112713063e-05, "loss": 1.2274, "step": 1149 }, { "epoch": 0.37564819729696625, "grad_norm": 1.6863646507263184, "learning_rate": 7.05288736454035e-05, "loss": 2.1351, "step": 1150 }, { "epoch": 0.3759748479033114, "grad_norm": 0.19038569927215576, "learning_rate": 7.048129381543687e-05, "loss": 0.6988, "step": 1151 }, { "epoch": 0.3763014985096566, "grad_norm": 0.2359238564968109, "learning_rate": 7.043369168902732e-05, "loss": 0.8121, "step": 1152 }, { "epoch": 0.3766281491160018, "grad_norm": 0.25924691557884216, "learning_rate": 7.038606731799574e-05, "loss": 0.8618, "step": 1153 }, { "epoch": 0.37695479972234697, "grad_norm": 0.24869892001152039, "learning_rate": 7.033842075418718e-05, "loss": 0.8703, "step": 1154 }, { "epoch": 0.37728145032869215, "grad_norm": 0.2840399742126465, "learning_rate": 7.029075204947085e-05, "loss": 0.7961, "step": 1155 }, { "epoch": 0.3776081009350374, "grad_norm": 0.2748403549194336, "learning_rate": 7.024306125574009e-05, "loss": 0.9051, "step": 1156 }, { "epoch": 0.37793475154138256, "grad_norm": 0.284496009349823, "learning_rate": 7.019534842491228e-05, "loss": 0.8884, "step": 1157 }, { "epoch": 0.37826140214772774, "grad_norm": 0.3085760176181793, "learning_rate": 7.014761360892882e-05, "loss": 0.9337, "step": 1158 }, { "epoch": 0.3785880527540729, "grad_norm": 0.3391752243041992, "learning_rate": 7.009985685975495e-05, "loss": 0.9858, "step": 1159 }, { "epoch": 0.3789147033604181, "grad_norm": 0.3305496871471405, "learning_rate": 7.005207822937988e-05, "loss": 0.8196, "step": 1160 }, { "epoch": 0.3792413539667633, "grad_norm": 0.34923458099365234, "learning_rate": 7.00042777698166e-05, "loss": 0.908, "step": 1161 }, { "epoch": 0.37956800457310846, "grad_norm": 0.41269630193710327, "learning_rate": 6.99564555331019e-05, "loss": 0.9519, "step": 1162 }, { "epoch": 0.3798946551794537, "grad_norm": 0.3687300682067871, "learning_rate": 6.990861157129622e-05, "loss": 1.0597, "step": 1163 }, { "epoch": 0.3802213057857989, "grad_norm": 0.40411537885665894, "learning_rate": 6.986074593648367e-05, "loss": 0.9359, "step": 1164 }, { "epoch": 0.38054795639214406, "grad_norm": 0.4321344792842865, "learning_rate": 6.981285868077198e-05, "loss": 1.0351, "step": 1165 }, { "epoch": 0.38087460699848924, "grad_norm": 0.4461156725883484, "learning_rate": 6.976494985629242e-05, "loss": 1.0586, "step": 1166 }, { "epoch": 0.3812012576048344, "grad_norm": 0.5063657164573669, "learning_rate": 6.971701951519972e-05, "loss": 1.163, "step": 1167 }, { "epoch": 0.3815279082111796, "grad_norm": 0.5494873523712158, "learning_rate": 6.966906770967199e-05, "loss": 1.1235, "step": 1168 }, { "epoch": 0.3818545588175248, "grad_norm": 0.6220507621765137, "learning_rate": 6.962109449191077e-05, "loss": 1.2436, "step": 1169 }, { "epoch": 0.38218120942387, "grad_norm": 0.6774858832359314, "learning_rate": 6.957309991414092e-05, "loss": 1.3258, "step": 1170 }, { "epoch": 0.3825078600302152, "grad_norm": 0.8799903392791748, "learning_rate": 6.952508402861051e-05, "loss": 1.4338, "step": 1171 }, { "epoch": 0.3828345106365604, "grad_norm": 1.1180598735809326, "learning_rate": 6.94770468875908e-05, "loss": 1.8223, "step": 1172 }, { "epoch": 0.38316116124290556, "grad_norm": 1.154464602470398, "learning_rate": 6.942898854337621e-05, "loss": 1.3532, "step": 1173 }, { "epoch": 0.38348781184925074, "grad_norm": 1.271112084388733, "learning_rate": 6.938090904828428e-05, "loss": 1.5536, "step": 1174 }, { "epoch": 0.3838144624555959, "grad_norm": 1.7882853746414185, "learning_rate": 6.933280845465551e-05, "loss": 1.9543, "step": 1175 }, { "epoch": 0.3841411130619411, "grad_norm": 0.1903381198644638, "learning_rate": 6.92846868148534e-05, "loss": 0.6825, "step": 1176 }, { "epoch": 0.38446776366828633, "grad_norm": 0.21171070635318756, "learning_rate": 6.923654418126434e-05, "loss": 0.7353, "step": 1177 }, { "epoch": 0.3847944142746315, "grad_norm": 0.22646358609199524, "learning_rate": 6.918838060629762e-05, "loss": 0.7659, "step": 1178 }, { "epoch": 0.3851210648809767, "grad_norm": 0.2556747496128082, "learning_rate": 6.914019614238527e-05, "loss": 0.8941, "step": 1179 }, { "epoch": 0.3854477154873219, "grad_norm": 0.278025358915329, "learning_rate": 6.909199084198212e-05, "loss": 0.8642, "step": 1180 }, { "epoch": 0.38577436609366705, "grad_norm": 0.2806093692779541, "learning_rate": 6.904376475756563e-05, "loss": 0.8426, "step": 1181 }, { "epoch": 0.38610101670001223, "grad_norm": 0.3061642646789551, "learning_rate": 6.899551794163592e-05, "loss": 0.9506, "step": 1182 }, { "epoch": 0.3864276673063574, "grad_norm": 0.31807222962379456, "learning_rate": 6.894725044671566e-05, "loss": 0.896, "step": 1183 }, { "epoch": 0.38675431791270265, "grad_norm": 0.330069899559021, "learning_rate": 6.889896232535004e-05, "loss": 0.9255, "step": 1184 }, { "epoch": 0.38708096851904783, "grad_norm": 0.3431348502635956, "learning_rate": 6.885065363010671e-05, "loss": 0.9511, "step": 1185 }, { "epoch": 0.387407619125393, "grad_norm": 0.3761681020259857, "learning_rate": 6.88023244135757e-05, "loss": 0.9307, "step": 1186 }, { "epoch": 0.3877342697317382, "grad_norm": 0.352243572473526, "learning_rate": 6.875397472836937e-05, "loss": 0.9937, "step": 1187 }, { "epoch": 0.38806092033808337, "grad_norm": 0.38508570194244385, "learning_rate": 6.870560462712243e-05, "loss": 0.95, "step": 1188 }, { "epoch": 0.38838757094442855, "grad_norm": 0.41849932074546814, "learning_rate": 6.865721416249175e-05, "loss": 1.0807, "step": 1189 }, { "epoch": 0.38871422155077373, "grad_norm": 0.4687519669532776, "learning_rate": 6.860880338715638e-05, "loss": 1.0743, "step": 1190 }, { "epoch": 0.38904087215711897, "grad_norm": 0.4491303563117981, "learning_rate": 6.856037235381746e-05, "loss": 1.1039, "step": 1191 }, { "epoch": 0.38936752276346415, "grad_norm": 0.5182649493217468, "learning_rate": 6.851192111519826e-05, "loss": 0.9833, "step": 1192 }, { "epoch": 0.3896941733698093, "grad_norm": 0.5373132824897766, "learning_rate": 6.846344972404399e-05, "loss": 1.1743, "step": 1193 }, { "epoch": 0.3900208239761545, "grad_norm": 0.5523989796638489, "learning_rate": 6.841495823312177e-05, "loss": 1.1548, "step": 1194 }, { "epoch": 0.3903474745824997, "grad_norm": 0.6846272349357605, "learning_rate": 6.836644669522065e-05, "loss": 1.2027, "step": 1195 }, { "epoch": 0.39067412518884487, "grad_norm": 0.7508804202079773, "learning_rate": 6.831791516315151e-05, "loss": 1.2353, "step": 1196 }, { "epoch": 0.39100077579519005, "grad_norm": 1.0120075941085815, "learning_rate": 6.826936368974696e-05, "loss": 1.6322, "step": 1197 }, { "epoch": 0.3913274264015353, "grad_norm": 1.1982024908065796, "learning_rate": 6.822079232786134e-05, "loss": 1.674, "step": 1198 }, { "epoch": 0.39165407700788046, "grad_norm": 1.4332988262176514, "learning_rate": 6.817220113037062e-05, "loss": 1.2769, "step": 1199 }, { "epoch": 0.39198072761422564, "grad_norm": 1.8049565553665161, "learning_rate": 6.81235901501724e-05, "loss": 1.7269, "step": 1200 }, { "epoch": 0.3923073782205708, "grad_norm": 0.20991112291812897, "learning_rate": 6.807495944018577e-05, "loss": 0.7704, "step": 1201 }, { "epoch": 0.392634028826916, "grad_norm": 0.23148652911186218, "learning_rate": 6.802630905335137e-05, "loss": 0.7713, "step": 1202 }, { "epoch": 0.3929606794332612, "grad_norm": 0.24124076962471008, "learning_rate": 6.797763904263115e-05, "loss": 0.8331, "step": 1203 }, { "epoch": 0.39328733003960636, "grad_norm": 0.2620198130607605, "learning_rate": 6.792894946100854e-05, "loss": 0.8113, "step": 1204 }, { "epoch": 0.3936139806459516, "grad_norm": 0.26299262046813965, "learning_rate": 6.788024036148821e-05, "loss": 0.8568, "step": 1205 }, { "epoch": 0.3939406312522968, "grad_norm": 0.27623122930526733, "learning_rate": 6.783151179709609e-05, "loss": 0.8808, "step": 1206 }, { "epoch": 0.39426728185864196, "grad_norm": 0.3048432171344757, "learning_rate": 6.778276382087926e-05, "loss": 0.9913, "step": 1207 }, { "epoch": 0.39459393246498714, "grad_norm": 0.3026992678642273, "learning_rate": 6.773399648590602e-05, "loss": 0.9271, "step": 1208 }, { "epoch": 0.3949205830713323, "grad_norm": 0.3055267632007599, "learning_rate": 6.768520984526569e-05, "loss": 0.8672, "step": 1209 }, { "epoch": 0.3952472336776775, "grad_norm": 0.2968289256095886, "learning_rate": 6.76364039520686e-05, "loss": 0.7734, "step": 1210 }, { "epoch": 0.3955738842840227, "grad_norm": 0.32888948917388916, "learning_rate": 6.758757885944608e-05, "loss": 0.9411, "step": 1211 }, { "epoch": 0.3959005348903679, "grad_norm": 0.334842711687088, "learning_rate": 6.75387346205503e-05, "loss": 0.9304, "step": 1212 }, { "epoch": 0.3962271854967131, "grad_norm": 0.3538110852241516, "learning_rate": 6.74898712885543e-05, "loss": 1.0136, "step": 1213 }, { "epoch": 0.3965538361030583, "grad_norm": 0.4119766652584076, "learning_rate": 6.744098891665194e-05, "loss": 0.9688, "step": 1214 }, { "epoch": 0.39688048670940346, "grad_norm": 0.40925925970077515, "learning_rate": 6.739208755805778e-05, "loss": 0.9954, "step": 1215 }, { "epoch": 0.39720713731574864, "grad_norm": 0.4552319049835205, "learning_rate": 6.734316726600702e-05, "loss": 0.96, "step": 1216 }, { "epoch": 0.3975337879220938, "grad_norm": 0.45739418268203735, "learning_rate": 6.729422809375551e-05, "loss": 0.9417, "step": 1217 }, { "epoch": 0.397860438528439, "grad_norm": 0.5212276577949524, "learning_rate": 6.724527009457966e-05, "loss": 1.0401, "step": 1218 }, { "epoch": 0.39818708913478423, "grad_norm": 0.5565895438194275, "learning_rate": 6.719629332177634e-05, "loss": 1.1682, "step": 1219 }, { "epoch": 0.3985137397411294, "grad_norm": 0.6466187834739685, "learning_rate": 6.714729782866291e-05, "loss": 1.2577, "step": 1220 }, { "epoch": 0.3988403903474746, "grad_norm": 0.7221575975418091, "learning_rate": 6.709828366857702e-05, "loss": 1.2678, "step": 1221 }, { "epoch": 0.3991670409538198, "grad_norm": 0.9043142199516296, "learning_rate": 6.704925089487675e-05, "loss": 1.3762, "step": 1222 }, { "epoch": 0.39949369156016495, "grad_norm": 1.0332460403442383, "learning_rate": 6.700019956094035e-05, "loss": 1.2477, "step": 1223 }, { "epoch": 0.39982034216651013, "grad_norm": 1.4478338956832886, "learning_rate": 6.695112972016633e-05, "loss": 1.4505, "step": 1224 }, { "epoch": 0.4001469927728553, "grad_norm": 1.8933912515640259, "learning_rate": 6.690204142597333e-05, "loss": 1.4914, "step": 1225 }, { "epoch": 0.40047364337920055, "grad_norm": 0.21706634759902954, "learning_rate": 6.68529347318001e-05, "loss": 0.7785, "step": 1226 }, { "epoch": 0.40080029398554573, "grad_norm": 0.2556523084640503, "learning_rate": 6.680380969110537e-05, "loss": 0.8661, "step": 1227 }, { "epoch": 0.4011269445918909, "grad_norm": 0.2508922815322876, "learning_rate": 6.67546663573679e-05, "loss": 0.86, "step": 1228 }, { "epoch": 0.4014535951982361, "grad_norm": 0.2551276385784149, "learning_rate": 6.670550478408632e-05, "loss": 0.8221, "step": 1229 }, { "epoch": 0.40178024580458127, "grad_norm": 0.2717500627040863, "learning_rate": 6.665632502477914e-05, "loss": 0.8709, "step": 1230 }, { "epoch": 0.40210689641092645, "grad_norm": 0.2777538299560547, "learning_rate": 6.660712713298468e-05, "loss": 0.8241, "step": 1231 }, { "epoch": 0.40243354701727163, "grad_norm": 0.29997768998146057, "learning_rate": 6.655791116226094e-05, "loss": 0.8535, "step": 1232 }, { "epoch": 0.4027601976236168, "grad_norm": 0.3282143771648407, "learning_rate": 6.650867716618567e-05, "loss": 0.9073, "step": 1233 }, { "epoch": 0.40308684822996205, "grad_norm": 0.3342377841472626, "learning_rate": 6.645942519835623e-05, "loss": 0.9987, "step": 1234 }, { "epoch": 0.4034134988363072, "grad_norm": 0.31825751066207886, "learning_rate": 6.64101553123895e-05, "loss": 0.7599, "step": 1235 }, { "epoch": 0.4037401494426524, "grad_norm": 0.3360239863395691, "learning_rate": 6.636086756192193e-05, "loss": 0.7941, "step": 1236 }, { "epoch": 0.4040668000489976, "grad_norm": 0.3323500156402588, "learning_rate": 6.631156200060935e-05, "loss": 0.945, "step": 1237 }, { "epoch": 0.40439345065534277, "grad_norm": 0.4027165174484253, "learning_rate": 6.626223868212702e-05, "loss": 0.9231, "step": 1238 }, { "epoch": 0.40472010126168795, "grad_norm": 0.42029932141304016, "learning_rate": 6.621289766016955e-05, "loss": 0.9594, "step": 1239 }, { "epoch": 0.40504675186803313, "grad_norm": 0.4373820424079895, "learning_rate": 6.616353898845076e-05, "loss": 0.9466, "step": 1240 }, { "epoch": 0.40537340247437836, "grad_norm": 0.48459959030151367, "learning_rate": 6.611416272070377e-05, "loss": 0.9867, "step": 1241 }, { "epoch": 0.40570005308072354, "grad_norm": 0.4633182883262634, "learning_rate": 6.606476891068074e-05, "loss": 0.9491, "step": 1242 }, { "epoch": 0.4060267036870687, "grad_norm": 0.5320441722869873, "learning_rate": 6.601535761215305e-05, "loss": 1.1246, "step": 1243 }, { "epoch": 0.4063533542934139, "grad_norm": 0.5146458148956299, "learning_rate": 6.596592887891103e-05, "loss": 1.0492, "step": 1244 }, { "epoch": 0.4066800048997591, "grad_norm": 0.6374284029006958, "learning_rate": 6.591648276476402e-05, "loss": 1.1577, "step": 1245 }, { "epoch": 0.40700665550610426, "grad_norm": 0.6750211715698242, "learning_rate": 6.586701932354031e-05, "loss": 1.2225, "step": 1246 }, { "epoch": 0.40733330611244944, "grad_norm": 0.9183369278907776, "learning_rate": 6.581753860908699e-05, "loss": 1.301, "step": 1247 }, { "epoch": 0.4076599567187947, "grad_norm": 1.0853521823883057, "learning_rate": 6.576804067527002e-05, "loss": 1.5473, "step": 1248 }, { "epoch": 0.40798660732513986, "grad_norm": 1.1827778816223145, "learning_rate": 6.571852557597407e-05, "loss": 1.4952, "step": 1249 }, { "epoch": 0.40831325793148504, "grad_norm": 1.3170537948608398, "learning_rate": 6.566899336510248e-05, "loss": 1.6403, "step": 1250 }, { "epoch": 0.4086399085378302, "grad_norm": 0.2041776031255722, "learning_rate": 6.561944409657726e-05, "loss": 0.6767, "step": 1251 }, { "epoch": 0.4089665591441754, "grad_norm": 0.23141330480575562, "learning_rate": 6.556987782433894e-05, "loss": 0.8457, "step": 1252 }, { "epoch": 0.4092932097505206, "grad_norm": 0.25107938051223755, "learning_rate": 6.552029460234664e-05, "loss": 0.8766, "step": 1253 }, { "epoch": 0.40961986035686576, "grad_norm": 0.26021620631217957, "learning_rate": 6.547069448457785e-05, "loss": 0.9075, "step": 1254 }, { "epoch": 0.409946510963211, "grad_norm": 0.29366645216941833, "learning_rate": 6.542107752502848e-05, "loss": 0.8482, "step": 1255 }, { "epoch": 0.4102731615695562, "grad_norm": 0.2926657497882843, "learning_rate": 6.537144377771279e-05, "loss": 0.8924, "step": 1256 }, { "epoch": 0.41059981217590136, "grad_norm": 0.30195921659469604, "learning_rate": 6.53217932966633e-05, "loss": 0.9467, "step": 1257 }, { "epoch": 0.41092646278224654, "grad_norm": 0.29315873980522156, "learning_rate": 6.527212613593074e-05, "loss": 0.7876, "step": 1258 }, { "epoch": 0.4112531133885917, "grad_norm": 0.29667970538139343, "learning_rate": 6.522244234958404e-05, "loss": 0.8602, "step": 1259 }, { "epoch": 0.4115797639949369, "grad_norm": 0.3073457181453705, "learning_rate": 6.517274199171019e-05, "loss": 0.9042, "step": 1260 }, { "epoch": 0.4119064146012821, "grad_norm": 0.3291796147823334, "learning_rate": 6.512302511641419e-05, "loss": 0.9297, "step": 1261 }, { "epoch": 0.4122330652076273, "grad_norm": 0.36080867052078247, "learning_rate": 6.507329177781911e-05, "loss": 0.8909, "step": 1262 }, { "epoch": 0.4125597158139725, "grad_norm": 0.39371705055236816, "learning_rate": 6.502354203006588e-05, "loss": 1.0463, "step": 1263 }, { "epoch": 0.4128863664203177, "grad_norm": 0.3974571228027344, "learning_rate": 6.497377592731329e-05, "loss": 0.9081, "step": 1264 }, { "epoch": 0.41321301702666285, "grad_norm": 0.41697776317596436, "learning_rate": 6.492399352373795e-05, "loss": 0.9524, "step": 1265 }, { "epoch": 0.41353966763300803, "grad_norm": 0.4638747572898865, "learning_rate": 6.487419487353421e-05, "loss": 1.0998, "step": 1266 }, { "epoch": 0.4138663182393532, "grad_norm": 0.5170689821243286, "learning_rate": 6.482438003091414e-05, "loss": 1.212, "step": 1267 }, { "epoch": 0.4141929688456984, "grad_norm": 0.5564174652099609, "learning_rate": 6.47745490501074e-05, "loss": 1.0914, "step": 1268 }, { "epoch": 0.41451961945204363, "grad_norm": 0.7147950530052185, "learning_rate": 6.47247019853612e-05, "loss": 1.2383, "step": 1269 }, { "epoch": 0.4148462700583888, "grad_norm": 0.6999964118003845, "learning_rate": 6.467483889094033e-05, "loss": 1.2716, "step": 1270 }, { "epoch": 0.415172920664734, "grad_norm": 0.9136797785758972, "learning_rate": 6.462495982112697e-05, "loss": 1.1405, "step": 1271 }, { "epoch": 0.41549957127107917, "grad_norm": 0.9877931475639343, "learning_rate": 6.457506483022068e-05, "loss": 1.5912, "step": 1272 }, { "epoch": 0.41582622187742435, "grad_norm": 1.415334939956665, "learning_rate": 6.452515397253844e-05, "loss": 1.7894, "step": 1273 }, { "epoch": 0.41615287248376953, "grad_norm": 1.3017021417617798, "learning_rate": 6.44752273024144e-05, "loss": 1.4178, "step": 1274 }, { "epoch": 0.4164795230901147, "grad_norm": 1.7668936252593994, "learning_rate": 6.442528487419996e-05, "loss": 1.9307, "step": 1275 }, { "epoch": 0.41680617369645995, "grad_norm": 0.20308293402194977, "learning_rate": 6.437532674226372e-05, "loss": 0.7709, "step": 1276 }, { "epoch": 0.4171328243028051, "grad_norm": 0.21541514992713928, "learning_rate": 6.432535296099132e-05, "loss": 0.7705, "step": 1277 }, { "epoch": 0.4174594749091503, "grad_norm": 0.24598878622055054, "learning_rate": 6.427536358478542e-05, "loss": 0.8412, "step": 1278 }, { "epoch": 0.4177861255154955, "grad_norm": 0.2694220244884491, "learning_rate": 6.422535866806576e-05, "loss": 0.8584, "step": 1279 }, { "epoch": 0.41811277612184067, "grad_norm": 0.2715321183204651, "learning_rate": 6.417533826526888e-05, "loss": 0.8804, "step": 1280 }, { "epoch": 0.41843942672818585, "grad_norm": 0.26800206303596497, "learning_rate": 6.412530243084824e-05, "loss": 0.7912, "step": 1281 }, { "epoch": 0.41876607733453103, "grad_norm": 0.2986787259578705, "learning_rate": 6.407525121927409e-05, "loss": 0.8956, "step": 1282 }, { "epoch": 0.41909272794087626, "grad_norm": 0.3155290186405182, "learning_rate": 6.40251846850334e-05, "loss": 0.8807, "step": 1283 }, { "epoch": 0.41941937854722144, "grad_norm": 0.3152468502521515, "learning_rate": 6.397510288262986e-05, "loss": 0.8617, "step": 1284 }, { "epoch": 0.4197460291535666, "grad_norm": 0.33172619342803955, "learning_rate": 6.392500586658376e-05, "loss": 0.9051, "step": 1285 }, { "epoch": 0.4200726797599118, "grad_norm": 0.39893805980682373, "learning_rate": 6.387489369143191e-05, "loss": 1.0182, "step": 1286 }, { "epoch": 0.420399330366257, "grad_norm": 0.34438183903694153, "learning_rate": 6.38247664117277e-05, "loss": 0.848, "step": 1287 }, { "epoch": 0.42072598097260216, "grad_norm": 0.37023016810417175, "learning_rate": 6.377462408204093e-05, "loss": 0.8406, "step": 1288 }, { "epoch": 0.42105263157894735, "grad_norm": 0.41819244623184204, "learning_rate": 6.372446675695778e-05, "loss": 0.9544, "step": 1289 }, { "epoch": 0.4213792821852926, "grad_norm": 0.4439786374568939, "learning_rate": 6.367429449108072e-05, "loss": 1.0086, "step": 1290 }, { "epoch": 0.42170593279163776, "grad_norm": 0.49234476685523987, "learning_rate": 6.362410733902855e-05, "loss": 1.0202, "step": 1291 }, { "epoch": 0.42203258339798294, "grad_norm": 0.5539708733558655, "learning_rate": 6.357390535543623e-05, "loss": 1.1179, "step": 1292 }, { "epoch": 0.4223592340043281, "grad_norm": 0.5921236276626587, "learning_rate": 6.35236885949549e-05, "loss": 1.3069, "step": 1293 }, { "epoch": 0.4226858846106733, "grad_norm": 0.5858140587806702, "learning_rate": 6.347345711225176e-05, "loss": 1.2039, "step": 1294 }, { "epoch": 0.4230125352170185, "grad_norm": 0.7424845695495605, "learning_rate": 6.342321096201003e-05, "loss": 1.4528, "step": 1295 }, { "epoch": 0.42333918582336366, "grad_norm": 0.8141841292381287, "learning_rate": 6.33729501989289e-05, "loss": 1.4032, "step": 1296 }, { "epoch": 0.4236658364297089, "grad_norm": 1.0641597509384155, "learning_rate": 6.332267487772352e-05, "loss": 1.4153, "step": 1297 }, { "epoch": 0.4239924870360541, "grad_norm": 1.2870875597000122, "learning_rate": 6.327238505312484e-05, "loss": 1.6028, "step": 1298 }, { "epoch": 0.42431913764239926, "grad_norm": 1.4582892656326294, "learning_rate": 6.322208077987958e-05, "loss": 1.3119, "step": 1299 }, { "epoch": 0.42464578824874444, "grad_norm": 1.7859705686569214, "learning_rate": 6.317176211275022e-05, "loss": 1.7957, "step": 1300 }, { "epoch": 0.4249724388550896, "grad_norm": 0.19623303413391113, "learning_rate": 6.312142910651492e-05, "loss": 0.7682, "step": 1301 }, { "epoch": 0.4252990894614348, "grad_norm": 0.22996629774570465, "learning_rate": 6.307108181596743e-05, "loss": 0.8693, "step": 1302 }, { "epoch": 0.42562574006778, "grad_norm": 0.24149446189403534, "learning_rate": 6.302072029591707e-05, "loss": 0.8444, "step": 1303 }, { "epoch": 0.4259523906741252, "grad_norm": 0.25310564041137695, "learning_rate": 6.297034460118861e-05, "loss": 0.8612, "step": 1304 }, { "epoch": 0.4262790412804704, "grad_norm": 0.26213711500167847, "learning_rate": 6.29199547866223e-05, "loss": 0.8087, "step": 1305 }, { "epoch": 0.4266056918868156, "grad_norm": 0.28710299730300903, "learning_rate": 6.286955090707371e-05, "loss": 0.877, "step": 1306 }, { "epoch": 0.42693234249316075, "grad_norm": 0.3216702342033386, "learning_rate": 6.281913301741378e-05, "loss": 0.9834, "step": 1307 }, { "epoch": 0.42725899309950593, "grad_norm": 0.3331913948059082, "learning_rate": 6.276870117252867e-05, "loss": 1.0185, "step": 1308 }, { "epoch": 0.4275856437058511, "grad_norm": 0.3292556405067444, "learning_rate": 6.271825542731971e-05, "loss": 0.9194, "step": 1309 }, { "epoch": 0.4279122943121963, "grad_norm": 0.32852330803871155, "learning_rate": 6.26677958367034e-05, "loss": 0.9441, "step": 1310 }, { "epoch": 0.42823894491854153, "grad_norm": 0.33652690052986145, "learning_rate": 6.261732245561129e-05, "loss": 0.8488, "step": 1311 }, { "epoch": 0.4285655955248867, "grad_norm": 0.3745363652706146, "learning_rate": 6.256683533898995e-05, "loss": 0.8509, "step": 1312 }, { "epoch": 0.4288922461312319, "grad_norm": 0.3785022795200348, "learning_rate": 6.251633454180091e-05, "loss": 0.8239, "step": 1313 }, { "epoch": 0.42921889673757707, "grad_norm": 0.4264315366744995, "learning_rate": 6.24658201190206e-05, "loss": 1.065, "step": 1314 }, { "epoch": 0.42954554734392225, "grad_norm": 0.43724411725997925, "learning_rate": 6.241529212564025e-05, "loss": 0.9565, "step": 1315 }, { "epoch": 0.42987219795026743, "grad_norm": 0.468350887298584, "learning_rate": 6.236475061666588e-05, "loss": 1.0929, "step": 1316 }, { "epoch": 0.4301988485566126, "grad_norm": 0.5075478553771973, "learning_rate": 6.231419564711826e-05, "loss": 1.0679, "step": 1317 }, { "epoch": 0.43052549916295785, "grad_norm": 0.5075100660324097, "learning_rate": 6.226362727203272e-05, "loss": 1.0657, "step": 1318 }, { "epoch": 0.43085214976930303, "grad_norm": 0.5745466947555542, "learning_rate": 6.22130455464593e-05, "loss": 1.0917, "step": 1319 }, { "epoch": 0.4311788003756482, "grad_norm": 0.6840534210205078, "learning_rate": 6.216245052546251e-05, "loss": 1.0422, "step": 1320 }, { "epoch": 0.4315054509819934, "grad_norm": 0.7854779958724976, "learning_rate": 6.211184226412131e-05, "loss": 1.2574, "step": 1321 }, { "epoch": 0.43183210158833857, "grad_norm": 0.9489834904670715, "learning_rate": 6.206122081752913e-05, "loss": 1.4515, "step": 1322 }, { "epoch": 0.43215875219468375, "grad_norm": 0.9874739050865173, "learning_rate": 6.201058624079371e-05, "loss": 1.2792, "step": 1323 }, { "epoch": 0.43248540280102893, "grad_norm": 1.4044889211654663, "learning_rate": 6.195993858903713e-05, "loss": 1.5486, "step": 1324 }, { "epoch": 0.43281205340737416, "grad_norm": 2.303447723388672, "learning_rate": 6.190927791739565e-05, "loss": 1.998, "step": 1325 }, { "epoch": 0.43313870401371934, "grad_norm": 0.1984492391347885, "learning_rate": 6.185860428101974e-05, "loss": 0.7388, "step": 1326 }, { "epoch": 0.4334653546200645, "grad_norm": 0.24812500178813934, "learning_rate": 6.180791773507396e-05, "loss": 0.7755, "step": 1327 }, { "epoch": 0.4337920052264097, "grad_norm": 0.2576265335083008, "learning_rate": 6.175721833473697e-05, "loss": 0.796, "step": 1328 }, { "epoch": 0.4341186558327549, "grad_norm": 0.2764826714992523, "learning_rate": 6.170650613520137e-05, "loss": 0.8901, "step": 1329 }, { "epoch": 0.43444530643910007, "grad_norm": 0.28213411569595337, "learning_rate": 6.16557811916737e-05, "loss": 0.8744, "step": 1330 }, { "epoch": 0.43477195704544525, "grad_norm": 0.2741318345069885, "learning_rate": 6.160504355937441e-05, "loss": 0.7944, "step": 1331 }, { "epoch": 0.4350986076517905, "grad_norm": 0.3129563331604004, "learning_rate": 6.155429329353772e-05, "loss": 0.9256, "step": 1332 }, { "epoch": 0.43542525825813566, "grad_norm": 0.2871616780757904, "learning_rate": 6.150353044941166e-05, "loss": 0.87, "step": 1333 }, { "epoch": 0.43575190886448084, "grad_norm": 0.2862168550491333, "learning_rate": 6.145275508225789e-05, "loss": 0.9194, "step": 1334 }, { "epoch": 0.436078559470826, "grad_norm": 0.3074105381965637, "learning_rate": 6.140196724735173e-05, "loss": 0.8538, "step": 1335 }, { "epoch": 0.4364052100771712, "grad_norm": 0.3276168704032898, "learning_rate": 6.135116699998208e-05, "loss": 0.903, "step": 1336 }, { "epoch": 0.4367318606835164, "grad_norm": 0.3311191499233246, "learning_rate": 6.130035439545137e-05, "loss": 0.8529, "step": 1337 }, { "epoch": 0.43705851128986156, "grad_norm": 0.36968547105789185, "learning_rate": 6.12495294890754e-05, "loss": 1.0096, "step": 1338 }, { "epoch": 0.43738516189620674, "grad_norm": 0.3730202317237854, "learning_rate": 6.119869233618347e-05, "loss": 0.8173, "step": 1339 }, { "epoch": 0.437711812502552, "grad_norm": 0.3859659433364868, "learning_rate": 6.114784299211812e-05, "loss": 0.9545, "step": 1340 }, { "epoch": 0.43803846310889716, "grad_norm": 0.4324837327003479, "learning_rate": 6.109698151223524e-05, "loss": 1.0696, "step": 1341 }, { "epoch": 0.43836511371524234, "grad_norm": 0.509543776512146, "learning_rate": 6.10461079519039e-05, "loss": 0.9784, "step": 1342 }, { "epoch": 0.4386917643215875, "grad_norm": 0.5357346534729004, "learning_rate": 6.099522236650628e-05, "loss": 1.109, "step": 1343 }, { "epoch": 0.4390184149279327, "grad_norm": 0.5872611403465271, "learning_rate": 6.09443248114377e-05, "loss": 1.1932, "step": 1344 }, { "epoch": 0.4393450655342779, "grad_norm": 0.6826693415641785, "learning_rate": 6.089341534210652e-05, "loss": 1.2509, "step": 1345 }, { "epoch": 0.43967171614062306, "grad_norm": 0.7309541702270508, "learning_rate": 6.084249401393403e-05, "loss": 1.2348, "step": 1346 }, { "epoch": 0.4399983667469683, "grad_norm": 0.897671639919281, "learning_rate": 6.0791560882354424e-05, "loss": 1.3749, "step": 1347 }, { "epoch": 0.4403250173533135, "grad_norm": 0.9450259804725647, "learning_rate": 6.07406160028148e-05, "loss": 1.3895, "step": 1348 }, { "epoch": 0.44065166795965865, "grad_norm": 1.1689409017562866, "learning_rate": 6.0689659430775e-05, "loss": 1.5146, "step": 1349 }, { "epoch": 0.44097831856600384, "grad_norm": 1.6155471801757812, "learning_rate": 6.063869122170761e-05, "loss": 1.6751, "step": 1350 }, { "epoch": 0.441304969172349, "grad_norm": 0.227553129196167, "learning_rate": 6.058771143109789e-05, "loss": 0.8437, "step": 1351 }, { "epoch": 0.4416316197786942, "grad_norm": 0.25323745608329773, "learning_rate": 6.053672011444369e-05, "loss": 0.8681, "step": 1352 }, { "epoch": 0.4419582703850394, "grad_norm": 0.26084786653518677, "learning_rate": 6.048571732725543e-05, "loss": 0.8891, "step": 1353 }, { "epoch": 0.4422849209913846, "grad_norm": 0.2727871239185333, "learning_rate": 6.043470312505599e-05, "loss": 0.7828, "step": 1354 }, { "epoch": 0.4426115715977298, "grad_norm": 0.2814069390296936, "learning_rate": 6.038367756338072e-05, "loss": 0.8744, "step": 1355 }, { "epoch": 0.44293822220407497, "grad_norm": 0.2910081446170807, "learning_rate": 6.0332640697777273e-05, "loss": 0.7748, "step": 1356 }, { "epoch": 0.44326487281042015, "grad_norm": 0.2924244701862335, "learning_rate": 6.028159258380567e-05, "loss": 0.769, "step": 1357 }, { "epoch": 0.44359152341676533, "grad_norm": 0.30017054080963135, "learning_rate": 6.0230533277038127e-05, "loss": 0.8374, "step": 1358 }, { "epoch": 0.4439181740231105, "grad_norm": 0.31334736943244934, "learning_rate": 6.01794628330591e-05, "loss": 0.8224, "step": 1359 }, { "epoch": 0.4442448246294557, "grad_norm": 0.3215758800506592, "learning_rate": 6.01283813074651e-05, "loss": 0.8468, "step": 1360 }, { "epoch": 0.44457147523580093, "grad_norm": 0.3455059230327606, "learning_rate": 6.007728875586476e-05, "loss": 0.9048, "step": 1361 }, { "epoch": 0.4448981258421461, "grad_norm": 0.36075106263160706, "learning_rate": 6.002618523387868e-05, "loss": 0.8895, "step": 1362 }, { "epoch": 0.4452247764484913, "grad_norm": 0.37401801347732544, "learning_rate": 5.9975070797139446e-05, "loss": 0.8716, "step": 1363 }, { "epoch": 0.44555142705483647, "grad_norm": 0.40770915150642395, "learning_rate": 5.992394550129148e-05, "loss": 0.9879, "step": 1364 }, { "epoch": 0.44587807766118165, "grad_norm": 0.4294146001338959, "learning_rate": 5.9872809401991034e-05, "loss": 1.0252, "step": 1365 }, { "epoch": 0.44620472826752683, "grad_norm": 0.4494366943836212, "learning_rate": 5.9821662554906144e-05, "loss": 1.0504, "step": 1366 }, { "epoch": 0.446531378873872, "grad_norm": 0.49997806549072266, "learning_rate": 5.977050501571653e-05, "loss": 0.9862, "step": 1367 }, { "epoch": 0.44685802948021724, "grad_norm": 0.5091699957847595, "learning_rate": 5.971933684011355e-05, "loss": 0.9708, "step": 1368 }, { "epoch": 0.4471846800865624, "grad_norm": 0.6185327172279358, "learning_rate": 5.966815808380015e-05, "loss": 1.1143, "step": 1369 }, { "epoch": 0.4475113306929076, "grad_norm": 0.7364172339439392, "learning_rate": 5.961696880249079e-05, "loss": 1.3154, "step": 1370 }, { "epoch": 0.4478379812992528, "grad_norm": 0.8025093674659729, "learning_rate": 5.9565769051911376e-05, "loss": 1.1813, "step": 1371 }, { "epoch": 0.44816463190559797, "grad_norm": 0.9280282258987427, "learning_rate": 5.951455888779925e-05, "loss": 1.0802, "step": 1372 }, { "epoch": 0.44849128251194315, "grad_norm": 1.183429479598999, "learning_rate": 5.9463338365903035e-05, "loss": 1.1206, "step": 1373 }, { "epoch": 0.4488179331182883, "grad_norm": 1.2044509649276733, "learning_rate": 5.941210754198266e-05, "loss": 1.4664, "step": 1374 }, { "epoch": 0.44914458372463356, "grad_norm": 1.775682806968689, "learning_rate": 5.936086647180928e-05, "loss": 2.0872, "step": 1375 }, { "epoch": 0.44947123433097874, "grad_norm": 0.19256451725959778, "learning_rate": 5.9309615211165185e-05, "loss": 0.7676, "step": 1376 }, { "epoch": 0.4497978849373239, "grad_norm": 0.22578822076320648, "learning_rate": 5.925835381584377e-05, "loss": 0.797, "step": 1377 }, { "epoch": 0.4501245355436691, "grad_norm": 0.24020101130008698, "learning_rate": 5.9207082341649454e-05, "loss": 0.7621, "step": 1378 }, { "epoch": 0.4504511861500143, "grad_norm": 0.262273371219635, "learning_rate": 5.9155800844397625e-05, "loss": 0.785, "step": 1379 }, { "epoch": 0.45077783675635946, "grad_norm": 0.2687849700450897, "learning_rate": 5.9104509379914586e-05, "loss": 0.847, "step": 1380 }, { "epoch": 0.45110448736270464, "grad_norm": 0.2742252051830292, "learning_rate": 5.905320800403752e-05, "loss": 0.798, "step": 1381 }, { "epoch": 0.4514311379690499, "grad_norm": 0.28020089864730835, "learning_rate": 5.900189677261434e-05, "loss": 0.7844, "step": 1382 }, { "epoch": 0.45175778857539506, "grad_norm": 0.3002532422542572, "learning_rate": 5.8950575741503744e-05, "loss": 0.9231, "step": 1383 }, { "epoch": 0.45208443918174024, "grad_norm": 0.2936133146286011, "learning_rate": 5.889924496657506e-05, "loss": 0.8288, "step": 1384 }, { "epoch": 0.4524110897880854, "grad_norm": 0.3212607502937317, "learning_rate": 5.884790450370825e-05, "loss": 0.8293, "step": 1385 }, { "epoch": 0.4527377403944306, "grad_norm": 0.3269292414188385, "learning_rate": 5.87965544087938e-05, "loss": 0.8997, "step": 1386 }, { "epoch": 0.4530643910007758, "grad_norm": 0.35607802867889404, "learning_rate": 5.874519473773271e-05, "loss": 0.9919, "step": 1387 }, { "epoch": 0.45339104160712096, "grad_norm": 0.3684079647064209, "learning_rate": 5.869382554643639e-05, "loss": 0.9916, "step": 1388 }, { "epoch": 0.4537176922134662, "grad_norm": 0.3907953202724457, "learning_rate": 5.864244689082659e-05, "loss": 1.0021, "step": 1389 }, { "epoch": 0.4540443428198114, "grad_norm": 0.3965863883495331, "learning_rate": 5.85910588268354e-05, "loss": 0.8898, "step": 1390 }, { "epoch": 0.45437099342615656, "grad_norm": 0.4447333514690399, "learning_rate": 5.853966141040512e-05, "loss": 0.9954, "step": 1391 }, { "epoch": 0.45469764403250174, "grad_norm": 0.4704182744026184, "learning_rate": 5.848825469748828e-05, "loss": 1.0979, "step": 1392 }, { "epoch": 0.4550242946388469, "grad_norm": 0.47324079275131226, "learning_rate": 5.843683874404746e-05, "loss": 0.9627, "step": 1393 }, { "epoch": 0.4553509452451921, "grad_norm": 0.618899941444397, "learning_rate": 5.838541360605538e-05, "loss": 1.104, "step": 1394 }, { "epoch": 0.4556775958515373, "grad_norm": 0.6069836616516113, "learning_rate": 5.833397933949469e-05, "loss": 1.2049, "step": 1395 }, { "epoch": 0.4560042464578825, "grad_norm": 0.7420676946640015, "learning_rate": 5.8282536000358e-05, "loss": 1.2061, "step": 1396 }, { "epoch": 0.4563308970642277, "grad_norm": 1.0195621252059937, "learning_rate": 5.823108364464782e-05, "loss": 1.4885, "step": 1397 }, { "epoch": 0.45665754767057287, "grad_norm": 1.0798938274383545, "learning_rate": 5.817962232837645e-05, "loss": 1.4734, "step": 1398 }, { "epoch": 0.45698419827691805, "grad_norm": 1.271763563156128, "learning_rate": 5.8128152107565946e-05, "loss": 1.8257, "step": 1399 }, { "epoch": 0.45731084888326323, "grad_norm": 1.607129693031311, "learning_rate": 5.807667303824806e-05, "loss": 1.8979, "step": 1400 }, { "epoch": 0.4576374994896084, "grad_norm": 0.2017160803079605, "learning_rate": 5.8025185176464204e-05, "loss": 0.6849, "step": 1401 }, { "epoch": 0.4579641500959536, "grad_norm": 0.21873755753040314, "learning_rate": 5.7973688578265304e-05, "loss": 0.7677, "step": 1402 }, { "epoch": 0.45829080070229883, "grad_norm": 0.25168201327323914, "learning_rate": 5.792218329971184e-05, "loss": 0.8155, "step": 1403 }, { "epoch": 0.458617451308644, "grad_norm": 0.2593826651573181, "learning_rate": 5.7870669396873754e-05, "loss": 0.8939, "step": 1404 }, { "epoch": 0.4589441019149892, "grad_norm": 0.24560409784317017, "learning_rate": 5.7819146925830324e-05, "loss": 0.7275, "step": 1405 }, { "epoch": 0.45927075252133437, "grad_norm": 0.273952841758728, "learning_rate": 5.7767615942670204e-05, "loss": 0.8585, "step": 1406 }, { "epoch": 0.45959740312767955, "grad_norm": 0.2819421887397766, "learning_rate": 5.7716076503491314e-05, "loss": 0.8319, "step": 1407 }, { "epoch": 0.45992405373402473, "grad_norm": 0.3027994632720947, "learning_rate": 5.766452866440072e-05, "loss": 0.8609, "step": 1408 }, { "epoch": 0.4602507043403699, "grad_norm": 0.3026481866836548, "learning_rate": 5.761297248151469e-05, "loss": 0.8992, "step": 1409 }, { "epoch": 0.46057735494671515, "grad_norm": 0.3143552839756012, "learning_rate": 5.756140801095858e-05, "loss": 0.9488, "step": 1410 }, { "epoch": 0.4609040055530603, "grad_norm": 0.3467939496040344, "learning_rate": 5.750983530886672e-05, "loss": 1.0314, "step": 1411 }, { "epoch": 0.4612306561594055, "grad_norm": 0.3449691832065582, "learning_rate": 5.745825443138246e-05, "loss": 0.9967, "step": 1412 }, { "epoch": 0.4615573067657507, "grad_norm": 0.35669663548469543, "learning_rate": 5.740666543465798e-05, "loss": 1.001, "step": 1413 }, { "epoch": 0.46188395737209587, "grad_norm": 0.40556496381759644, "learning_rate": 5.735506837485437e-05, "loss": 0.9929, "step": 1414 }, { "epoch": 0.46221060797844105, "grad_norm": 0.3812430500984192, "learning_rate": 5.730346330814145e-05, "loss": 1.0193, "step": 1415 }, { "epoch": 0.4625372585847862, "grad_norm": 0.44601941108703613, "learning_rate": 5.7251850290697774e-05, "loss": 0.9902, "step": 1416 }, { "epoch": 0.46286390919113146, "grad_norm": 0.42666640877723694, "learning_rate": 5.7200229378710546e-05, "loss": 1.0404, "step": 1417 }, { "epoch": 0.46319055979747664, "grad_norm": 0.4538075625896454, "learning_rate": 5.714860062837557e-05, "loss": 1.0535, "step": 1418 }, { "epoch": 0.4635172104038218, "grad_norm": 0.5554001331329346, "learning_rate": 5.7096964095897174e-05, "loss": 1.1326, "step": 1419 }, { "epoch": 0.463843861010167, "grad_norm": 0.5905495882034302, "learning_rate": 5.7045319837488186e-05, "loss": 1.2233, "step": 1420 }, { "epoch": 0.4641705116165122, "grad_norm": 0.6906095147132874, "learning_rate": 5.6993667909369794e-05, "loss": 1.292, "step": 1421 }, { "epoch": 0.46449716222285736, "grad_norm": 0.793144941329956, "learning_rate": 5.694200836777158e-05, "loss": 1.2007, "step": 1422 }, { "epoch": 0.46482381282920254, "grad_norm": 1.0440678596496582, "learning_rate": 5.68903412689314e-05, "loss": 1.3643, "step": 1423 }, { "epoch": 0.4651504634355478, "grad_norm": 1.2731975317001343, "learning_rate": 5.683866666909533e-05, "loss": 1.6552, "step": 1424 }, { "epoch": 0.46547711404189296, "grad_norm": 1.792528510093689, "learning_rate": 5.6786984624517636e-05, "loss": 1.6505, "step": 1425 }, { "epoch": 0.46580376464823814, "grad_norm": 0.20550623536109924, "learning_rate": 5.6735295191460636e-05, "loss": 0.7223, "step": 1426 }, { "epoch": 0.4661304152545833, "grad_norm": 0.24376057088375092, "learning_rate": 5.668359842619474e-05, "loss": 0.8327, "step": 1427 }, { "epoch": 0.4664570658609285, "grad_norm": 0.23944585025310516, "learning_rate": 5.663189438499833e-05, "loss": 0.8034, "step": 1428 }, { "epoch": 0.4667837164672737, "grad_norm": 0.2565159499645233, "learning_rate": 5.6580183124157714e-05, "loss": 0.8629, "step": 1429 }, { "epoch": 0.46711036707361886, "grad_norm": 0.2701554298400879, "learning_rate": 5.652846469996702e-05, "loss": 0.8289, "step": 1430 }, { "epoch": 0.4674370176799641, "grad_norm": 0.30046141147613525, "learning_rate": 5.647673916872822e-05, "loss": 0.8658, "step": 1431 }, { "epoch": 0.4677636682863093, "grad_norm": 0.2801269590854645, "learning_rate": 5.6425006586751004e-05, "loss": 0.9112, "step": 1432 }, { "epoch": 0.46809031889265446, "grad_norm": 0.3160743713378906, "learning_rate": 5.6373267010352736e-05, "loss": 0.8969, "step": 1433 }, { "epoch": 0.46841696949899964, "grad_norm": 0.3159767687320709, "learning_rate": 5.632152049585843e-05, "loss": 0.9211, "step": 1434 }, { "epoch": 0.4687436201053448, "grad_norm": 0.31142839789390564, "learning_rate": 5.626976709960057e-05, "loss": 0.8939, "step": 1435 }, { "epoch": 0.46907027071169, "grad_norm": 0.31656891107559204, "learning_rate": 5.621800687791922e-05, "loss": 0.8701, "step": 1436 }, { "epoch": 0.4693969213180352, "grad_norm": 0.3449065387248993, "learning_rate": 5.616623988716181e-05, "loss": 0.9217, "step": 1437 }, { "epoch": 0.4697235719243804, "grad_norm": 0.39621448516845703, "learning_rate": 5.611446618368319e-05, "loss": 0.9877, "step": 1438 }, { "epoch": 0.4700502225307256, "grad_norm": 0.39008018374443054, "learning_rate": 5.606268582384548e-05, "loss": 0.889, "step": 1439 }, { "epoch": 0.4703768731370708, "grad_norm": 0.443905234336853, "learning_rate": 5.601089886401808e-05, "loss": 0.9621, "step": 1440 }, { "epoch": 0.47070352374341595, "grad_norm": 0.46601757407188416, "learning_rate": 5.595910536057753e-05, "loss": 0.9748, "step": 1441 }, { "epoch": 0.47103017434976113, "grad_norm": 0.4815382957458496, "learning_rate": 5.5907305369907534e-05, "loss": 0.9838, "step": 1442 }, { "epoch": 0.4713568249561063, "grad_norm": 0.5498834252357483, "learning_rate": 5.5855498948398846e-05, "loss": 1.069, "step": 1443 }, { "epoch": 0.4716834755624515, "grad_norm": 0.5492748022079468, "learning_rate": 5.5803686152449184e-05, "loss": 0.996, "step": 1444 }, { "epoch": 0.4720101261687967, "grad_norm": 0.6215171217918396, "learning_rate": 5.575186703846328e-05, "loss": 1.0071, "step": 1445 }, { "epoch": 0.4723367767751419, "grad_norm": 0.8084948658943176, "learning_rate": 5.5700041662852684e-05, "loss": 1.2216, "step": 1446 }, { "epoch": 0.4726634273814871, "grad_norm": 0.9253719449043274, "learning_rate": 5.56482100820358e-05, "loss": 1.1535, "step": 1447 }, { "epoch": 0.47299007798783227, "grad_norm": 1.2240797281265259, "learning_rate": 5.559637235243773e-05, "loss": 1.5206, "step": 1448 }, { "epoch": 0.47331672859417745, "grad_norm": 1.3156620264053345, "learning_rate": 5.5544528530490345e-05, "loss": 1.3771, "step": 1449 }, { "epoch": 0.47364337920052263, "grad_norm": 1.7129418849945068, "learning_rate": 5.5492678672632094e-05, "loss": 2.4055, "step": 1450 }, { "epoch": 0.4739700298068678, "grad_norm": 0.17648789286613464, "learning_rate": 5.5440822835308026e-05, "loss": 0.6887, "step": 1451 }, { "epoch": 0.474296680413213, "grad_norm": 0.22524596750736237, "learning_rate": 5.538896107496967e-05, "loss": 0.7308, "step": 1452 }, { "epoch": 0.4746233310195582, "grad_norm": 0.23709015548229218, "learning_rate": 5.5337093448075025e-05, "loss": 0.8185, "step": 1453 }, { "epoch": 0.4749499816259034, "grad_norm": 0.24522243440151215, "learning_rate": 5.528522001108849e-05, "loss": 0.7561, "step": 1454 }, { "epoch": 0.4752766322322486, "grad_norm": 0.2625182569026947, "learning_rate": 5.523334082048075e-05, "loss": 0.8076, "step": 1455 }, { "epoch": 0.47560328283859377, "grad_norm": 0.27700942754745483, "learning_rate": 5.5181455932728785e-05, "loss": 0.8427, "step": 1456 }, { "epoch": 0.47592993344493895, "grad_norm": 0.2912745475769043, "learning_rate": 5.512956540431577e-05, "loss": 0.8505, "step": 1457 }, { "epoch": 0.4762565840512841, "grad_norm": 0.2912481129169464, "learning_rate": 5.5077669291731006e-05, "loss": 0.9061, "step": 1458 }, { "epoch": 0.4765832346576293, "grad_norm": 0.36340487003326416, "learning_rate": 5.502576765146989e-05, "loss": 0.9754, "step": 1459 }, { "epoch": 0.47690988526397454, "grad_norm": 0.3103990852832794, "learning_rate": 5.497386054003385e-05, "loss": 0.9334, "step": 1460 }, { "epoch": 0.4772365358703197, "grad_norm": 0.33423370122909546, "learning_rate": 5.492194801393023e-05, "loss": 0.8043, "step": 1461 }, { "epoch": 0.4775631864766649, "grad_norm": 0.3521210253238678, "learning_rate": 5.487003012967228e-05, "loss": 0.9447, "step": 1462 }, { "epoch": 0.4778898370830101, "grad_norm": 0.342441201210022, "learning_rate": 5.4818106943779105e-05, "loss": 0.9133, "step": 1463 }, { "epoch": 0.47821648768935526, "grad_norm": 0.39511221647262573, "learning_rate": 5.476617851277559e-05, "loss": 1.0284, "step": 1464 }, { "epoch": 0.47854313829570044, "grad_norm": 0.39670830965042114, "learning_rate": 5.471424489319227e-05, "loss": 0.9131, "step": 1465 }, { "epoch": 0.4788697889020456, "grad_norm": 0.4468030333518982, "learning_rate": 5.466230614156539e-05, "loss": 0.93, "step": 1466 }, { "epoch": 0.47919643950839086, "grad_norm": 0.47740623354911804, "learning_rate": 5.461036231443676e-05, "loss": 1.0926, "step": 1467 }, { "epoch": 0.47952309011473604, "grad_norm": 0.5126772522926331, "learning_rate": 5.455841346835371e-05, "loss": 0.9977, "step": 1468 }, { "epoch": 0.4798497407210812, "grad_norm": 0.5842729210853577, "learning_rate": 5.4506459659869036e-05, "loss": 1.0636, "step": 1469 }, { "epoch": 0.4801763913274264, "grad_norm": 0.633167564868927, "learning_rate": 5.445450094554094e-05, "loss": 1.2103, "step": 1470 }, { "epoch": 0.4805030419337716, "grad_norm": 0.7382978200912476, "learning_rate": 5.440253738193297e-05, "loss": 1.3609, "step": 1471 }, { "epoch": 0.48082969254011676, "grad_norm": 1.0082590579986572, "learning_rate": 5.435056902561393e-05, "loss": 1.5858, "step": 1472 }, { "epoch": 0.48115634314646194, "grad_norm": 1.1187267303466797, "learning_rate": 5.4298595933157884e-05, "loss": 1.5626, "step": 1473 }, { "epoch": 0.4814829937528072, "grad_norm": 1.2796682119369507, "learning_rate": 5.4246618161144006e-05, "loss": 1.774, "step": 1474 }, { "epoch": 0.48180964435915236, "grad_norm": 1.722538709640503, "learning_rate": 5.4194635766156575e-05, "loss": 2.2056, "step": 1475 }, { "epoch": 0.48213629496549754, "grad_norm": 0.22073139250278473, "learning_rate": 5.414264880478493e-05, "loss": 0.8193, "step": 1476 }, { "epoch": 0.4824629455718427, "grad_norm": 0.23783589899539948, "learning_rate": 5.409065733362337e-05, "loss": 0.8528, "step": 1477 }, { "epoch": 0.4827895961781879, "grad_norm": 0.2497948557138443, "learning_rate": 5.403866140927109e-05, "loss": 0.7723, "step": 1478 }, { "epoch": 0.4831162467845331, "grad_norm": 0.25545740127563477, "learning_rate": 5.3986661088332115e-05, "loss": 0.8591, "step": 1479 }, { "epoch": 0.48344289739087826, "grad_norm": 0.2744065821170807, "learning_rate": 5.3934656427415295e-05, "loss": 0.8505, "step": 1480 }, { "epoch": 0.4837695479972235, "grad_norm": 0.2947370111942291, "learning_rate": 5.3882647483134196e-05, "loss": 0.8756, "step": 1481 }, { "epoch": 0.4840961986035687, "grad_norm": 0.3097544312477112, "learning_rate": 5.3830634312107056e-05, "loss": 0.8883, "step": 1482 }, { "epoch": 0.48442284920991385, "grad_norm": 0.31207266449928284, "learning_rate": 5.3778616970956663e-05, "loss": 0.8534, "step": 1483 }, { "epoch": 0.48474949981625903, "grad_norm": 0.38118842244148254, "learning_rate": 5.3726595516310405e-05, "loss": 0.9782, "step": 1484 }, { "epoch": 0.4850761504226042, "grad_norm": 0.315085232257843, "learning_rate": 5.367457000480011e-05, "loss": 0.8068, "step": 1485 }, { "epoch": 0.4854028010289494, "grad_norm": 0.34587183594703674, "learning_rate": 5.3622540493062046e-05, "loss": 0.8694, "step": 1486 }, { "epoch": 0.4857294516352946, "grad_norm": 0.3551337718963623, "learning_rate": 5.3570507037736826e-05, "loss": 0.9294, "step": 1487 }, { "epoch": 0.4860561022416398, "grad_norm": 0.3878238797187805, "learning_rate": 5.351846969546935e-05, "loss": 0.8701, "step": 1488 }, { "epoch": 0.486382752847985, "grad_norm": 0.4408702254295349, "learning_rate": 5.346642852290876e-05, "loss": 1.2004, "step": 1489 }, { "epoch": 0.48670940345433017, "grad_norm": 0.5383159518241882, "learning_rate": 5.341438357670838e-05, "loss": 1.0944, "step": 1490 }, { "epoch": 0.48703605406067535, "grad_norm": 0.48991337418556213, "learning_rate": 5.336233491352559e-05, "loss": 1.0647, "step": 1491 }, { "epoch": 0.48736270466702053, "grad_norm": 0.5360439419746399, "learning_rate": 5.3310282590021875e-05, "loss": 1.0419, "step": 1492 }, { "epoch": 0.4876893552733657, "grad_norm": 0.5581495761871338, "learning_rate": 5.325822666286268e-05, "loss": 1.2277, "step": 1493 }, { "epoch": 0.4880160058797109, "grad_norm": 0.6016497015953064, "learning_rate": 5.320616718871736e-05, "loss": 1.0748, "step": 1494 }, { "epoch": 0.4883426564860561, "grad_norm": 0.6058472990989685, "learning_rate": 5.315410422425917e-05, "loss": 1.1947, "step": 1495 }, { "epoch": 0.4886693070924013, "grad_norm": 0.7601789236068726, "learning_rate": 5.310203782616513e-05, "loss": 1.4049, "step": 1496 }, { "epoch": 0.4889959576987465, "grad_norm": 0.9896534085273743, "learning_rate": 5.304996805111599e-05, "loss": 1.3961, "step": 1497 }, { "epoch": 0.48932260830509167, "grad_norm": 1.224902629852295, "learning_rate": 5.299789495579621e-05, "loss": 1.6296, "step": 1498 }, { "epoch": 0.48964925891143685, "grad_norm": 1.2904133796691895, "learning_rate": 5.294581859689387e-05, "loss": 1.5572, "step": 1499 }, { "epoch": 0.489975909517782, "grad_norm": 1.9275236129760742, "learning_rate": 5.2893739031100554e-05, "loss": 2.2083, "step": 1500 }, { "epoch": 0.4903025601241272, "grad_norm": 0.18931369483470917, "learning_rate": 5.2841656315111366e-05, "loss": 0.69, "step": 1501 }, { "epoch": 0.49062921073047244, "grad_norm": 0.2273533195257187, "learning_rate": 5.278957050562485e-05, "loss": 0.8017, "step": 1502 }, { "epoch": 0.4909558613368176, "grad_norm": 0.24772115051746368, "learning_rate": 5.273748165934289e-05, "loss": 0.8714, "step": 1503 }, { "epoch": 0.4912825119431628, "grad_norm": 0.2568557858467102, "learning_rate": 5.268538983297072e-05, "loss": 0.827, "step": 1504 }, { "epoch": 0.491609162549508, "grad_norm": 0.25856682658195496, "learning_rate": 5.263329508321676e-05, "loss": 0.8657, "step": 1505 }, { "epoch": 0.49193581315585316, "grad_norm": 0.2738785445690155, "learning_rate": 5.258119746679266e-05, "loss": 0.8616, "step": 1506 }, { "epoch": 0.49226246376219834, "grad_norm": 0.28627508878707886, "learning_rate": 5.252909704041318e-05, "loss": 0.7742, "step": 1507 }, { "epoch": 0.4925891143685435, "grad_norm": 0.3042372763156891, "learning_rate": 5.247699386079613e-05, "loss": 0.874, "step": 1508 }, { "epoch": 0.49291576497488876, "grad_norm": 0.328125536441803, "learning_rate": 5.2424887984662294e-05, "loss": 0.8562, "step": 1509 }, { "epoch": 0.49324241558123394, "grad_norm": 0.3250144124031067, "learning_rate": 5.2372779468735446e-05, "loss": 0.9107, "step": 1510 }, { "epoch": 0.4935690661875791, "grad_norm": 0.35796797275543213, "learning_rate": 5.232066836974219e-05, "loss": 0.98, "step": 1511 }, { "epoch": 0.4938957167939243, "grad_norm": 0.3610784113407135, "learning_rate": 5.226855474441197e-05, "loss": 0.9441, "step": 1512 }, { "epoch": 0.4942223674002695, "grad_norm": 0.36936962604522705, "learning_rate": 5.2216438649476954e-05, "loss": 0.9559, "step": 1513 }, { "epoch": 0.49454901800661466, "grad_norm": 0.3782363533973694, "learning_rate": 5.2164320141672006e-05, "loss": 0.8496, "step": 1514 }, { "epoch": 0.49487566861295984, "grad_norm": 0.37794479727745056, "learning_rate": 5.211219927773464e-05, "loss": 0.8517, "step": 1515 }, { "epoch": 0.4952023192193051, "grad_norm": 0.48705726861953735, "learning_rate": 5.206007611440491e-05, "loss": 1.0162, "step": 1516 }, { "epoch": 0.49552896982565026, "grad_norm": 0.4541557729244232, "learning_rate": 5.200795070842539e-05, "loss": 1.0469, "step": 1517 }, { "epoch": 0.49585562043199544, "grad_norm": 0.4859015643596649, "learning_rate": 5.195582311654107e-05, "loss": 0.9468, "step": 1518 }, { "epoch": 0.4961822710383406, "grad_norm": 0.6146647334098816, "learning_rate": 5.190369339549933e-05, "loss": 1.0863, "step": 1519 }, { "epoch": 0.4965089216446858, "grad_norm": 0.6606687307357788, "learning_rate": 5.18515616020499e-05, "loss": 1.3782, "step": 1520 }, { "epoch": 0.496835572251031, "grad_norm": 0.7691556215286255, "learning_rate": 5.179942779294472e-05, "loss": 1.1182, "step": 1521 }, { "epoch": 0.49716222285737616, "grad_norm": 0.9613664746284485, "learning_rate": 5.174729202493794e-05, "loss": 1.3084, "step": 1522 }, { "epoch": 0.4974888734637214, "grad_norm": 1.210338830947876, "learning_rate": 5.169515435478587e-05, "loss": 1.4691, "step": 1523 }, { "epoch": 0.4978155240700666, "grad_norm": 1.183440923690796, "learning_rate": 5.164301483924685e-05, "loss": 1.4419, "step": 1524 }, { "epoch": 0.49814217467641175, "grad_norm": 1.764573574066162, "learning_rate": 5.159087353508125e-05, "loss": 2.0149, "step": 1525 }, { "epoch": 0.49846882528275693, "grad_norm": 0.2246791124343872, "learning_rate": 5.153873049905138e-05, "loss": 0.7747, "step": 1526 }, { "epoch": 0.4987954758891021, "grad_norm": 0.24696335196495056, "learning_rate": 5.1486585787921427e-05, "loss": 0.7843, "step": 1527 }, { "epoch": 0.4991221264954473, "grad_norm": 0.24796541035175323, "learning_rate": 5.1434439458457426e-05, "loss": 0.8426, "step": 1528 }, { "epoch": 0.4994487771017925, "grad_norm": 0.2630245089530945, "learning_rate": 5.1382291567427175e-05, "loss": 0.9231, "step": 1529 }, { "epoch": 0.4997754277081377, "grad_norm": 0.2528044879436493, "learning_rate": 5.133014217160014e-05, "loss": 0.7866, "step": 1530 }, { "epoch": 0.5001020783144828, "grad_norm": 0.2634943425655365, "learning_rate": 5.127799132774744e-05, "loss": 0.7849, "step": 1531 }, { "epoch": 0.500428728920828, "grad_norm": 0.27956002950668335, "learning_rate": 5.122583909264178e-05, "loss": 0.8601, "step": 1532 }, { "epoch": 0.500428728920828, "eval_loss": 1.0430185794830322, "eval_runtime": 500.0584, "eval_samples_per_second": 5.155, "eval_steps_per_second": 2.578, "step": 1532 } ], "logging_steps": 1, "max_steps": 3061, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 766, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0089220027501773e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }