diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last-checkpoint/trainer_state.json" @@ -0,0 +1,5411 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.250214364460414, + "eval_steps": 766, + "global_step": 766, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00032665060634518803, + "grad_norm": 0.22650057077407837, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8308, + "step": 1 + }, + { + "epoch": 0.00032665060634518803, + "eval_loss": 1.508634090423584, + "eval_runtime": 502.8072, + "eval_samples_per_second": 5.127, + "eval_steps_per_second": 2.564, + "step": 1 + }, + { + "epoch": 0.0006533012126903761, + "grad_norm": 0.2044776827096939, + "learning_rate": 4.000000000000001e-06, + "loss": 0.9804, + "step": 2 + }, + { + "epoch": 0.0009799518190355642, + "grad_norm": 0.2313065379858017, + "learning_rate": 6e-06, + "loss": 1.067, + "step": 3 + }, + { + "epoch": 0.0013066024253807521, + "grad_norm": 0.26630067825317383, + "learning_rate": 8.000000000000001e-06, + "loss": 1.0556, + "step": 4 + }, + { + "epoch": 0.0016332530317259401, + "grad_norm": 0.2948451340198517, + "learning_rate": 1e-05, + "loss": 1.117, + "step": 5 + }, + { + "epoch": 0.0019599036380711283, + "grad_norm": 0.2725197970867157, + "learning_rate": 1.2e-05, + "loss": 1.1418, + "step": 6 + }, + { + "epoch": 0.0022865542444163163, + "grad_norm": 0.29806721210479736, + "learning_rate": 1.4000000000000001e-05, + "loss": 1.0675, + "step": 7 + }, + { + "epoch": 0.0026132048507615043, + "grad_norm": 0.2886989414691925, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.1935, + "step": 8 + }, + { + "epoch": 0.0029398554571066922, + "grad_norm": 0.2739482522010803, + "learning_rate": 1.8e-05, + "loss": 1.2054, + "step": 9 + }, + { + "epoch": 0.0032665060634518802, + "grad_norm": 0.28947803378105164, + "learning_rate": 2e-05, + "loss": 1.1734, + "step": 10 + }, + { + "epoch": 0.003593156669797068, + "grad_norm": 0.3251422941684723, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.1786, + "step": 11 + }, + { + "epoch": 0.003919807276142257, + "grad_norm": 0.2556726038455963, + "learning_rate": 2.4e-05, + "loss": 1.25, + "step": 12 + }, + { + "epoch": 0.004246457882487445, + "grad_norm": 0.30588826537132263, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.2101, + "step": 13 + }, + { + "epoch": 0.0045731084888326326, + "grad_norm": 0.3118837773799896, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.2211, + "step": 14 + }, + { + "epoch": 0.0048997590951778205, + "grad_norm": 0.41697078943252563, + "learning_rate": 3e-05, + "loss": 1.4213, + "step": 15 + }, + { + "epoch": 0.0052264097015230085, + "grad_norm": 0.354201078414917, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.3762, + "step": 16 + }, + { + "epoch": 0.0055530603078681965, + "grad_norm": 0.43570902943611145, + "learning_rate": 3.4000000000000007e-05, + "loss": 1.394, + "step": 17 + }, + { + "epoch": 0.0058797109142133845, + "grad_norm": 0.4452773928642273, + "learning_rate": 3.6e-05, + "loss": 1.4338, + "step": 18 + }, + { + "epoch": 0.0062063615205585725, + "grad_norm": 0.3898261487483978, + "learning_rate": 3.8e-05, + "loss": 1.4906, + "step": 19 + }, + { + "epoch": 0.0065330121269037604, + "grad_norm": 0.4592307507991791, + "learning_rate": 4e-05, + "loss": 1.752, + "step": 20 + }, + { + "epoch": 0.006859662733248948, + "grad_norm": 0.590508759021759, + "learning_rate": 4.2e-05, + "loss": 1.7154, + "step": 21 + }, + { + "epoch": 0.007186313339594136, + "grad_norm": 0.6255390048027039, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.8005, + "step": 22 + }, + { + "epoch": 0.007512963945939324, + "grad_norm": 0.7723678946495056, + "learning_rate": 4.600000000000001e-05, + "loss": 2.1126, + "step": 23 + }, + { + "epoch": 0.007839614552284513, + "grad_norm": 1.2195842266082764, + "learning_rate": 4.8e-05, + "loss": 2.4707, + "step": 24 + }, + { + "epoch": 0.0081662651586297, + "grad_norm": 1.7578188180923462, + "learning_rate": 5e-05, + "loss": 3.5729, + "step": 25 + }, + { + "epoch": 0.00849291576497489, + "grad_norm": 0.24383209645748138, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.8761, + "step": 26 + }, + { + "epoch": 0.008819566371320076, + "grad_norm": 0.34012460708618164, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.9546, + "step": 27 + }, + { + "epoch": 0.009146216977665265, + "grad_norm": 0.36797860264778137, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.0231, + "step": 28 + }, + { + "epoch": 0.009472867584010452, + "grad_norm": 0.35745948553085327, + "learning_rate": 5.8e-05, + "loss": 0.9924, + "step": 29 + }, + { + "epoch": 0.009799518190355641, + "grad_norm": 0.3466523587703705, + "learning_rate": 6e-05, + "loss": 1.0294, + "step": 30 + }, + { + "epoch": 0.010126168796700828, + "grad_norm": 0.3309411406517029, + "learning_rate": 6.2e-05, + "loss": 1.0698, + "step": 31 + }, + { + "epoch": 0.010452819403046017, + "grad_norm": 0.3155065178871155, + "learning_rate": 6.400000000000001e-05, + "loss": 1.0731, + "step": 32 + }, + { + "epoch": 0.010779470009391204, + "grad_norm": 0.2679576277732849, + "learning_rate": 6.6e-05, + "loss": 1.0446, + "step": 33 + }, + { + "epoch": 0.011106120615736393, + "grad_norm": 0.2870541214942932, + "learning_rate": 6.800000000000001e-05, + "loss": 1.0705, + "step": 34 + }, + { + "epoch": 0.011432771222081582, + "grad_norm": 0.25413063168525696, + "learning_rate": 7e-05, + "loss": 1.0629, + "step": 35 + }, + { + "epoch": 0.011759421828426769, + "grad_norm": 0.2709527611732483, + "learning_rate": 7.2e-05, + "loss": 1.0573, + "step": 36 + }, + { + "epoch": 0.012086072434771958, + "grad_norm": 0.3181741237640381, + "learning_rate": 7.4e-05, + "loss": 1.1551, + "step": 37 + }, + { + "epoch": 0.012412723041117145, + "grad_norm": 0.32461702823638916, + "learning_rate": 7.6e-05, + "loss": 1.2136, + "step": 38 + }, + { + "epoch": 0.012739373647462334, + "grad_norm": 0.3279399871826172, + "learning_rate": 7.800000000000001e-05, + "loss": 1.1586, + "step": 39 + }, + { + "epoch": 0.013066024253807521, + "grad_norm": 0.4179019033908844, + "learning_rate": 8e-05, + "loss": 1.1029, + "step": 40 + }, + { + "epoch": 0.01339267486015271, + "grad_norm": 0.3966725170612335, + "learning_rate": 8.2e-05, + "loss": 1.1146, + "step": 41 + }, + { + "epoch": 0.013719325466497897, + "grad_norm": 0.40615811944007874, + "learning_rate": 8.4e-05, + "loss": 1.2046, + "step": 42 + }, + { + "epoch": 0.014045976072843086, + "grad_norm": 0.47675764560699463, + "learning_rate": 8.6e-05, + "loss": 1.3448, + "step": 43 + }, + { + "epoch": 0.014372626679188273, + "grad_norm": 0.5963281393051147, + "learning_rate": 8.800000000000001e-05, + "loss": 1.4645, + "step": 44 + }, + { + "epoch": 0.014699277285533462, + "grad_norm": 0.6444793343544006, + "learning_rate": 9e-05, + "loss": 1.5156, + "step": 45 + }, + { + "epoch": 0.015025927891878649, + "grad_norm": 0.946388304233551, + "learning_rate": 9.200000000000001e-05, + "loss": 1.6008, + "step": 46 + }, + { + "epoch": 0.015352578498223838, + "grad_norm": 1.00883150100708, + "learning_rate": 9.4e-05, + "loss": 1.3275, + "step": 47 + }, + { + "epoch": 0.015679229104569026, + "grad_norm": 1.5137501955032349, + "learning_rate": 9.6e-05, + "loss": 1.7482, + "step": 48 + }, + { + "epoch": 0.016005879710914214, + "grad_norm": 1.725042700767517, + "learning_rate": 9.8e-05, + "loss": 1.568, + "step": 49 + }, + { + "epoch": 0.0163325303172594, + "grad_norm": 2.654313564300537, + "learning_rate": 0.0001, + "loss": 2.1327, + "step": 50 + }, + { + "epoch": 0.016659180923604588, + "grad_norm": 0.2734032869338989, + "learning_rate": 9.999997278438182e-05, + "loss": 0.8992, + "step": 51 + }, + { + "epoch": 0.01698583152994978, + "grad_norm": 0.34941044449806213, + "learning_rate": 9.999989113755686e-05, + "loss": 0.8448, + "step": 52 + }, + { + "epoch": 0.017312482136294965, + "grad_norm": 0.4039852023124695, + "learning_rate": 9.999975505961402e-05, + "loss": 1.0142, + "step": 53 + }, + { + "epoch": 0.017639132742640153, + "grad_norm": 0.41006627678871155, + "learning_rate": 9.999956455070144e-05, + "loss": 0.9617, + "step": 54 + }, + { + "epoch": 0.017965783348985343, + "grad_norm": 0.3158213198184967, + "learning_rate": 9.99993196110265e-05, + "loss": 1.0481, + "step": 55 + }, + { + "epoch": 0.01829243395533053, + "grad_norm": 0.34790340065956116, + "learning_rate": 9.99990202408559e-05, + "loss": 0.9547, + "step": 56 + }, + { + "epoch": 0.018619084561675717, + "grad_norm": 0.3128991723060608, + "learning_rate": 9.999866644051546e-05, + "loss": 0.9676, + "step": 57 + }, + { + "epoch": 0.018945735168020904, + "grad_norm": 0.338254451751709, + "learning_rate": 9.99982582103904e-05, + "loss": 1.0016, + "step": 58 + }, + { + "epoch": 0.019272385774366095, + "grad_norm": 0.30693137645721436, + "learning_rate": 9.999779555092509e-05, + "loss": 0.9375, + "step": 59 + }, + { + "epoch": 0.019599036380711282, + "grad_norm": 0.35348108410835266, + "learning_rate": 9.999727846262321e-05, + "loss": 1.074, + "step": 60 + }, + { + "epoch": 0.01992568698705647, + "grad_norm": 0.3581663966178894, + "learning_rate": 9.999670694604768e-05, + "loss": 1.0606, + "step": 61 + }, + { + "epoch": 0.020252337593401656, + "grad_norm": 0.3733651638031006, + "learning_rate": 9.999608100182066e-05, + "loss": 1.1619, + "step": 62 + }, + { + "epoch": 0.020578988199746847, + "grad_norm": 0.4465019702911377, + "learning_rate": 9.999540063062356e-05, + "loss": 1.1113, + "step": 63 + }, + { + "epoch": 0.020905638806092034, + "grad_norm": 0.5543861985206604, + "learning_rate": 9.999466583319708e-05, + "loss": 1.1657, + "step": 64 + }, + { + "epoch": 0.02123228941243722, + "grad_norm": 0.4180990755558014, + "learning_rate": 9.99938766103411e-05, + "loss": 1.1938, + "step": 65 + }, + { + "epoch": 0.02155894001878241, + "grad_norm": 0.4393234848976135, + "learning_rate": 9.99930329629148e-05, + "loss": 1.1258, + "step": 66 + }, + { + "epoch": 0.0218855906251276, + "grad_norm": 0.4633965790271759, + "learning_rate": 9.999213489183659e-05, + "loss": 1.2432, + "step": 67 + }, + { + "epoch": 0.022212241231472786, + "grad_norm": 0.5040664672851562, + "learning_rate": 9.999118239808416e-05, + "loss": 1.1911, + "step": 68 + }, + { + "epoch": 0.022538891837817973, + "grad_norm": 0.6548953652381897, + "learning_rate": 9.99901754826944e-05, + "loss": 1.3242, + "step": 69 + }, + { + "epoch": 0.022865542444163164, + "grad_norm": 0.733322024345398, + "learning_rate": 9.998911414676346e-05, + "loss": 1.4367, + "step": 70 + }, + { + "epoch": 0.02319219305050835, + "grad_norm": 1.140091896057129, + "learning_rate": 9.998799839144675e-05, + "loss": 1.745, + "step": 71 + }, + { + "epoch": 0.023518843656853538, + "grad_norm": 1.2057875394821167, + "learning_rate": 9.998682821795888e-05, + "loss": 1.5357, + "step": 72 + }, + { + "epoch": 0.023845494263198725, + "grad_norm": 1.2741526365280151, + "learning_rate": 9.998560362757376e-05, + "loss": 1.5184, + "step": 73 + }, + { + "epoch": 0.024172144869543916, + "grad_norm": 1.6893465518951416, + "learning_rate": 9.998432462162449e-05, + "loss": 1.7909, + "step": 74 + }, + { + "epoch": 0.024498795475889103, + "grad_norm": 2.0071094036102295, + "learning_rate": 9.998299120150342e-05, + "loss": 2.2062, + "step": 75 + }, + { + "epoch": 0.02482544608223429, + "grad_norm": 0.21972136199474335, + "learning_rate": 9.998160336866219e-05, + "loss": 0.8062, + "step": 76 + }, + { + "epoch": 0.025152096688579477, + "grad_norm": 0.3399101793766022, + "learning_rate": 9.998016112461158e-05, + "loss": 0.9169, + "step": 77 + }, + { + "epoch": 0.025478747294924668, + "grad_norm": 0.3948090374469757, + "learning_rate": 9.997866447092168e-05, + "loss": 1.0111, + "step": 78 + }, + { + "epoch": 0.025805397901269855, + "grad_norm": 0.3974277675151825, + "learning_rate": 9.997711340922177e-05, + "loss": 0.9985, + "step": 79 + }, + { + "epoch": 0.026132048507615042, + "grad_norm": 0.3781171143054962, + "learning_rate": 9.997550794120039e-05, + "loss": 0.8534, + "step": 80 + }, + { + "epoch": 0.02645869911396023, + "grad_norm": 0.44847583770751953, + "learning_rate": 9.997384806860526e-05, + "loss": 0.9546, + "step": 81 + }, + { + "epoch": 0.02678534972030542, + "grad_norm": 0.33786600828170776, + "learning_rate": 9.99721337932434e-05, + "loss": 0.9573, + "step": 82 + }, + { + "epoch": 0.027112000326650607, + "grad_norm": 0.30058935284614563, + "learning_rate": 9.997036511698098e-05, + "loss": 0.8942, + "step": 83 + }, + { + "epoch": 0.027438650932995794, + "grad_norm": 0.2836489677429199, + "learning_rate": 9.996854204174344e-05, + "loss": 0.9423, + "step": 84 + }, + { + "epoch": 0.02776530153934098, + "grad_norm": 0.29414936900138855, + "learning_rate": 9.996666456951542e-05, + "loss": 1.0213, + "step": 85 + }, + { + "epoch": 0.02809195214568617, + "grad_norm": 0.2995252311229706, + "learning_rate": 9.99647327023408e-05, + "loss": 0.9666, + "step": 86 + }, + { + "epoch": 0.02841860275203136, + "grad_norm": 0.34682995080947876, + "learning_rate": 9.996274644232261e-05, + "loss": 1.0678, + "step": 87 + }, + { + "epoch": 0.028745253358376546, + "grad_norm": 0.3426355719566345, + "learning_rate": 9.99607057916232e-05, + "loss": 1.0813, + "step": 88 + }, + { + "epoch": 0.029071903964721736, + "grad_norm": 0.3705107271671295, + "learning_rate": 9.995861075246405e-05, + "loss": 1.0771, + "step": 89 + }, + { + "epoch": 0.029398554571066923, + "grad_norm": 0.37640541791915894, + "learning_rate": 9.995646132712586e-05, + "loss": 1.1776, + "step": 90 + }, + { + "epoch": 0.02972520517741211, + "grad_norm": 0.4144461452960968, + "learning_rate": 9.995425751794856e-05, + "loss": 1.0535, + "step": 91 + }, + { + "epoch": 0.030051855783757297, + "grad_norm": 0.5124989748001099, + "learning_rate": 9.995199932733126e-05, + "loss": 1.2394, + "step": 92 + }, + { + "epoch": 0.030378506390102488, + "grad_norm": 0.4783899784088135, + "learning_rate": 9.994968675773228e-05, + "loss": 1.0975, + "step": 93 + }, + { + "epoch": 0.030705156996447675, + "grad_norm": 0.568493127822876, + "learning_rate": 9.994731981166918e-05, + "loss": 1.1831, + "step": 94 + }, + { + "epoch": 0.031031807602792862, + "grad_norm": 0.7573971748352051, + "learning_rate": 9.994489849171863e-05, + "loss": 1.5349, + "step": 95 + }, + { + "epoch": 0.03135845820913805, + "grad_norm": 0.8267963528633118, + "learning_rate": 9.994242280051656e-05, + "loss": 1.4655, + "step": 96 + }, + { + "epoch": 0.03168510881548324, + "grad_norm": 1.118879795074463, + "learning_rate": 9.993989274075806e-05, + "loss": 1.7716, + "step": 97 + }, + { + "epoch": 0.03201175942182843, + "grad_norm": 1.2966365814208984, + "learning_rate": 9.99373083151974e-05, + "loss": 1.5928, + "step": 98 + }, + { + "epoch": 0.032338410028173614, + "grad_norm": 1.382350206375122, + "learning_rate": 9.99346695266481e-05, + "loss": 1.5455, + "step": 99 + }, + { + "epoch": 0.0326650606345188, + "grad_norm": 2.4098920822143555, + "learning_rate": 9.993197637798277e-05, + "loss": 2.2737, + "step": 100 + }, + { + "epoch": 0.03299171124086399, + "grad_norm": 0.18814091384410858, + "learning_rate": 9.992922887213324e-05, + "loss": 0.7321, + "step": 101 + }, + { + "epoch": 0.033318361847209176, + "grad_norm": 0.21184566617012024, + "learning_rate": 9.992642701209051e-05, + "loss": 0.8945, + "step": 102 + }, + { + "epoch": 0.03364501245355437, + "grad_norm": 0.21299266815185547, + "learning_rate": 9.992357080090479e-05, + "loss": 0.9516, + "step": 103 + }, + { + "epoch": 0.03397166305989956, + "grad_norm": 0.21715138852596283, + "learning_rate": 9.992066024168539e-05, + "loss": 0.9654, + "step": 104 + }, + { + "epoch": 0.034298313666244744, + "grad_norm": 0.25743889808654785, + "learning_rate": 9.991769533760082e-05, + "loss": 0.8758, + "step": 105 + }, + { + "epoch": 0.03462496427258993, + "grad_norm": 0.24404282867908478, + "learning_rate": 9.991467609187875e-05, + "loss": 0.9195, + "step": 106 + }, + { + "epoch": 0.03495161487893512, + "grad_norm": 0.24752084910869598, + "learning_rate": 9.9911602507806e-05, + "loss": 0.8898, + "step": 107 + }, + { + "epoch": 0.035278265485280305, + "grad_norm": 0.28402474522590637, + "learning_rate": 9.990847458872857e-05, + "loss": 0.9336, + "step": 108 + }, + { + "epoch": 0.03560491609162549, + "grad_norm": 0.24856555461883545, + "learning_rate": 9.990529233805157e-05, + "loss": 0.9425, + "step": 109 + }, + { + "epoch": 0.035931566697970686, + "grad_norm": 0.28930795192718506, + "learning_rate": 9.990205575923927e-05, + "loss": 0.9346, + "step": 110 + }, + { + "epoch": 0.03625821730431587, + "grad_norm": 0.3242413401603699, + "learning_rate": 9.989876485581513e-05, + "loss": 0.8545, + "step": 111 + }, + { + "epoch": 0.03658486791066106, + "grad_norm": 0.3469846844673157, + "learning_rate": 9.989541963136166e-05, + "loss": 0.9725, + "step": 112 + }, + { + "epoch": 0.03691151851700625, + "grad_norm": 0.42651814222335815, + "learning_rate": 9.98920200895206e-05, + "loss": 1.136, + "step": 113 + }, + { + "epoch": 0.037238169123351435, + "grad_norm": 0.4039931297302246, + "learning_rate": 9.988856623399272e-05, + "loss": 1.0276, + "step": 114 + }, + { + "epoch": 0.03756481972969662, + "grad_norm": 0.46665310859680176, + "learning_rate": 9.988505806853803e-05, + "loss": 1.1116, + "step": 115 + }, + { + "epoch": 0.03789147033604181, + "grad_norm": 0.46870648860931396, + "learning_rate": 9.988149559697556e-05, + "loss": 1.09, + "step": 116 + }, + { + "epoch": 0.038218120942386996, + "grad_norm": 0.5010095238685608, + "learning_rate": 9.987787882318353e-05, + "loss": 1.2364, + "step": 117 + }, + { + "epoch": 0.03854477154873219, + "grad_norm": 0.5289409756660461, + "learning_rate": 9.987420775109926e-05, + "loss": 1.1562, + "step": 118 + }, + { + "epoch": 0.03887142215507738, + "grad_norm": 0.5613381862640381, + "learning_rate": 9.987048238471913e-05, + "loss": 1.3162, + "step": 119 + }, + { + "epoch": 0.039198072761422564, + "grad_norm": 0.6171817779541016, + "learning_rate": 9.98667027280987e-05, + "loss": 1.2698, + "step": 120 + }, + { + "epoch": 0.03952472336776775, + "grad_norm": 0.7703320384025574, + "learning_rate": 9.986286878535258e-05, + "loss": 1.4442, + "step": 121 + }, + { + "epoch": 0.03985137397411294, + "grad_norm": 0.8612945675849915, + "learning_rate": 9.98589805606545e-05, + "loss": 1.4071, + "step": 122 + }, + { + "epoch": 0.040178024580458126, + "grad_norm": 1.3705520629882812, + "learning_rate": 9.985503805823729e-05, + "loss": 1.4146, + "step": 123 + }, + { + "epoch": 0.04050467518680331, + "grad_norm": 1.9134920835494995, + "learning_rate": 9.985104128239284e-05, + "loss": 2.0188, + "step": 124 + }, + { + "epoch": 0.04083132579314851, + "grad_norm": 2.2865049839019775, + "learning_rate": 9.984699023747215e-05, + "loss": 1.9591, + "step": 125 + }, + { + "epoch": 0.041157976399493694, + "grad_norm": 0.17751002311706543, + "learning_rate": 9.984288492788527e-05, + "loss": 0.8177, + "step": 126 + }, + { + "epoch": 0.04148462700583888, + "grad_norm": 0.18720225989818573, + "learning_rate": 9.983872535810137e-05, + "loss": 0.796, + "step": 127 + }, + { + "epoch": 0.04181127761218407, + "grad_norm": 0.2171001136302948, + "learning_rate": 9.983451153264862e-05, + "loss": 0.893, + "step": 128 + }, + { + "epoch": 0.042137928218529255, + "grad_norm": 0.23457252979278564, + "learning_rate": 9.983024345611434e-05, + "loss": 0.985, + "step": 129 + }, + { + "epoch": 0.04246457882487444, + "grad_norm": 0.26218998432159424, + "learning_rate": 9.982592113314484e-05, + "loss": 0.9618, + "step": 130 + }, + { + "epoch": 0.04279122943121963, + "grad_norm": 0.23409634828567505, + "learning_rate": 9.98215445684455e-05, + "loss": 0.8384, + "step": 131 + }, + { + "epoch": 0.04311788003756482, + "grad_norm": 0.25260603427886963, + "learning_rate": 9.981711376678077e-05, + "loss": 0.9468, + "step": 132 + }, + { + "epoch": 0.04344453064391001, + "grad_norm": 0.25879567861557007, + "learning_rate": 9.981262873297412e-05, + "loss": 0.9695, + "step": 133 + }, + { + "epoch": 0.0437711812502552, + "grad_norm": 0.27561235427856445, + "learning_rate": 9.980808947190809e-05, + "loss": 0.9748, + "step": 134 + }, + { + "epoch": 0.044097831856600385, + "grad_norm": 0.2770557701587677, + "learning_rate": 9.98034959885242e-05, + "loss": 0.9308, + "step": 135 + }, + { + "epoch": 0.04442448246294557, + "grad_norm": 0.31946492195129395, + "learning_rate": 9.979884828782305e-05, + "loss": 1.0077, + "step": 136 + }, + { + "epoch": 0.04475113306929076, + "grad_norm": 0.3412545621395111, + "learning_rate": 9.979414637486424e-05, + "loss": 1.0537, + "step": 137 + }, + { + "epoch": 0.045077783675635946, + "grad_norm": 0.3484194874763489, + "learning_rate": 9.978939025476639e-05, + "loss": 1.0126, + "step": 138 + }, + { + "epoch": 0.04540443428198113, + "grad_norm": 0.36984875798225403, + "learning_rate": 9.978457993270713e-05, + "loss": 1.0589, + "step": 139 + }, + { + "epoch": 0.04573108488832633, + "grad_norm": 0.46515074372291565, + "learning_rate": 9.97797154139231e-05, + "loss": 1.2081, + "step": 140 + }, + { + "epoch": 0.046057735494671515, + "grad_norm": 0.42576462030410767, + "learning_rate": 9.97747967037099e-05, + "loss": 1.1499, + "step": 141 + }, + { + "epoch": 0.0463843861010167, + "grad_norm": 0.47039994597435, + "learning_rate": 9.976982380742221e-05, + "loss": 1.1558, + "step": 142 + }, + { + "epoch": 0.04671103670736189, + "grad_norm": 0.49022629857063293, + "learning_rate": 9.976479673047363e-05, + "loss": 1.1308, + "step": 143 + }, + { + "epoch": 0.047037687313707076, + "grad_norm": 0.6336291432380676, + "learning_rate": 9.975971547833674e-05, + "loss": 1.4362, + "step": 144 + }, + { + "epoch": 0.04736433792005226, + "grad_norm": 0.8383249640464783, + "learning_rate": 9.975458005654314e-05, + "loss": 1.5723, + "step": 145 + }, + { + "epoch": 0.04769098852639745, + "grad_norm": 0.9654211401939392, + "learning_rate": 9.974939047068337e-05, + "loss": 1.4471, + "step": 146 + }, + { + "epoch": 0.04801763913274264, + "grad_norm": 1.1386771202087402, + "learning_rate": 9.974414672640693e-05, + "loss": 1.5832, + "step": 147 + }, + { + "epoch": 0.04834428973908783, + "grad_norm": 1.7021054029464722, + "learning_rate": 9.973884882942232e-05, + "loss": 2.0502, + "step": 148 + }, + { + "epoch": 0.04867094034543302, + "grad_norm": 1.9076801538467407, + "learning_rate": 9.973349678549692e-05, + "loss": 2.0686, + "step": 149 + }, + { + "epoch": 0.048997590951778205, + "grad_norm": 1.787797451019287, + "learning_rate": 9.972809060045714e-05, + "loss": 2.0867, + "step": 150 + }, + { + "epoch": 0.04932424155812339, + "grad_norm": 0.1957893669605255, + "learning_rate": 9.972263028018826e-05, + "loss": 0.7799, + "step": 151 + }, + { + "epoch": 0.04965089216446858, + "grad_norm": 0.2222263514995575, + "learning_rate": 9.971711583063452e-05, + "loss": 0.887, + "step": 152 + }, + { + "epoch": 0.04997754277081377, + "grad_norm": 0.22523343563079834, + "learning_rate": 9.97115472577991e-05, + "loss": 0.941, + "step": 153 + }, + { + "epoch": 0.050304193377158954, + "grad_norm": 0.23509541153907776, + "learning_rate": 9.970592456774408e-05, + "loss": 0.9283, + "step": 154 + }, + { + "epoch": 0.05063084398350414, + "grad_norm": 0.23556609451770782, + "learning_rate": 9.970024776659046e-05, + "loss": 0.9119, + "step": 155 + }, + { + "epoch": 0.050957494589849335, + "grad_norm": 0.2595606744289398, + "learning_rate": 9.969451686051814e-05, + "loss": 0.9602, + "step": 156 + }, + { + "epoch": 0.05128414519619452, + "grad_norm": 0.2726079523563385, + "learning_rate": 9.968873185576593e-05, + "loss": 0.9543, + "step": 157 + }, + { + "epoch": 0.05161079580253971, + "grad_norm": 0.27437275648117065, + "learning_rate": 9.968289275863152e-05, + "loss": 0.985, + "step": 158 + }, + { + "epoch": 0.051937446408884896, + "grad_norm": 0.2603570222854614, + "learning_rate": 9.967699957547152e-05, + "loss": 0.8353, + "step": 159 + }, + { + "epoch": 0.052264097015230083, + "grad_norm": 0.28538116812705994, + "learning_rate": 9.967105231270137e-05, + "loss": 0.982, + "step": 160 + }, + { + "epoch": 0.05259074762157527, + "grad_norm": 0.28932714462280273, + "learning_rate": 9.966505097679542e-05, + "loss": 0.8386, + "step": 161 + }, + { + "epoch": 0.05291739822792046, + "grad_norm": 0.31023868918418884, + "learning_rate": 9.965899557428686e-05, + "loss": 0.9743, + "step": 162 + }, + { + "epoch": 0.05324404883426565, + "grad_norm": 0.3417147696018219, + "learning_rate": 9.965288611176777e-05, + "loss": 0.998, + "step": 163 + }, + { + "epoch": 0.05357069944061084, + "grad_norm": 0.35479384660720825, + "learning_rate": 9.964672259588905e-05, + "loss": 1.0415, + "step": 164 + }, + { + "epoch": 0.053897350046956026, + "grad_norm": 0.3958076238632202, + "learning_rate": 9.964050503336047e-05, + "loss": 0.9943, + "step": 165 + }, + { + "epoch": 0.05422400065330121, + "grad_norm": 0.4142689108848572, + "learning_rate": 9.96342334309506e-05, + "loss": 1.1481, + "step": 166 + }, + { + "epoch": 0.0545506512596464, + "grad_norm": 0.46137475967407227, + "learning_rate": 9.962790779548688e-05, + "loss": 1.2285, + "step": 167 + }, + { + "epoch": 0.05487730186599159, + "grad_norm": 0.4913260340690613, + "learning_rate": 9.962152813385554e-05, + "loss": 1.2842, + "step": 168 + }, + { + "epoch": 0.055203952472336774, + "grad_norm": 0.5377975106239319, + "learning_rate": 9.961509445300163e-05, + "loss": 1.1915, + "step": 169 + }, + { + "epoch": 0.05553060307868196, + "grad_norm": 0.607846200466156, + "learning_rate": 9.960860675992904e-05, + "loss": 1.1763, + "step": 170 + }, + { + "epoch": 0.055857253685027156, + "grad_norm": 0.6850741505622864, + "learning_rate": 9.960206506170042e-05, + "loss": 1.3445, + "step": 171 + }, + { + "epoch": 0.05618390429137234, + "grad_norm": 0.849875807762146, + "learning_rate": 9.959546936543722e-05, + "loss": 1.5409, + "step": 172 + }, + { + "epoch": 0.05651055489771753, + "grad_norm": 0.9521230459213257, + "learning_rate": 9.95888196783197e-05, + "loss": 1.4537, + "step": 173 + }, + { + "epoch": 0.05683720550406272, + "grad_norm": 1.3079942464828491, + "learning_rate": 9.958211600758683e-05, + "loss": 1.6594, + "step": 174 + }, + { + "epoch": 0.057163856110407904, + "grad_norm": 2.0800368785858154, + "learning_rate": 9.957535836053644e-05, + "loss": 2.1626, + "step": 175 + }, + { + "epoch": 0.05749050671675309, + "grad_norm": 0.234994575381279, + "learning_rate": 9.956854674452504e-05, + "loss": 0.8112, + "step": 176 + }, + { + "epoch": 0.05781715732309828, + "grad_norm": 0.2850241959095001, + "learning_rate": 9.956168116696794e-05, + "loss": 0.9069, + "step": 177 + }, + { + "epoch": 0.05814380792944347, + "grad_norm": 0.2719287574291229, + "learning_rate": 9.955476163533915e-05, + "loss": 0.8626, + "step": 178 + }, + { + "epoch": 0.05847045853578866, + "grad_norm": 0.2707035541534424, + "learning_rate": 9.954778815717147e-05, + "loss": 0.8741, + "step": 179 + }, + { + "epoch": 0.05879710914213385, + "grad_norm": 0.2905394732952118, + "learning_rate": 9.954076074005641e-05, + "loss": 0.8914, + "step": 180 + }, + { + "epoch": 0.059123759748479034, + "grad_norm": 0.29349029064178467, + "learning_rate": 9.953367939164418e-05, + "loss": 0.9079, + "step": 181 + }, + { + "epoch": 0.05945041035482422, + "grad_norm": 0.26738202571868896, + "learning_rate": 9.952654411964368e-05, + "loss": 0.9479, + "step": 182 + }, + { + "epoch": 0.05977706096116941, + "grad_norm": 0.2822587192058563, + "learning_rate": 9.951935493182259e-05, + "loss": 0.9442, + "step": 183 + }, + { + "epoch": 0.060103711567514595, + "grad_norm": 0.2845873236656189, + "learning_rate": 9.95121118360072e-05, + "loss": 0.9895, + "step": 184 + }, + { + "epoch": 0.06043036217385978, + "grad_norm": 0.2944308817386627, + "learning_rate": 9.950481484008256e-05, + "loss": 0.9662, + "step": 185 + }, + { + "epoch": 0.060757012780204976, + "grad_norm": 0.3118637204170227, + "learning_rate": 9.949746395199233e-05, + "loss": 0.9222, + "step": 186 + }, + { + "epoch": 0.06108366338655016, + "grad_norm": 0.32590070366859436, + "learning_rate": 9.949005917973888e-05, + "loss": 0.9805, + "step": 187 + }, + { + "epoch": 0.06141031399289535, + "grad_norm": 0.32632124423980713, + "learning_rate": 9.948260053138323e-05, + "loss": 0.939, + "step": 188 + }, + { + "epoch": 0.06173696459924054, + "grad_norm": 0.38124093413352966, + "learning_rate": 9.947508801504503e-05, + "loss": 1.0358, + "step": 189 + }, + { + "epoch": 0.062063615205585725, + "grad_norm": 0.39029499888420105, + "learning_rate": 9.946752163890263e-05, + "loss": 1.0518, + "step": 190 + }, + { + "epoch": 0.06239026581193091, + "grad_norm": 0.38209250569343567, + "learning_rate": 9.945990141119295e-05, + "loss": 1.0458, + "step": 191 + }, + { + "epoch": 0.0627169164182761, + "grad_norm": 0.4655872583389282, + "learning_rate": 9.945222734021154e-05, + "loss": 1.1645, + "step": 192 + }, + { + "epoch": 0.06304356702462129, + "grad_norm": 0.4615199863910675, + "learning_rate": 9.944449943431262e-05, + "loss": 1.0565, + "step": 193 + }, + { + "epoch": 0.06337021763096648, + "grad_norm": 0.48887377977371216, + "learning_rate": 9.943671770190896e-05, + "loss": 1.1269, + "step": 194 + }, + { + "epoch": 0.06369686823731166, + "grad_norm": 0.5362581014633179, + "learning_rate": 9.942888215147193e-05, + "loss": 1.2089, + "step": 195 + }, + { + "epoch": 0.06402351884365685, + "grad_norm": 0.633625864982605, + "learning_rate": 9.942099279153154e-05, + "loss": 1.2551, + "step": 196 + }, + { + "epoch": 0.06435016945000205, + "grad_norm": 0.7443326711654663, + "learning_rate": 9.941304963067632e-05, + "loss": 1.24, + "step": 197 + }, + { + "epoch": 0.06467682005634723, + "grad_norm": 1.032091736793518, + "learning_rate": 9.940505267755341e-05, + "loss": 1.3817, + "step": 198 + }, + { + "epoch": 0.06500347066269242, + "grad_norm": 1.4411945343017578, + "learning_rate": 9.939700194086847e-05, + "loss": 1.5045, + "step": 199 + }, + { + "epoch": 0.0653301212690376, + "grad_norm": 2.1002960205078125, + "learning_rate": 9.938889742938575e-05, + "loss": 1.7348, + "step": 200 + }, + { + "epoch": 0.0656567718753828, + "grad_norm": 0.2001526653766632, + "learning_rate": 9.938073915192798e-05, + "loss": 0.7617, + "step": 201 + }, + { + "epoch": 0.06598342248172798, + "grad_norm": 0.22948387265205383, + "learning_rate": 9.937252711737652e-05, + "loss": 0.815, + "step": 202 + }, + { + "epoch": 0.06631007308807317, + "grad_norm": 0.2272295504808426, + "learning_rate": 9.936426133467115e-05, + "loss": 0.8564, + "step": 203 + }, + { + "epoch": 0.06663672369441835, + "grad_norm": 0.25728172063827515, + "learning_rate": 9.935594181281022e-05, + "loss": 0.9621, + "step": 204 + }, + { + "epoch": 0.06696337430076355, + "grad_norm": 0.2590310275554657, + "learning_rate": 9.934756856085059e-05, + "loss": 0.9696, + "step": 205 + }, + { + "epoch": 0.06729002490710874, + "grad_norm": 0.2755409777164459, + "learning_rate": 9.933914158790756e-05, + "loss": 0.9406, + "step": 206 + }, + { + "epoch": 0.06761667551345392, + "grad_norm": 0.2778630256652832, + "learning_rate": 9.933066090315494e-05, + "loss": 0.9644, + "step": 207 + }, + { + "epoch": 0.06794332611979911, + "grad_norm": 0.29143768548965454, + "learning_rate": 9.932212651582502e-05, + "loss": 0.9866, + "step": 208 + }, + { + "epoch": 0.0682699767261443, + "grad_norm": 0.28746140003204346, + "learning_rate": 9.931353843520856e-05, + "loss": 0.9637, + "step": 209 + }, + { + "epoch": 0.06859662733248949, + "grad_norm": 0.3338890075683594, + "learning_rate": 9.930489667065474e-05, + "loss": 0.9844, + "step": 210 + }, + { + "epoch": 0.06892327793883467, + "grad_norm": 0.32576242089271545, + "learning_rate": 9.929620123157121e-05, + "loss": 1.0607, + "step": 211 + }, + { + "epoch": 0.06924992854517986, + "grad_norm": 0.3407037556171417, + "learning_rate": 9.928745212742403e-05, + "loss": 1.0002, + "step": 212 + }, + { + "epoch": 0.06957657915152506, + "grad_norm": 0.35422274470329285, + "learning_rate": 9.927864936773769e-05, + "loss": 0.962, + "step": 213 + }, + { + "epoch": 0.06990322975787024, + "grad_norm": 0.42136797308921814, + "learning_rate": 9.926979296209509e-05, + "loss": 1.1253, + "step": 214 + }, + { + "epoch": 0.07022988036421543, + "grad_norm": 0.4093579649925232, + "learning_rate": 9.926088292013755e-05, + "loss": 1.0692, + "step": 215 + }, + { + "epoch": 0.07055653097056061, + "grad_norm": 0.48680227994918823, + "learning_rate": 9.925191925156474e-05, + "loss": 1.1906, + "step": 216 + }, + { + "epoch": 0.0708831815769058, + "grad_norm": 0.4694482088088989, + "learning_rate": 9.924290196613475e-05, + "loss": 1.1077, + "step": 217 + }, + { + "epoch": 0.07120983218325098, + "grad_norm": 0.485873818397522, + "learning_rate": 9.923383107366402e-05, + "loss": 1.1221, + "step": 218 + }, + { + "epoch": 0.07153648278959618, + "grad_norm": 0.5874084830284119, + "learning_rate": 9.922470658402731e-05, + "loss": 1.2042, + "step": 219 + }, + { + "epoch": 0.07186313339594137, + "grad_norm": 0.682409405708313, + "learning_rate": 9.921552850715783e-05, + "loss": 1.337, + "step": 220 + }, + { + "epoch": 0.07218978400228655, + "grad_norm": 0.8160572052001953, + "learning_rate": 9.920629685304701e-05, + "loss": 1.5398, + "step": 221 + }, + { + "epoch": 0.07251643460863175, + "grad_norm": 0.9582510590553284, + "learning_rate": 9.919701163174466e-05, + "loss": 1.3404, + "step": 222 + }, + { + "epoch": 0.07284308521497693, + "grad_norm": 1.3406293392181396, + "learning_rate": 9.918767285335892e-05, + "loss": 1.6518, + "step": 223 + }, + { + "epoch": 0.07316973582132212, + "grad_norm": 1.5214787721633911, + "learning_rate": 9.917828052805622e-05, + "loss": 1.8417, + "step": 224 + }, + { + "epoch": 0.0734963864276673, + "grad_norm": 2.5116734504699707, + "learning_rate": 9.916883466606127e-05, + "loss": 2.2562, + "step": 225 + }, + { + "epoch": 0.0738230370340125, + "grad_norm": 0.19113443791866302, + "learning_rate": 9.915933527765707e-05, + "loss": 0.8004, + "step": 226 + }, + { + "epoch": 0.07414968764035769, + "grad_norm": 0.20771871507167816, + "learning_rate": 9.914978237318487e-05, + "loss": 0.8554, + "step": 227 + }, + { + "epoch": 0.07447633824670287, + "grad_norm": 0.21872685849666595, + "learning_rate": 9.914017596304421e-05, + "loss": 0.9158, + "step": 228 + }, + { + "epoch": 0.07480298885304806, + "grad_norm": 0.23601579666137695, + "learning_rate": 9.913051605769288e-05, + "loss": 0.8477, + "step": 229 + }, + { + "epoch": 0.07512963945939324, + "grad_norm": 0.24787437915802002, + "learning_rate": 9.912080266764687e-05, + "loss": 0.8886, + "step": 230 + }, + { + "epoch": 0.07545629006573844, + "grad_norm": 0.24604691565036774, + "learning_rate": 9.911103580348044e-05, + "loss": 0.9259, + "step": 231 + }, + { + "epoch": 0.07578294067208362, + "grad_norm": 0.2649064362049103, + "learning_rate": 9.910121547582601e-05, + "loss": 0.9131, + "step": 232 + }, + { + "epoch": 0.07610959127842881, + "grad_norm": 0.2932969033718109, + "learning_rate": 9.909134169537426e-05, + "loss": 0.8468, + "step": 233 + }, + { + "epoch": 0.07643624188477399, + "grad_norm": 0.27797630429267883, + "learning_rate": 9.908141447287403e-05, + "loss": 0.9126, + "step": 234 + }, + { + "epoch": 0.07676289249111919, + "grad_norm": 0.29813864827156067, + "learning_rate": 9.907143381913231e-05, + "loss": 0.8744, + "step": 235 + }, + { + "epoch": 0.07708954309746438, + "grad_norm": 0.31166505813598633, + "learning_rate": 9.906139974501432e-05, + "loss": 0.9738, + "step": 236 + }, + { + "epoch": 0.07741619370380956, + "grad_norm": 0.3513356149196625, + "learning_rate": 9.905131226144337e-05, + "loss": 0.9285, + "step": 237 + }, + { + "epoch": 0.07774284431015475, + "grad_norm": 0.356487512588501, + "learning_rate": 9.904117137940099e-05, + "loss": 0.9841, + "step": 238 + }, + { + "epoch": 0.07806949491649993, + "grad_norm": 0.3558920621871948, + "learning_rate": 9.903097710992675e-05, + "loss": 1.0901, + "step": 239 + }, + { + "epoch": 0.07839614552284513, + "grad_norm": 0.4005250036716461, + "learning_rate": 9.90207294641184e-05, + "loss": 1.1738, + "step": 240 + }, + { + "epoch": 0.07872279612919031, + "grad_norm": 0.4491129517555237, + "learning_rate": 9.901042845313178e-05, + "loss": 1.17, + "step": 241 + }, + { + "epoch": 0.0790494467355355, + "grad_norm": 0.4596916139125824, + "learning_rate": 9.900007408818082e-05, + "loss": 1.1344, + "step": 242 + }, + { + "epoch": 0.0793760973418807, + "grad_norm": 0.48606646060943604, + "learning_rate": 9.898966638053755e-05, + "loss": 1.0872, + "step": 243 + }, + { + "epoch": 0.07970274794822588, + "grad_norm": 0.5776427388191223, + "learning_rate": 9.897920534153207e-05, + "loss": 1.2504, + "step": 244 + }, + { + "epoch": 0.08002939855457107, + "grad_norm": 0.5997623801231384, + "learning_rate": 9.896869098255249e-05, + "loss": 1.1232, + "step": 245 + }, + { + "epoch": 0.08035604916091625, + "grad_norm": 0.7988643646240234, + "learning_rate": 9.895812331504502e-05, + "loss": 1.4547, + "step": 246 + }, + { + "epoch": 0.08068269976726145, + "grad_norm": 0.9789859056472778, + "learning_rate": 9.894750235051389e-05, + "loss": 1.4768, + "step": 247 + }, + { + "epoch": 0.08100935037360663, + "grad_norm": 1.010258674621582, + "learning_rate": 9.893682810052132e-05, + "loss": 1.8305, + "step": 248 + }, + { + "epoch": 0.08133600097995182, + "grad_norm": 1.1983157396316528, + "learning_rate": 9.89261005766876e-05, + "loss": 1.5781, + "step": 249 + }, + { + "epoch": 0.08166265158629701, + "grad_norm": 1.4921436309814453, + "learning_rate": 9.891531979069096e-05, + "loss": 2.0553, + "step": 250 + }, + { + "epoch": 0.0819893021926422, + "grad_norm": 0.21676473319530487, + "learning_rate": 9.890448575426761e-05, + "loss": 0.6872, + "step": 251 + }, + { + "epoch": 0.08231595279898739, + "grad_norm": 0.22214944660663605, + "learning_rate": 9.889359847921176e-05, + "loss": 0.7853, + "step": 252 + }, + { + "epoch": 0.08264260340533257, + "grad_norm": 0.2550159990787506, + "learning_rate": 9.888265797737561e-05, + "loss": 0.8511, + "step": 253 + }, + { + "epoch": 0.08296925401167776, + "grad_norm": 0.2533634603023529, + "learning_rate": 9.887166426066921e-05, + "loss": 0.8282, + "step": 254 + }, + { + "epoch": 0.08329590461802294, + "grad_norm": 0.29216915369033813, + "learning_rate": 9.886061734106061e-05, + "loss": 0.9312, + "step": 255 + }, + { + "epoch": 0.08362255522436814, + "grad_norm": 0.2742856442928314, + "learning_rate": 9.884951723057574e-05, + "loss": 0.9006, + "step": 256 + }, + { + "epoch": 0.08394920583071332, + "grad_norm": 0.29381704330444336, + "learning_rate": 9.883836394129849e-05, + "loss": 0.9064, + "step": 257 + }, + { + "epoch": 0.08427585643705851, + "grad_norm": 0.2808193862438202, + "learning_rate": 9.882715748537056e-05, + "loss": 0.9602, + "step": 258 + }, + { + "epoch": 0.0846025070434037, + "grad_norm": 0.3137573003768921, + "learning_rate": 9.881589787499164e-05, + "loss": 0.9881, + "step": 259 + }, + { + "epoch": 0.08492915764974888, + "grad_norm": 0.32701578736305237, + "learning_rate": 9.880458512241917e-05, + "loss": 0.9211, + "step": 260 + }, + { + "epoch": 0.08525580825609408, + "grad_norm": 0.34949010610580444, + "learning_rate": 9.879321923996852e-05, + "loss": 1.0025, + "step": 261 + }, + { + "epoch": 0.08558245886243926, + "grad_norm": 0.3389429450035095, + "learning_rate": 9.878180024001283e-05, + "loss": 0.956, + "step": 262 + }, + { + "epoch": 0.08590910946878445, + "grad_norm": 0.41225770115852356, + "learning_rate": 9.877032813498315e-05, + "loss": 1.1321, + "step": 263 + }, + { + "epoch": 0.08623576007512963, + "grad_norm": 0.39792096614837646, + "learning_rate": 9.875880293736828e-05, + "loss": 1.0584, + "step": 264 + }, + { + "epoch": 0.08656241068147483, + "grad_norm": 0.447539359331131, + "learning_rate": 9.874722465971483e-05, + "loss": 1.1351, + "step": 265 + }, + { + "epoch": 0.08688906128782002, + "grad_norm": 0.4593818783760071, + "learning_rate": 9.87355933146272e-05, + "loss": 1.1266, + "step": 266 + }, + { + "epoch": 0.0872157118941652, + "grad_norm": 0.554883599281311, + "learning_rate": 9.872390891476757e-05, + "loss": 1.1753, + "step": 267 + }, + { + "epoch": 0.0875423625005104, + "grad_norm": 0.5333263874053955, + "learning_rate": 9.871217147285588e-05, + "loss": 1.1131, + "step": 268 + }, + { + "epoch": 0.08786901310685558, + "grad_norm": 0.7088521122932434, + "learning_rate": 9.870038100166973e-05, + "loss": 1.3864, + "step": 269 + }, + { + "epoch": 0.08819566371320077, + "grad_norm": 0.7951180934906006, + "learning_rate": 9.868853751404461e-05, + "loss": 1.3547, + "step": 270 + }, + { + "epoch": 0.08852231431954595, + "grad_norm": 0.9721757173538208, + "learning_rate": 9.867664102287359e-05, + "loss": 1.6935, + "step": 271 + }, + { + "epoch": 0.08884896492589114, + "grad_norm": 1.0832263231277466, + "learning_rate": 9.866469154110748e-05, + "loss": 1.3659, + "step": 272 + }, + { + "epoch": 0.08917561553223634, + "grad_norm": 1.3910161256790161, + "learning_rate": 9.86526890817548e-05, + "loss": 1.5507, + "step": 273 + }, + { + "epoch": 0.08950226613858152, + "grad_norm": 1.494617223739624, + "learning_rate": 9.864063365788169e-05, + "loss": 1.974, + "step": 274 + }, + { + "epoch": 0.08982891674492671, + "grad_norm": 1.6420493125915527, + "learning_rate": 9.862852528261202e-05, + "loss": 1.6818, + "step": 275 + }, + { + "epoch": 0.09015556735127189, + "grad_norm": 0.20619602501392365, + "learning_rate": 9.861636396912724e-05, + "loss": 0.7741, + "step": 276 + }, + { + "epoch": 0.09048221795761709, + "grad_norm": 0.23640893399715424, + "learning_rate": 9.860414973066647e-05, + "loss": 0.8814, + "step": 277 + }, + { + "epoch": 0.09080886856396227, + "grad_norm": 0.2566302716732025, + "learning_rate": 9.859188258052644e-05, + "loss": 0.9219, + "step": 278 + }, + { + "epoch": 0.09113551917030746, + "grad_norm": 0.27351245284080505, + "learning_rate": 9.857956253206144e-05, + "loss": 0.9595, + "step": 279 + }, + { + "epoch": 0.09146216977665265, + "grad_norm": 0.27393215894699097, + "learning_rate": 9.856718959868343e-05, + "loss": 0.8775, + "step": 280 + }, + { + "epoch": 0.09178882038299783, + "grad_norm": 0.28952500224113464, + "learning_rate": 9.855476379386186e-05, + "loss": 0.907, + "step": 281 + }, + { + "epoch": 0.09211547098934303, + "grad_norm": 0.27546462416648865, + "learning_rate": 9.854228513112376e-05, + "loss": 0.8872, + "step": 282 + }, + { + "epoch": 0.09244212159568821, + "grad_norm": 0.30211374163627625, + "learning_rate": 9.852975362405372e-05, + "loss": 0.9283, + "step": 283 + }, + { + "epoch": 0.0927687722020334, + "grad_norm": 0.32662448287010193, + "learning_rate": 9.851716928629386e-05, + "loss": 0.9332, + "step": 284 + }, + { + "epoch": 0.09309542280837858, + "grad_norm": 0.32408496737480164, + "learning_rate": 9.85045321315438e-05, + "loss": 0.8625, + "step": 285 + }, + { + "epoch": 0.09342207341472378, + "grad_norm": 0.31861940026283264, + "learning_rate": 9.849184217356064e-05, + "loss": 0.8849, + "step": 286 + }, + { + "epoch": 0.09374872402106896, + "grad_norm": 0.34367239475250244, + "learning_rate": 9.8479099426159e-05, + "loss": 0.922, + "step": 287 + }, + { + "epoch": 0.09407537462741415, + "grad_norm": 0.3816602826118469, + "learning_rate": 9.846630390321095e-05, + "loss": 1.0733, + "step": 288 + }, + { + "epoch": 0.09440202523375935, + "grad_norm": 0.39302805066108704, + "learning_rate": 9.845345561864599e-05, + "loss": 1.0519, + "step": 289 + }, + { + "epoch": 0.09472867584010453, + "grad_norm": 0.3958548903465271, + "learning_rate": 9.844055458645109e-05, + "loss": 1.1534, + "step": 290 + }, + { + "epoch": 0.09505532644644972, + "grad_norm": 0.4945371150970459, + "learning_rate": 9.842760082067067e-05, + "loss": 1.0721, + "step": 291 + }, + { + "epoch": 0.0953819770527949, + "grad_norm": 0.44846147298812866, + "learning_rate": 9.841459433540646e-05, + "loss": 0.9707, + "step": 292 + }, + { + "epoch": 0.0957086276591401, + "grad_norm": 0.5135351419448853, + "learning_rate": 9.84015351448177e-05, + "loss": 1.0283, + "step": 293 + }, + { + "epoch": 0.09603527826548527, + "grad_norm": 0.5842195153236389, + "learning_rate": 9.838842326312089e-05, + "loss": 1.0416, + "step": 294 + }, + { + "epoch": 0.09636192887183047, + "grad_norm": 0.6268463730812073, + "learning_rate": 9.837525870459e-05, + "loss": 1.2132, + "step": 295 + }, + { + "epoch": 0.09668857947817566, + "grad_norm": 0.7157014012336731, + "learning_rate": 9.836204148355625e-05, + "loss": 1.1914, + "step": 296 + }, + { + "epoch": 0.09701523008452084, + "grad_norm": 1.0113928318023682, + "learning_rate": 9.834877161440825e-05, + "loss": 1.5362, + "step": 297 + }, + { + "epoch": 0.09734188069086604, + "grad_norm": 1.2888816595077515, + "learning_rate": 9.833544911159194e-05, + "loss": 1.7309, + "step": 298 + }, + { + "epoch": 0.09766853129721122, + "grad_norm": 1.4757226705551147, + "learning_rate": 9.832207398961047e-05, + "loss": 1.6857, + "step": 299 + }, + { + "epoch": 0.09799518190355641, + "grad_norm": 3.058228015899658, + "learning_rate": 9.830864626302439e-05, + "loss": 1.9936, + "step": 300 + }, + { + "epoch": 0.09832183250990159, + "grad_norm": 0.20743629336357117, + "learning_rate": 9.82951659464514e-05, + "loss": 0.8756, + "step": 301 + }, + { + "epoch": 0.09864848311624679, + "grad_norm": 0.21614302694797516, + "learning_rate": 9.828163305456652e-05, + "loss": 0.8775, + "step": 302 + }, + { + "epoch": 0.09897513372259198, + "grad_norm": 0.21997962892055511, + "learning_rate": 9.826804760210202e-05, + "loss": 0.8532, + "step": 303 + }, + { + "epoch": 0.09930178432893716, + "grad_norm": 0.244362011551857, + "learning_rate": 9.825440960384733e-05, + "loss": 0.9222, + "step": 304 + }, + { + "epoch": 0.09962843493528235, + "grad_norm": 0.23313583433628082, + "learning_rate": 9.824071907464912e-05, + "loss": 0.8339, + "step": 305 + }, + { + "epoch": 0.09995508554162753, + "grad_norm": 0.2632003128528595, + "learning_rate": 9.822697602941123e-05, + "loss": 0.9296, + "step": 306 + }, + { + "epoch": 0.10028173614797273, + "grad_norm": 0.2504887580871582, + "learning_rate": 9.821318048309469e-05, + "loss": 0.9384, + "step": 307 + }, + { + "epoch": 0.10060838675431791, + "grad_norm": 0.2540770471096039, + "learning_rate": 9.819933245071768e-05, + "loss": 0.9079, + "step": 308 + }, + { + "epoch": 0.1009350373606631, + "grad_norm": 0.25880515575408936, + "learning_rate": 9.81854319473555e-05, + "loss": 0.8531, + "step": 309 + }, + { + "epoch": 0.10126168796700828, + "grad_norm": 0.29444649815559387, + "learning_rate": 9.817147898814059e-05, + "loss": 0.9213, + "step": 310 + }, + { + "epoch": 0.10158833857335348, + "grad_norm": 0.31520259380340576, + "learning_rate": 9.815747358826247e-05, + "loss": 0.9258, + "step": 311 + }, + { + "epoch": 0.10191498917969867, + "grad_norm": 0.3317732810974121, + "learning_rate": 9.814341576296777e-05, + "loss": 1.0338, + "step": 312 + }, + { + "epoch": 0.10224163978604385, + "grad_norm": 0.3525193929672241, + "learning_rate": 9.812930552756018e-05, + "loss": 0.9796, + "step": 313 + }, + { + "epoch": 0.10256829039238904, + "grad_norm": 0.3526162803173065, + "learning_rate": 9.811514289740047e-05, + "loss": 0.9216, + "step": 314 + }, + { + "epoch": 0.10289494099873422, + "grad_norm": 0.38859978318214417, + "learning_rate": 9.810092788790643e-05, + "loss": 0.9836, + "step": 315 + }, + { + "epoch": 0.10322159160507942, + "grad_norm": 0.4253045618534088, + "learning_rate": 9.808666051455287e-05, + "loss": 0.9547, + "step": 316 + }, + { + "epoch": 0.1035482422114246, + "grad_norm": 0.46034926176071167, + "learning_rate": 9.807234079287158e-05, + "loss": 1.2128, + "step": 317 + }, + { + "epoch": 0.10387489281776979, + "grad_norm": 0.5099804401397705, + "learning_rate": 9.80579687384514e-05, + "loss": 1.1683, + "step": 318 + }, + { + "epoch": 0.10420154342411499, + "grad_norm": 0.5876259207725525, + "learning_rate": 9.804354436693805e-05, + "loss": 1.3587, + "step": 319 + }, + { + "epoch": 0.10452819403046017, + "grad_norm": 0.596943199634552, + "learning_rate": 9.80290676940343e-05, + "loss": 1.2699, + "step": 320 + }, + { + "epoch": 0.10485484463680536, + "grad_norm": 0.6765027046203613, + "learning_rate": 9.801453873549983e-05, + "loss": 1.3848, + "step": 321 + }, + { + "epoch": 0.10518149524315054, + "grad_norm": 0.8787732124328613, + "learning_rate": 9.799995750715118e-05, + "loss": 1.3471, + "step": 322 + }, + { + "epoch": 0.10550814584949574, + "grad_norm": 0.9832908511161804, + "learning_rate": 9.798532402486186e-05, + "loss": 1.2418, + "step": 323 + }, + { + "epoch": 0.10583479645584092, + "grad_norm": 1.10990309715271, + "learning_rate": 9.797063830456224e-05, + "loss": 1.2758, + "step": 324 + }, + { + "epoch": 0.10616144706218611, + "grad_norm": 1.856461763381958, + "learning_rate": 9.795590036223955e-05, + "loss": 2.2217, + "step": 325 + }, + { + "epoch": 0.1064880976685313, + "grad_norm": 0.22293882071971893, + "learning_rate": 9.794111021393789e-05, + "loss": 0.7823, + "step": 326 + }, + { + "epoch": 0.10681474827487648, + "grad_norm": 0.2518823444843292, + "learning_rate": 9.792626787575817e-05, + "loss": 0.8259, + "step": 327 + }, + { + "epoch": 0.10714139888122168, + "grad_norm": 0.23686620593070984, + "learning_rate": 9.791137336385812e-05, + "loss": 0.8839, + "step": 328 + }, + { + "epoch": 0.10746804948756686, + "grad_norm": 0.28266048431396484, + "learning_rate": 9.789642669445227e-05, + "loss": 0.9046, + "step": 329 + }, + { + "epoch": 0.10779470009391205, + "grad_norm": 0.2630056142807007, + "learning_rate": 9.788142788381197e-05, + "loss": 0.9035, + "step": 330 + }, + { + "epoch": 0.10812135070025723, + "grad_norm": 0.29435524344444275, + "learning_rate": 9.786637694826527e-05, + "loss": 0.9977, + "step": 331 + }, + { + "epoch": 0.10844800130660243, + "grad_norm": 0.2852599322795868, + "learning_rate": 9.7851273904197e-05, + "loss": 0.9155, + "step": 332 + }, + { + "epoch": 0.10877465191294762, + "grad_norm": 0.34938427805900574, + "learning_rate": 9.783611876804869e-05, + "loss": 1.0045, + "step": 333 + }, + { + "epoch": 0.1091013025192928, + "grad_norm": 0.3286244869232178, + "learning_rate": 9.782091155631862e-05, + "loss": 0.9817, + "step": 334 + }, + { + "epoch": 0.109427953125638, + "grad_norm": 0.3370409309864044, + "learning_rate": 9.780565228556171e-05, + "loss": 0.9247, + "step": 335 + }, + { + "epoch": 0.10975460373198317, + "grad_norm": 0.32208555936813354, + "learning_rate": 9.77903409723896e-05, + "loss": 0.9641, + "step": 336 + }, + { + "epoch": 0.11008125433832837, + "grad_norm": 0.31277501583099365, + "learning_rate": 9.777497763347056e-05, + "loss": 1.0156, + "step": 337 + }, + { + "epoch": 0.11040790494467355, + "grad_norm": 0.34272870421409607, + "learning_rate": 9.775956228552951e-05, + "loss": 1.0397, + "step": 338 + }, + { + "epoch": 0.11073455555101874, + "grad_norm": 0.3709607422351837, + "learning_rate": 9.774409494534795e-05, + "loss": 1.0335, + "step": 339 + }, + { + "epoch": 0.11106120615736392, + "grad_norm": 0.3993772864341736, + "learning_rate": 9.772857562976403e-05, + "loss": 1.119, + "step": 340 + }, + { + "epoch": 0.11138785676370912, + "grad_norm": 0.43857407569885254, + "learning_rate": 9.771300435567246e-05, + "loss": 1.0039, + "step": 341 + }, + { + "epoch": 0.11171450737005431, + "grad_norm": 0.44964396953582764, + "learning_rate": 9.769738114002451e-05, + "loss": 1.2138, + "step": 342 + }, + { + "epoch": 0.11204115797639949, + "grad_norm": 0.5546076893806458, + "learning_rate": 9.7681705999828e-05, + "loss": 1.187, + "step": 343 + }, + { + "epoch": 0.11236780858274469, + "grad_norm": 0.565098762512207, + "learning_rate": 9.766597895214729e-05, + "loss": 1.2735, + "step": 344 + }, + { + "epoch": 0.11269445918908987, + "grad_norm": 0.6920917630195618, + "learning_rate": 9.76502000141032e-05, + "loss": 1.3793, + "step": 345 + }, + { + "epoch": 0.11302110979543506, + "grad_norm": 0.9214739799499512, + "learning_rate": 9.76343692028731e-05, + "loss": 1.3204, + "step": 346 + }, + { + "epoch": 0.11334776040178024, + "grad_norm": 1.2229256629943848, + "learning_rate": 9.761848653569078e-05, + "loss": 1.8793, + "step": 347 + }, + { + "epoch": 0.11367441100812543, + "grad_norm": 1.0612778663635254, + "learning_rate": 9.760255202984652e-05, + "loss": 1.3009, + "step": 348 + }, + { + "epoch": 0.11400106161447063, + "grad_norm": 1.46675705909729, + "learning_rate": 9.758656570268703e-05, + "loss": 1.5488, + "step": 349 + }, + { + "epoch": 0.11432771222081581, + "grad_norm": 1.9769386053085327, + "learning_rate": 9.75705275716154e-05, + "loss": 1.8757, + "step": 350 + }, + { + "epoch": 0.114654362827161, + "grad_norm": 0.1892956793308258, + "learning_rate": 9.755443765409113e-05, + "loss": 0.6978, + "step": 351 + }, + { + "epoch": 0.11498101343350618, + "grad_norm": 0.2098046839237213, + "learning_rate": 9.753829596763012e-05, + "loss": 0.9027, + "step": 352 + }, + { + "epoch": 0.11530766403985138, + "grad_norm": 0.24827317893505096, + "learning_rate": 9.75221025298046e-05, + "loss": 0.854, + "step": 353 + }, + { + "epoch": 0.11563431464619656, + "grad_norm": 0.24063189327716827, + "learning_rate": 9.750585735824315e-05, + "loss": 0.8822, + "step": 354 + }, + { + "epoch": 0.11596096525254175, + "grad_norm": 0.2549203038215637, + "learning_rate": 9.748956047063067e-05, + "loss": 0.9234, + "step": 355 + }, + { + "epoch": 0.11628761585888694, + "grad_norm": 0.3004131019115448, + "learning_rate": 9.747321188470835e-05, + "loss": 0.9409, + "step": 356 + }, + { + "epoch": 0.11661426646523212, + "grad_norm": 0.3164829611778259, + "learning_rate": 9.745681161827367e-05, + "loss": 0.9505, + "step": 357 + }, + { + "epoch": 0.11694091707157732, + "grad_norm": 0.2902483344078064, + "learning_rate": 9.744035968918035e-05, + "loss": 0.8985, + "step": 358 + }, + { + "epoch": 0.1172675676779225, + "grad_norm": 0.3035587668418884, + "learning_rate": 9.742385611533838e-05, + "loss": 0.9599, + "step": 359 + }, + { + "epoch": 0.1175942182842677, + "grad_norm": 0.34695568680763245, + "learning_rate": 9.740730091471395e-05, + "loss": 1.0561, + "step": 360 + }, + { + "epoch": 0.11792086889061287, + "grad_norm": 0.3543941080570221, + "learning_rate": 9.739069410532949e-05, + "loss": 0.8749, + "step": 361 + }, + { + "epoch": 0.11824751949695807, + "grad_norm": 0.3373425006866455, + "learning_rate": 9.737403570526353e-05, + "loss": 0.9717, + "step": 362 + }, + { + "epoch": 0.11857417010330325, + "grad_norm": 0.37677595019340515, + "learning_rate": 9.735732573265086e-05, + "loss": 1.0545, + "step": 363 + }, + { + "epoch": 0.11890082070964844, + "grad_norm": 0.4091393053531647, + "learning_rate": 9.734056420568236e-05, + "loss": 0.9264, + "step": 364 + }, + { + "epoch": 0.11922747131599364, + "grad_norm": 0.4432515799999237, + "learning_rate": 9.732375114260503e-05, + "loss": 1.0886, + "step": 365 + }, + { + "epoch": 0.11955412192233882, + "grad_norm": 0.4816678762435913, + "learning_rate": 9.7306886561722e-05, + "loss": 1.0472, + "step": 366 + }, + { + "epoch": 0.11988077252868401, + "grad_norm": 0.5049456357955933, + "learning_rate": 9.728997048139246e-05, + "loss": 1.0874, + "step": 367 + }, + { + "epoch": 0.12020742313502919, + "grad_norm": 0.5210486650466919, + "learning_rate": 9.727300292003168e-05, + "loss": 1.2298, + "step": 368 + }, + { + "epoch": 0.12053407374137438, + "grad_norm": 0.5923225283622742, + "learning_rate": 9.725598389611095e-05, + "loss": 1.1843, + "step": 369 + }, + { + "epoch": 0.12086072434771956, + "grad_norm": 0.6272051334381104, + "learning_rate": 9.723891342815764e-05, + "loss": 1.2197, + "step": 370 + }, + { + "epoch": 0.12118737495406476, + "grad_norm": 0.8260605931282043, + "learning_rate": 9.722179153475504e-05, + "loss": 1.4288, + "step": 371 + }, + { + "epoch": 0.12151402556040995, + "grad_norm": 0.9338024854660034, + "learning_rate": 9.720461823454248e-05, + "loss": 1.544, + "step": 372 + }, + { + "epoch": 0.12184067616675513, + "grad_norm": 1.0130958557128906, + "learning_rate": 9.718739354621527e-05, + "loss": 1.4905, + "step": 373 + }, + { + "epoch": 0.12216732677310033, + "grad_norm": 1.6334174871444702, + "learning_rate": 9.717011748852459e-05, + "loss": 1.7708, + "step": 374 + }, + { + "epoch": 0.1224939773794455, + "grad_norm": 1.7457410097122192, + "learning_rate": 9.715279008027759e-05, + "loss": 2.0156, + "step": 375 + }, + { + "epoch": 0.1228206279857907, + "grad_norm": 0.1906844973564148, + "learning_rate": 9.713541134033733e-05, + "loss": 0.7623, + "step": 376 + }, + { + "epoch": 0.12314727859213588, + "grad_norm": 0.20964844524860382, + "learning_rate": 9.711798128762273e-05, + "loss": 0.7831, + "step": 377 + }, + { + "epoch": 0.12347392919848107, + "grad_norm": 0.21881312131881714, + "learning_rate": 9.710049994110859e-05, + "loss": 0.8922, + "step": 378 + }, + { + "epoch": 0.12380057980482627, + "grad_norm": 0.22225263714790344, + "learning_rate": 9.708296731982551e-05, + "loss": 0.8802, + "step": 379 + }, + { + "epoch": 0.12412723041117145, + "grad_norm": 0.2540784478187561, + "learning_rate": 9.706538344285996e-05, + "loss": 0.9593, + "step": 380 + }, + { + "epoch": 0.12445388101751664, + "grad_norm": 0.26809781789779663, + "learning_rate": 9.704774832935415e-05, + "loss": 0.8706, + "step": 381 + }, + { + "epoch": 0.12478053162386182, + "grad_norm": 0.25871115922927856, + "learning_rate": 9.703006199850614e-05, + "loss": 0.8731, + "step": 382 + }, + { + "epoch": 0.125107182230207, + "grad_norm": 0.295626699924469, + "learning_rate": 9.701232446956969e-05, + "loss": 0.9022, + "step": 383 + }, + { + "epoch": 0.1254338328365522, + "grad_norm": 0.27636104822158813, + "learning_rate": 9.699453576185429e-05, + "loss": 0.8455, + "step": 384 + }, + { + "epoch": 0.1257604834428974, + "grad_norm": 0.30843478441238403, + "learning_rate": 9.697669589472521e-05, + "loss": 0.9653, + "step": 385 + }, + { + "epoch": 0.12608713404924257, + "grad_norm": 0.321384996175766, + "learning_rate": 9.695880488760333e-05, + "loss": 0.9124, + "step": 386 + }, + { + "epoch": 0.12641378465558778, + "grad_norm": 0.33451342582702637, + "learning_rate": 9.69408627599653e-05, + "loss": 1.0844, + "step": 387 + }, + { + "epoch": 0.12674043526193296, + "grad_norm": 0.3522025942802429, + "learning_rate": 9.692286953134328e-05, + "loss": 1.0568, + "step": 388 + }, + { + "epoch": 0.12706708586827814, + "grad_norm": 0.37441399693489075, + "learning_rate": 9.690482522132523e-05, + "loss": 0.989, + "step": 389 + }, + { + "epoch": 0.12739373647462332, + "grad_norm": 0.39532291889190674, + "learning_rate": 9.688672984955455e-05, + "loss": 1.1478, + "step": 390 + }, + { + "epoch": 0.12772038708096853, + "grad_norm": 0.39523276686668396, + "learning_rate": 9.686858343573037e-05, + "loss": 0.9581, + "step": 391 + }, + { + "epoch": 0.1280470376873137, + "grad_norm": 0.4399612247943878, + "learning_rate": 9.685038599960731e-05, + "loss": 1.0984, + "step": 392 + }, + { + "epoch": 0.1283736882936589, + "grad_norm": 0.493168443441391, + "learning_rate": 9.683213756099555e-05, + "loss": 1.1655, + "step": 393 + }, + { + "epoch": 0.1287003389000041, + "grad_norm": 0.5002815127372742, + "learning_rate": 9.681383813976077e-05, + "loss": 1.1861, + "step": 394 + }, + { + "epoch": 0.12902698950634928, + "grad_norm": 0.6230116486549377, + "learning_rate": 9.679548775582421e-05, + "loss": 1.1512, + "step": 395 + }, + { + "epoch": 0.12935364011269446, + "grad_norm": 0.660778284072876, + "learning_rate": 9.67770864291625e-05, + "loss": 1.3088, + "step": 396 + }, + { + "epoch": 0.12968029071903964, + "grad_norm": 0.8624020218849182, + "learning_rate": 9.675863417980784e-05, + "loss": 1.4296, + "step": 397 + }, + { + "epoch": 0.13000694132538484, + "grad_norm": 1.1575559377670288, + "learning_rate": 9.674013102784776e-05, + "loss": 1.5907, + "step": 398 + }, + { + "epoch": 0.13033359193173003, + "grad_norm": 1.721889615058899, + "learning_rate": 9.672157699342526e-05, + "loss": 1.3368, + "step": 399 + }, + { + "epoch": 0.1306602425380752, + "grad_norm": 1.968307614326477, + "learning_rate": 9.670297209673871e-05, + "loss": 1.9616, + "step": 400 + }, + { + "epoch": 0.1309868931444204, + "grad_norm": 0.19404010474681854, + "learning_rate": 9.668431635804189e-05, + "loss": 0.7653, + "step": 401 + }, + { + "epoch": 0.1313135437507656, + "grad_norm": 0.20838405191898346, + "learning_rate": 9.66656097976439e-05, + "loss": 0.8618, + "step": 402 + }, + { + "epoch": 0.13164019435711077, + "grad_norm": 0.24252377450466156, + "learning_rate": 9.664685243590911e-05, + "loss": 0.9215, + "step": 403 + }, + { + "epoch": 0.13196684496345595, + "grad_norm": 0.24695122241973877, + "learning_rate": 9.662804429325732e-05, + "loss": 0.8518, + "step": 404 + }, + { + "epoch": 0.13229349556980116, + "grad_norm": 0.24052157998085022, + "learning_rate": 9.660918539016348e-05, + "loss": 0.8623, + "step": 405 + }, + { + "epoch": 0.13262014617614634, + "grad_norm": 0.25183114409446716, + "learning_rate": 9.659027574715789e-05, + "loss": 0.8884, + "step": 406 + }, + { + "epoch": 0.13294679678249152, + "grad_norm": 0.26628735661506653, + "learning_rate": 9.657131538482605e-05, + "loss": 0.8609, + "step": 407 + }, + { + "epoch": 0.1332734473888367, + "grad_norm": 0.2659852206707001, + "learning_rate": 9.655230432380869e-05, + "loss": 0.949, + "step": 408 + }, + { + "epoch": 0.1336000979951819, + "grad_norm": 0.2931317687034607, + "learning_rate": 9.653324258480167e-05, + "loss": 0.8961, + "step": 409 + }, + { + "epoch": 0.1339267486015271, + "grad_norm": 0.29204240441322327, + "learning_rate": 9.651413018855613e-05, + "loss": 0.9064, + "step": 410 + }, + { + "epoch": 0.13425339920787227, + "grad_norm": 0.30939286947250366, + "learning_rate": 9.649496715587828e-05, + "loss": 0.8333, + "step": 411 + }, + { + "epoch": 0.13458004981421748, + "grad_norm": 0.34340524673461914, + "learning_rate": 9.647575350762946e-05, + "loss": 0.9585, + "step": 412 + }, + { + "epoch": 0.13490670042056266, + "grad_norm": 0.3608761429786682, + "learning_rate": 9.645648926472612e-05, + "loss": 0.9619, + "step": 413 + }, + { + "epoch": 0.13523335102690784, + "grad_norm": 0.37178945541381836, + "learning_rate": 9.643717444813982e-05, + "loss": 1.0787, + "step": 414 + }, + { + "epoch": 0.13556000163325302, + "grad_norm": 0.38620254397392273, + "learning_rate": 9.641780907889712e-05, + "loss": 1.1331, + "step": 415 + }, + { + "epoch": 0.13588665223959823, + "grad_norm": 0.38686099648475647, + "learning_rate": 9.639839317807963e-05, + "loss": 0.9718, + "step": 416 + }, + { + "epoch": 0.1362133028459434, + "grad_norm": 0.4547607898712158, + "learning_rate": 9.637892676682403e-05, + "loss": 1.2155, + "step": 417 + }, + { + "epoch": 0.1365399534522886, + "grad_norm": 0.45713773369789124, + "learning_rate": 9.635940986632188e-05, + "loss": 0.9978, + "step": 418 + }, + { + "epoch": 0.1368666040586338, + "grad_norm": 0.572322428226471, + "learning_rate": 9.633984249781977e-05, + "loss": 1.3199, + "step": 419 + }, + { + "epoch": 0.13719325466497898, + "grad_norm": 0.5989283919334412, + "learning_rate": 9.632022468261927e-05, + "loss": 1.2575, + "step": 420 + }, + { + "epoch": 0.13751990527132416, + "grad_norm": 0.6520776152610779, + "learning_rate": 9.630055644207677e-05, + "loss": 1.119, + "step": 421 + }, + { + "epoch": 0.13784655587766934, + "grad_norm": 0.9217725992202759, + "learning_rate": 9.628083779760361e-05, + "loss": 1.3642, + "step": 422 + }, + { + "epoch": 0.13817320648401454, + "grad_norm": 1.1913374662399292, + "learning_rate": 9.6261068770666e-05, + "loss": 1.3737, + "step": 423 + }, + { + "epoch": 0.13849985709035972, + "grad_norm": 1.2535861730575562, + "learning_rate": 9.6241249382785e-05, + "loss": 1.5402, + "step": 424 + }, + { + "epoch": 0.1388265076967049, + "grad_norm": 2.2817628383636475, + "learning_rate": 9.622137965553647e-05, + "loss": 1.8676, + "step": 425 + }, + { + "epoch": 0.1391531583030501, + "grad_norm": 0.17584745585918427, + "learning_rate": 9.62014596105511e-05, + "loss": 0.7117, + "step": 426 + }, + { + "epoch": 0.1394798089093953, + "grad_norm": 0.22812600433826447, + "learning_rate": 9.618148926951434e-05, + "loss": 0.8282, + "step": 427 + }, + { + "epoch": 0.13980645951574047, + "grad_norm": 0.24293436110019684, + "learning_rate": 9.616146865416638e-05, + "loss": 0.8739, + "step": 428 + }, + { + "epoch": 0.14013311012208565, + "grad_norm": 0.24836447834968567, + "learning_rate": 9.614139778630219e-05, + "loss": 0.7628, + "step": 429 + }, + { + "epoch": 0.14045976072843086, + "grad_norm": 0.2706006169319153, + "learning_rate": 9.612127668777139e-05, + "loss": 0.8577, + "step": 430 + }, + { + "epoch": 0.14078641133477604, + "grad_norm": 0.2763189673423767, + "learning_rate": 9.61011053804783e-05, + "loss": 0.9749, + "step": 431 + }, + { + "epoch": 0.14111306194112122, + "grad_norm": 0.2643950581550598, + "learning_rate": 9.608088388638193e-05, + "loss": 0.8468, + "step": 432 + }, + { + "epoch": 0.14143971254746643, + "grad_norm": 0.2801801562309265, + "learning_rate": 9.606061222749587e-05, + "loss": 0.8929, + "step": 433 + }, + { + "epoch": 0.1417663631538116, + "grad_norm": 0.3056126832962036, + "learning_rate": 9.604029042588838e-05, + "loss": 0.8787, + "step": 434 + }, + { + "epoch": 0.1420930137601568, + "grad_norm": 0.3168674409389496, + "learning_rate": 9.601991850368224e-05, + "loss": 0.933, + "step": 435 + }, + { + "epoch": 0.14241966436650197, + "grad_norm": 0.31766176223754883, + "learning_rate": 9.599949648305486e-05, + "loss": 1.0011, + "step": 436 + }, + { + "epoch": 0.14274631497284718, + "grad_norm": 0.35657021403312683, + "learning_rate": 9.597902438623814e-05, + "loss": 1.0058, + "step": 437 + }, + { + "epoch": 0.14307296557919236, + "grad_norm": 0.4108402132987976, + "learning_rate": 9.59585022355185e-05, + "loss": 1.2417, + "step": 438 + }, + { + "epoch": 0.14339961618553754, + "grad_norm": 0.4023765027523041, + "learning_rate": 9.593793005323689e-05, + "loss": 1.031, + "step": 439 + }, + { + "epoch": 0.14372626679188275, + "grad_norm": 0.3895339369773865, + "learning_rate": 9.591730786178866e-05, + "loss": 1.0337, + "step": 440 + }, + { + "epoch": 0.14405291739822793, + "grad_norm": 0.42880305647850037, + "learning_rate": 9.589663568362368e-05, + "loss": 1.0338, + "step": 441 + }, + { + "epoch": 0.1443795680045731, + "grad_norm": 0.4699881076812744, + "learning_rate": 9.587591354124616e-05, + "loss": 1.0881, + "step": 442 + }, + { + "epoch": 0.14470621861091829, + "grad_norm": 0.5102139711380005, + "learning_rate": 9.585514145721475e-05, + "loss": 1.0901, + "step": 443 + }, + { + "epoch": 0.1450328692172635, + "grad_norm": 0.5716970562934875, + "learning_rate": 9.583431945414245e-05, + "loss": 1.1766, + "step": 444 + }, + { + "epoch": 0.14535951982360867, + "grad_norm": 0.6934454441070557, + "learning_rate": 9.581344755469663e-05, + "loss": 1.2836, + "step": 445 + }, + { + "epoch": 0.14568617042995385, + "grad_norm": 0.8125836849212646, + "learning_rate": 9.579252578159892e-05, + "loss": 1.2623, + "step": 446 + }, + { + "epoch": 0.14601282103629906, + "grad_norm": 1.278208613395691, + "learning_rate": 9.57715541576253e-05, + "loss": 1.5706, + "step": 447 + }, + { + "epoch": 0.14633947164264424, + "grad_norm": 1.3957326412200928, + "learning_rate": 9.575053270560598e-05, + "loss": 1.675, + "step": 448 + }, + { + "epoch": 0.14666612224898942, + "grad_norm": 1.5048333406448364, + "learning_rate": 9.572946144842547e-05, + "loss": 1.4975, + "step": 449 + }, + { + "epoch": 0.1469927728553346, + "grad_norm": 2.2100048065185547, + "learning_rate": 9.570834040902243e-05, + "loss": 2.1886, + "step": 450 + }, + { + "epoch": 0.1473194234616798, + "grad_norm": 0.1943780481815338, + "learning_rate": 9.568716961038977e-05, + "loss": 0.7153, + "step": 451 + }, + { + "epoch": 0.147646074068025, + "grad_norm": 0.20413154363632202, + "learning_rate": 9.566594907557452e-05, + "loss": 0.8099, + "step": 452 + }, + { + "epoch": 0.14797272467437017, + "grad_norm": 0.24154390394687653, + "learning_rate": 9.564467882767787e-05, + "loss": 0.9133, + "step": 453 + }, + { + "epoch": 0.14829937528071538, + "grad_norm": 0.2548098564147949, + "learning_rate": 9.562335888985516e-05, + "loss": 0.8946, + "step": 454 + }, + { + "epoch": 0.14862602588706056, + "grad_norm": 0.2680473029613495, + "learning_rate": 9.560198928531581e-05, + "loss": 0.8944, + "step": 455 + }, + { + "epoch": 0.14895267649340574, + "grad_norm": 0.2628840506076813, + "learning_rate": 9.55805700373233e-05, + "loss": 0.8787, + "step": 456 + }, + { + "epoch": 0.14927932709975092, + "grad_norm": 0.2785063087940216, + "learning_rate": 9.55591011691951e-05, + "loss": 0.8681, + "step": 457 + }, + { + "epoch": 0.14960597770609613, + "grad_norm": 0.28061676025390625, + "learning_rate": 9.553758270430284e-05, + "loss": 0.8655, + "step": 458 + }, + { + "epoch": 0.1499326283124413, + "grad_norm": 0.29894521832466125, + "learning_rate": 9.551601466607197e-05, + "loss": 0.9433, + "step": 459 + }, + { + "epoch": 0.1502592789187865, + "grad_norm": 0.292904794216156, + "learning_rate": 9.549439707798203e-05, + "loss": 0.8323, + "step": 460 + }, + { + "epoch": 0.15058592952513167, + "grad_norm": 0.3242059648036957, + "learning_rate": 9.547272996356646e-05, + "loss": 0.9685, + "step": 461 + }, + { + "epoch": 0.15091258013147688, + "grad_norm": 0.334773987531662, + "learning_rate": 9.545101334641262e-05, + "loss": 0.946, + "step": 462 + }, + { + "epoch": 0.15123923073782206, + "grad_norm": 0.3734418749809265, + "learning_rate": 9.542924725016173e-05, + "loss": 1.0552, + "step": 463 + }, + { + "epoch": 0.15156588134416724, + "grad_norm": 0.41196638345718384, + "learning_rate": 9.540743169850893e-05, + "loss": 1.1502, + "step": 464 + }, + { + "epoch": 0.15189253195051244, + "grad_norm": 0.4083957076072693, + "learning_rate": 9.538556671520316e-05, + "loss": 1.0984, + "step": 465 + }, + { + "epoch": 0.15221918255685762, + "grad_norm": 0.4450596570968628, + "learning_rate": 9.536365232404718e-05, + "loss": 1.1297, + "step": 466 + }, + { + "epoch": 0.1525458331632028, + "grad_norm": 0.4478522837162018, + "learning_rate": 9.534168854889754e-05, + "loss": 1.125, + "step": 467 + }, + { + "epoch": 0.15287248376954798, + "grad_norm": 0.5305338501930237, + "learning_rate": 9.531967541366452e-05, + "loss": 1.1453, + "step": 468 + }, + { + "epoch": 0.1531991343758932, + "grad_norm": 0.5751011967658997, + "learning_rate": 9.529761294231221e-05, + "loss": 1.2348, + "step": 469 + }, + { + "epoch": 0.15352578498223837, + "grad_norm": 0.6348747611045837, + "learning_rate": 9.527550115885833e-05, + "loss": 1.2891, + "step": 470 + }, + { + "epoch": 0.15385243558858355, + "grad_norm": 0.7538950443267822, + "learning_rate": 9.525334008737435e-05, + "loss": 1.4452, + "step": 471 + }, + { + "epoch": 0.15417908619492876, + "grad_norm": 0.9422009587287903, + "learning_rate": 9.523112975198532e-05, + "loss": 1.3917, + "step": 472 + }, + { + "epoch": 0.15450573680127394, + "grad_norm": 1.307346224784851, + "learning_rate": 9.520887017686997e-05, + "loss": 1.5785, + "step": 473 + }, + { + "epoch": 0.15483238740761912, + "grad_norm": 1.255781650543213, + "learning_rate": 9.518656138626063e-05, + "loss": 1.7596, + "step": 474 + }, + { + "epoch": 0.1551590380139643, + "grad_norm": 1.9935503005981445, + "learning_rate": 9.51642034044432e-05, + "loss": 2.3361, + "step": 475 + }, + { + "epoch": 0.1554856886203095, + "grad_norm": 0.23217955231666565, + "learning_rate": 9.514179625575715e-05, + "loss": 0.7319, + "step": 476 + }, + { + "epoch": 0.1558123392266547, + "grad_norm": 0.23310193419456482, + "learning_rate": 9.511933996459544e-05, + "loss": 0.8677, + "step": 477 + }, + { + "epoch": 0.15613898983299987, + "grad_norm": 0.22472940385341644, + "learning_rate": 9.509683455540452e-05, + "loss": 0.8419, + "step": 478 + }, + { + "epoch": 0.15646564043934508, + "grad_norm": 0.23470456898212433, + "learning_rate": 9.507428005268438e-05, + "loss": 0.921, + "step": 479 + }, + { + "epoch": 0.15679229104569026, + "grad_norm": 0.2441803216934204, + "learning_rate": 9.505167648098837e-05, + "loss": 0.9128, + "step": 480 + }, + { + "epoch": 0.15711894165203544, + "grad_norm": 0.2495880424976349, + "learning_rate": 9.502902386492332e-05, + "loss": 0.8631, + "step": 481 + }, + { + "epoch": 0.15744559225838062, + "grad_norm": 0.2750225365161896, + "learning_rate": 9.500632222914943e-05, + "loss": 0.8532, + "step": 482 + }, + { + "epoch": 0.15777224286472583, + "grad_norm": 0.29554665088653564, + "learning_rate": 9.498357159838025e-05, + "loss": 0.9093, + "step": 483 + }, + { + "epoch": 0.158098893471071, + "grad_norm": 0.26975587010383606, + "learning_rate": 9.496077199738267e-05, + "loss": 0.8197, + "step": 484 + }, + { + "epoch": 0.15842554407741619, + "grad_norm": 0.31354406476020813, + "learning_rate": 9.493792345097693e-05, + "loss": 0.8923, + "step": 485 + }, + { + "epoch": 0.1587521946837614, + "grad_norm": 0.29501739144325256, + "learning_rate": 9.49150259840365e-05, + "loss": 0.8918, + "step": 486 + }, + { + "epoch": 0.15907884529010657, + "grad_norm": 0.3456759452819824, + "learning_rate": 9.489207962148814e-05, + "loss": 0.9493, + "step": 487 + }, + { + "epoch": 0.15940549589645175, + "grad_norm": 0.3523378372192383, + "learning_rate": 9.486908438831181e-05, + "loss": 0.9785, + "step": 488 + }, + { + "epoch": 0.15973214650279693, + "grad_norm": 0.38017013669013977, + "learning_rate": 9.484604030954072e-05, + "loss": 1.0719, + "step": 489 + }, + { + "epoch": 0.16005879710914214, + "grad_norm": 0.39783334732055664, + "learning_rate": 9.482294741026119e-05, + "loss": 0.9972, + "step": 490 + }, + { + "epoch": 0.16038544771548732, + "grad_norm": 0.4237908124923706, + "learning_rate": 9.479980571561274e-05, + "loss": 1.0702, + "step": 491 + }, + { + "epoch": 0.1607120983218325, + "grad_norm": 0.4826822876930237, + "learning_rate": 9.4776615250788e-05, + "loss": 1.0767, + "step": 492 + }, + { + "epoch": 0.1610387489281777, + "grad_norm": 0.47704166173934937, + "learning_rate": 9.475337604103266e-05, + "loss": 0.9948, + "step": 493 + }, + { + "epoch": 0.1613653995345229, + "grad_norm": 0.520698070526123, + "learning_rate": 9.47300881116455e-05, + "loss": 1.0778, + "step": 494 + }, + { + "epoch": 0.16169205014086807, + "grad_norm": 0.5984883308410645, + "learning_rate": 9.470675148797836e-05, + "loss": 1.2363, + "step": 495 + }, + { + "epoch": 0.16201870074721325, + "grad_norm": 0.788203775882721, + "learning_rate": 9.468336619543605e-05, + "loss": 1.3552, + "step": 496 + }, + { + "epoch": 0.16234535135355846, + "grad_norm": 0.9296058416366577, + "learning_rate": 9.465993225947638e-05, + "loss": 1.3718, + "step": 497 + }, + { + "epoch": 0.16267200195990364, + "grad_norm": 1.005128264427185, + "learning_rate": 9.463644970561009e-05, + "loss": 1.2159, + "step": 498 + }, + { + "epoch": 0.16299865256624882, + "grad_norm": 1.1713541746139526, + "learning_rate": 9.461291855940091e-05, + "loss": 1.4884, + "step": 499 + }, + { + "epoch": 0.16332530317259403, + "grad_norm": 1.967857003211975, + "learning_rate": 9.458933884646541e-05, + "loss": 1.4611, + "step": 500 + }, + { + "epoch": 0.1636519537789392, + "grad_norm": 0.18524925410747528, + "learning_rate": 9.456571059247303e-05, + "loss": 0.7829, + "step": 501 + }, + { + "epoch": 0.1639786043852844, + "grad_norm": 0.2153564691543579, + "learning_rate": 9.45420338231461e-05, + "loss": 0.8495, + "step": 502 + }, + { + "epoch": 0.16430525499162957, + "grad_norm": 0.2383943796157837, + "learning_rate": 9.451830856425973e-05, + "loss": 0.7869, + "step": 503 + }, + { + "epoch": 0.16463190559797478, + "grad_norm": 0.24494485557079315, + "learning_rate": 9.449453484164181e-05, + "loss": 0.8712, + "step": 504 + }, + { + "epoch": 0.16495855620431996, + "grad_norm": 0.2590121626853943, + "learning_rate": 9.4470712681173e-05, + "loss": 0.8634, + "step": 505 + }, + { + "epoch": 0.16528520681066514, + "grad_norm": 0.2706097662448883, + "learning_rate": 9.444684210878671e-05, + "loss": 0.8342, + "step": 506 + }, + { + "epoch": 0.16561185741701034, + "grad_norm": 0.2868553698062897, + "learning_rate": 9.442292315046903e-05, + "loss": 0.9175, + "step": 507 + }, + { + "epoch": 0.16593850802335552, + "grad_norm": 0.29883742332458496, + "learning_rate": 9.439895583225873e-05, + "loss": 0.9143, + "step": 508 + }, + { + "epoch": 0.1662651586297007, + "grad_norm": 0.31189242005348206, + "learning_rate": 9.437494018024721e-05, + "loss": 0.9101, + "step": 509 + }, + { + "epoch": 0.16659180923604588, + "grad_norm": 0.2974110245704651, + "learning_rate": 9.435087622057855e-05, + "loss": 0.8765, + "step": 510 + }, + { + "epoch": 0.1669184598423911, + "grad_norm": 0.3233146071434021, + "learning_rate": 9.43267639794493e-05, + "loss": 0.9817, + "step": 511 + }, + { + "epoch": 0.16724511044873627, + "grad_norm": 0.33368226885795593, + "learning_rate": 9.430260348310869e-05, + "loss": 0.968, + "step": 512 + }, + { + "epoch": 0.16757176105508145, + "grad_norm": 0.37103214859962463, + "learning_rate": 9.427839475785844e-05, + "loss": 0.9822, + "step": 513 + }, + { + "epoch": 0.16789841166142663, + "grad_norm": 0.36908629536628723, + "learning_rate": 9.425413783005272e-05, + "loss": 1.0336, + "step": 514 + }, + { + "epoch": 0.16822506226777184, + "grad_norm": 0.41780969500541687, + "learning_rate": 9.422983272609828e-05, + "loss": 0.9956, + "step": 515 + }, + { + "epoch": 0.16855171287411702, + "grad_norm": 0.4576914310455322, + "learning_rate": 9.420547947245422e-05, + "loss": 0.9278, + "step": 516 + }, + { + "epoch": 0.1688783634804622, + "grad_norm": 0.49422821402549744, + "learning_rate": 9.418107809563208e-05, + "loss": 1.1263, + "step": 517 + }, + { + "epoch": 0.1692050140868074, + "grad_norm": 0.5176439881324768, + "learning_rate": 9.415662862219585e-05, + "loss": 1.1614, + "step": 518 + }, + { + "epoch": 0.1695316646931526, + "grad_norm": 0.48677173256874084, + "learning_rate": 9.41321310787618e-05, + "loss": 0.9933, + "step": 519 + }, + { + "epoch": 0.16985831529949777, + "grad_norm": 0.6720970273017883, + "learning_rate": 9.410758549199856e-05, + "loss": 1.3328, + "step": 520 + }, + { + "epoch": 0.17018496590584295, + "grad_norm": 0.7612361311912537, + "learning_rate": 9.408299188862709e-05, + "loss": 1.2837, + "step": 521 + }, + { + "epoch": 0.17051161651218816, + "grad_norm": 0.8276899456977844, + "learning_rate": 9.405835029542055e-05, + "loss": 1.5145, + "step": 522 + }, + { + "epoch": 0.17083826711853334, + "grad_norm": 1.2721513509750366, + "learning_rate": 9.403366073920442e-05, + "loss": 1.6956, + "step": 523 + }, + { + "epoch": 0.17116491772487852, + "grad_norm": 1.5282410383224487, + "learning_rate": 9.400892324685636e-05, + "loss": 2.0885, + "step": 524 + }, + { + "epoch": 0.17149156833122373, + "grad_norm": 1.656018853187561, + "learning_rate": 9.398413784530621e-05, + "loss": 1.6356, + "step": 525 + }, + { + "epoch": 0.1718182189375689, + "grad_norm": 0.18846455216407776, + "learning_rate": 9.395930456153597e-05, + "loss": 0.867, + "step": 526 + }, + { + "epoch": 0.1721448695439141, + "grad_norm": 0.21775482594966888, + "learning_rate": 9.393442342257977e-05, + "loss": 0.7604, + "step": 527 + }, + { + "epoch": 0.17247152015025927, + "grad_norm": 0.22779767215251923, + "learning_rate": 9.390949445552383e-05, + "loss": 0.7736, + "step": 528 + }, + { + "epoch": 0.17279817075660447, + "grad_norm": 0.23663291335105896, + "learning_rate": 9.388451768750644e-05, + "loss": 0.8514, + "step": 529 + }, + { + "epoch": 0.17312482136294965, + "grad_norm": 0.25589585304260254, + "learning_rate": 9.385949314571792e-05, + "loss": 0.8874, + "step": 530 + }, + { + "epoch": 0.17345147196929483, + "grad_norm": 0.24930959939956665, + "learning_rate": 9.383442085740062e-05, + "loss": 0.8109, + "step": 531 + }, + { + "epoch": 0.17377812257564004, + "grad_norm": 0.2678419053554535, + "learning_rate": 9.380930084984884e-05, + "loss": 0.8583, + "step": 532 + }, + { + "epoch": 0.17410477318198522, + "grad_norm": 0.2837614417076111, + "learning_rate": 9.378413315040887e-05, + "loss": 0.9681, + "step": 533 + }, + { + "epoch": 0.1744314237883304, + "grad_norm": 0.2679899036884308, + "learning_rate": 9.375891778647885e-05, + "loss": 0.8959, + "step": 534 + }, + { + "epoch": 0.17475807439467558, + "grad_norm": 0.27939561009407043, + "learning_rate": 9.373365478550886e-05, + "loss": 0.9481, + "step": 535 + }, + { + "epoch": 0.1750847250010208, + "grad_norm": 0.28748729825019836, + "learning_rate": 9.370834417500085e-05, + "loss": 0.8571, + "step": 536 + }, + { + "epoch": 0.17541137560736597, + "grad_norm": 0.3106549084186554, + "learning_rate": 9.368298598250856e-05, + "loss": 0.878, + "step": 537 + }, + { + "epoch": 0.17573802621371115, + "grad_norm": 0.34394335746765137, + "learning_rate": 9.365758023563753e-05, + "loss": 0.9116, + "step": 538 + }, + { + "epoch": 0.17606467682005636, + "grad_norm": 0.3183876872062683, + "learning_rate": 9.363212696204511e-05, + "loss": 0.8952, + "step": 539 + }, + { + "epoch": 0.17639132742640154, + "grad_norm": 0.39337795972824097, + "learning_rate": 9.360662618944033e-05, + "loss": 1.0152, + "step": 540 + }, + { + "epoch": 0.17671797803274672, + "grad_norm": 0.4016577899456024, + "learning_rate": 9.358107794558401e-05, + "loss": 0.9301, + "step": 541 + }, + { + "epoch": 0.1770446286390919, + "grad_norm": 0.42000070214271545, + "learning_rate": 9.355548225828858e-05, + "loss": 1.111, + "step": 542 + }, + { + "epoch": 0.1773712792454371, + "grad_norm": 0.5311472415924072, + "learning_rate": 9.352983915541813e-05, + "loss": 1.1984, + "step": 543 + }, + { + "epoch": 0.1776979298517823, + "grad_norm": 0.5248878598213196, + "learning_rate": 9.350414866488837e-05, + "loss": 1.1448, + "step": 544 + }, + { + "epoch": 0.17802458045812747, + "grad_norm": 0.7028717994689941, + "learning_rate": 9.347841081466662e-05, + "loss": 1.1947, + "step": 545 + }, + { + "epoch": 0.17835123106447268, + "grad_norm": 0.7405298948287964, + "learning_rate": 9.345262563277173e-05, + "loss": 1.3281, + "step": 546 + }, + { + "epoch": 0.17867788167081786, + "grad_norm": 1.0811318159103394, + "learning_rate": 9.342679314727408e-05, + "loss": 1.2947, + "step": 547 + }, + { + "epoch": 0.17900453227716304, + "grad_norm": 1.2324838638305664, + "learning_rate": 9.340091338629556e-05, + "loss": 1.4522, + "step": 548 + }, + { + "epoch": 0.17933118288350822, + "grad_norm": 1.280505657196045, + "learning_rate": 9.337498637800952e-05, + "loss": 1.4853, + "step": 549 + }, + { + "epoch": 0.17965783348985342, + "grad_norm": 1.5319606065750122, + "learning_rate": 9.334901215064075e-05, + "loss": 1.8064, + "step": 550 + }, + { + "epoch": 0.1799844840961986, + "grad_norm": 0.19149908423423767, + "learning_rate": 9.332299073246543e-05, + "loss": 0.6917, + "step": 551 + }, + { + "epoch": 0.18031113470254378, + "grad_norm": 0.24123403429985046, + "learning_rate": 9.329692215181111e-05, + "loss": 0.8778, + "step": 552 + }, + { + "epoch": 0.180637785308889, + "grad_norm": 0.2261694222688675, + "learning_rate": 9.32708064370567e-05, + "loss": 0.7859, + "step": 553 + }, + { + "epoch": 0.18096443591523417, + "grad_norm": 0.2447603940963745, + "learning_rate": 9.32446436166324e-05, + "loss": 0.8404, + "step": 554 + }, + { + "epoch": 0.18129108652157935, + "grad_norm": 0.25092342495918274, + "learning_rate": 9.321843371901975e-05, + "loss": 0.8508, + "step": 555 + }, + { + "epoch": 0.18161773712792453, + "grad_norm": 0.2693803906440735, + "learning_rate": 9.319217677275142e-05, + "loss": 0.8603, + "step": 556 + }, + { + "epoch": 0.18194438773426974, + "grad_norm": 0.26989585161209106, + "learning_rate": 9.316587280641142e-05, + "loss": 0.819, + "step": 557 + }, + { + "epoch": 0.18227103834061492, + "grad_norm": 0.3000803589820862, + "learning_rate": 9.313952184863489e-05, + "loss": 0.8996, + "step": 558 + }, + { + "epoch": 0.1825976889469601, + "grad_norm": 0.27644068002700806, + "learning_rate": 9.311312392810813e-05, + "loss": 0.9319, + "step": 559 + }, + { + "epoch": 0.1829243395533053, + "grad_norm": 0.31071117520332336, + "learning_rate": 9.308667907356856e-05, + "loss": 0.9464, + "step": 560 + }, + { + "epoch": 0.1832509901596505, + "grad_norm": 0.3157282769680023, + "learning_rate": 9.306018731380472e-05, + "loss": 0.9281, + "step": 561 + }, + { + "epoch": 0.18357764076599567, + "grad_norm": 0.33144116401672363, + "learning_rate": 9.303364867765619e-05, + "loss": 0.8753, + "step": 562 + }, + { + "epoch": 0.18390429137234085, + "grad_norm": 0.3958095610141754, + "learning_rate": 9.300706319401358e-05, + "loss": 1.0352, + "step": 563 + }, + { + "epoch": 0.18423094197868606, + "grad_norm": 0.4302027225494385, + "learning_rate": 9.298043089181852e-05, + "loss": 0.9228, + "step": 564 + }, + { + "epoch": 0.18455759258503124, + "grad_norm": 0.39587950706481934, + "learning_rate": 9.295375180006356e-05, + "loss": 0.9858, + "step": 565 + }, + { + "epoch": 0.18488424319137642, + "grad_norm": 0.486945778131485, + "learning_rate": 9.292702594779224e-05, + "loss": 1.0143, + "step": 566 + }, + { + "epoch": 0.1852108937977216, + "grad_norm": 0.44961902499198914, + "learning_rate": 9.2900253364099e-05, + "loss": 1.0699, + "step": 567 + }, + { + "epoch": 0.1855375444040668, + "grad_norm": 0.4762088656425476, + "learning_rate": 9.287343407812909e-05, + "loss": 1.0856, + "step": 568 + }, + { + "epoch": 0.185864195010412, + "grad_norm": 0.5508722066879272, + "learning_rate": 9.28465681190787e-05, + "loss": 1.2314, + "step": 569 + }, + { + "epoch": 0.18619084561675717, + "grad_norm": 0.6167343258857727, + "learning_rate": 9.281965551619476e-05, + "loss": 1.1919, + "step": 570 + }, + { + "epoch": 0.18651749622310237, + "grad_norm": 0.7416098713874817, + "learning_rate": 9.279269629877497e-05, + "loss": 1.2134, + "step": 571 + }, + { + "epoch": 0.18684414682944755, + "grad_norm": 0.9034223556518555, + "learning_rate": 9.276569049616784e-05, + "loss": 1.2163, + "step": 572 + }, + { + "epoch": 0.18717079743579274, + "grad_norm": 1.2114628553390503, + "learning_rate": 9.273863813777253e-05, + "loss": 1.2241, + "step": 573 + }, + { + "epoch": 0.18749744804213792, + "grad_norm": 1.2960182428359985, + "learning_rate": 9.27115392530389e-05, + "loss": 1.1737, + "step": 574 + }, + { + "epoch": 0.18782409864848312, + "grad_norm": 1.766401767730713, + "learning_rate": 9.268439387146747e-05, + "loss": 1.6614, + "step": 575 + }, + { + "epoch": 0.1881507492548283, + "grad_norm": 0.1993979662656784, + "learning_rate": 9.26572020226094e-05, + "loss": 0.8511, + "step": 576 + }, + { + "epoch": 0.18847739986117348, + "grad_norm": 0.23298919200897217, + "learning_rate": 9.262996373606638e-05, + "loss": 0.8701, + "step": 577 + }, + { + "epoch": 0.1888040504675187, + "grad_norm": 0.2348179668188095, + "learning_rate": 9.26026790414907e-05, + "loss": 0.8383, + "step": 578 + }, + { + "epoch": 0.18913070107386387, + "grad_norm": 0.258207768201828, + "learning_rate": 9.257534796858514e-05, + "loss": 0.8137, + "step": 579 + }, + { + "epoch": 0.18945735168020905, + "grad_norm": 0.2487855851650238, + "learning_rate": 9.2547970547103e-05, + "loss": 0.8072, + "step": 580 + }, + { + "epoch": 0.18978400228655423, + "grad_norm": 0.2482251226902008, + "learning_rate": 9.252054680684799e-05, + "loss": 0.8812, + "step": 581 + }, + { + "epoch": 0.19011065289289944, + "grad_norm": 0.264870285987854, + "learning_rate": 9.249307677767429e-05, + "loss": 0.9027, + "step": 582 + }, + { + "epoch": 0.19043730349924462, + "grad_norm": 0.27695828676223755, + "learning_rate": 9.246556048948645e-05, + "loss": 0.9643, + "step": 583 + }, + { + "epoch": 0.1907639541055898, + "grad_norm": 0.2882271409034729, + "learning_rate": 9.243799797223938e-05, + "loss": 0.9431, + "step": 584 + }, + { + "epoch": 0.191090604711935, + "grad_norm": 0.2983972132205963, + "learning_rate": 9.241038925593832e-05, + "loss": 0.8474, + "step": 585 + }, + { + "epoch": 0.1914172553182802, + "grad_norm": 0.2943267524242401, + "learning_rate": 9.23827343706388e-05, + "loss": 0.8948, + "step": 586 + }, + { + "epoch": 0.19174390592462537, + "grad_norm": 0.3304755985736847, + "learning_rate": 9.235503334644662e-05, + "loss": 0.98, + "step": 587 + }, + { + "epoch": 0.19207055653097055, + "grad_norm": 0.3549445867538452, + "learning_rate": 9.232728621351778e-05, + "loss": 0.9946, + "step": 588 + }, + { + "epoch": 0.19239720713731576, + "grad_norm": 0.3509342670440674, + "learning_rate": 9.229949300205852e-05, + "loss": 0.9771, + "step": 589 + }, + { + "epoch": 0.19272385774366094, + "grad_norm": 0.37002110481262207, + "learning_rate": 9.22716537423252e-05, + "loss": 1.0676, + "step": 590 + }, + { + "epoch": 0.19305050835000612, + "grad_norm": 0.3887923061847687, + "learning_rate": 9.224376846462434e-05, + "loss": 1.0657, + "step": 591 + }, + { + "epoch": 0.19337715895635132, + "grad_norm": 0.41990533471107483, + "learning_rate": 9.221583719931253e-05, + "loss": 1.0131, + "step": 592 + }, + { + "epoch": 0.1937038095626965, + "grad_norm": 0.46272900700569153, + "learning_rate": 9.218785997679643e-05, + "loss": 1.1946, + "step": 593 + }, + { + "epoch": 0.19403046016904169, + "grad_norm": 0.5167956352233887, + "learning_rate": 9.215983682753275e-05, + "loss": 0.9902, + "step": 594 + }, + { + "epoch": 0.19435711077538687, + "grad_norm": 0.5987634658813477, + "learning_rate": 9.213176778202818e-05, + "loss": 1.1352, + "step": 595 + }, + { + "epoch": 0.19468376138173207, + "grad_norm": 0.6184087991714478, + "learning_rate": 9.210365287083939e-05, + "loss": 1.3175, + "step": 596 + }, + { + "epoch": 0.19501041198807725, + "grad_norm": 0.7726343870162964, + "learning_rate": 9.207549212457293e-05, + "loss": 1.1563, + "step": 597 + }, + { + "epoch": 0.19533706259442243, + "grad_norm": 0.941210925579071, + "learning_rate": 9.204728557388535e-05, + "loss": 1.3595, + "step": 598 + }, + { + "epoch": 0.19566371320076764, + "grad_norm": 1.330126166343689, + "learning_rate": 9.201903324948292e-05, + "loss": 1.7922, + "step": 599 + }, + { + "epoch": 0.19599036380711282, + "grad_norm": 2.1613612174987793, + "learning_rate": 9.199073518212186e-05, + "loss": 1.4775, + "step": 600 + }, + { + "epoch": 0.196317014413458, + "grad_norm": 0.19565562903881073, + "learning_rate": 9.196239140260816e-05, + "loss": 0.7829, + "step": 601 + }, + { + "epoch": 0.19664366501980318, + "grad_norm": 0.22353625297546387, + "learning_rate": 9.193400194179753e-05, + "loss": 0.8601, + "step": 602 + }, + { + "epoch": 0.1969703156261484, + "grad_norm": 0.23643703758716583, + "learning_rate": 9.190556683059546e-05, + "loss": 0.8285, + "step": 603 + }, + { + "epoch": 0.19729696623249357, + "grad_norm": 0.24481813609600067, + "learning_rate": 9.187708609995711e-05, + "loss": 0.942, + "step": 604 + }, + { + "epoch": 0.19762361683883875, + "grad_norm": 0.2547847032546997, + "learning_rate": 9.184855978088729e-05, + "loss": 0.7936, + "step": 605 + }, + { + "epoch": 0.19795026744518396, + "grad_norm": 0.2569129765033722, + "learning_rate": 9.181998790444047e-05, + "loss": 0.8767, + "step": 606 + }, + { + "epoch": 0.19827691805152914, + "grad_norm": 0.26046594977378845, + "learning_rate": 9.179137050172071e-05, + "loss": 0.9215, + "step": 607 + }, + { + "epoch": 0.19860356865787432, + "grad_norm": 0.2738451063632965, + "learning_rate": 9.176270760388161e-05, + "loss": 0.9197, + "step": 608 + }, + { + "epoch": 0.1989302192642195, + "grad_norm": 0.2807084619998932, + "learning_rate": 9.173399924212631e-05, + "loss": 0.8967, + "step": 609 + }, + { + "epoch": 0.1992568698705647, + "grad_norm": 0.29269683361053467, + "learning_rate": 9.170524544770745e-05, + "loss": 0.8018, + "step": 610 + }, + { + "epoch": 0.1995835204769099, + "grad_norm": 0.30087584257125854, + "learning_rate": 9.167644625192713e-05, + "loss": 0.919, + "step": 611 + }, + { + "epoch": 0.19991017108325507, + "grad_norm": 0.33221113681793213, + "learning_rate": 9.164760168613683e-05, + "loss": 0.997, + "step": 612 + }, + { + "epoch": 0.20023682168960028, + "grad_norm": 0.3531966805458069, + "learning_rate": 9.161871178173749e-05, + "loss": 0.9249, + "step": 613 + }, + { + "epoch": 0.20056347229594546, + "grad_norm": 0.40778499841690063, + "learning_rate": 9.158977657017937e-05, + "loss": 0.9746, + "step": 614 + }, + { + "epoch": 0.20089012290229064, + "grad_norm": 0.40673455595970154, + "learning_rate": 9.156079608296204e-05, + "loss": 0.9772, + "step": 615 + }, + { + "epoch": 0.20121677350863582, + "grad_norm": 0.4674896001815796, + "learning_rate": 9.15317703516344e-05, + "loss": 0.9516, + "step": 616 + }, + { + "epoch": 0.20154342411498102, + "grad_norm": 0.4718310236930847, + "learning_rate": 9.150269940779457e-05, + "loss": 1.0437, + "step": 617 + }, + { + "epoch": 0.2018700747213262, + "grad_norm": 0.5518301725387573, + "learning_rate": 9.147358328308987e-05, + "loss": 1.2963, + "step": 618 + }, + { + "epoch": 0.20219672532767138, + "grad_norm": 0.5814355611801147, + "learning_rate": 9.144442200921688e-05, + "loss": 1.0964, + "step": 619 + }, + { + "epoch": 0.20252337593401656, + "grad_norm": 0.6938658952713013, + "learning_rate": 9.141521561792127e-05, + "loss": 1.3407, + "step": 620 + }, + { + "epoch": 0.20285002654036177, + "grad_norm": 0.7887697815895081, + "learning_rate": 9.138596414099781e-05, + "loss": 1.4162, + "step": 621 + }, + { + "epoch": 0.20317667714670695, + "grad_norm": 0.8191277384757996, + "learning_rate": 9.135666761029043e-05, + "loss": 1.3763, + "step": 622 + }, + { + "epoch": 0.20350332775305213, + "grad_norm": 1.0939558744430542, + "learning_rate": 9.1327326057692e-05, + "loss": 1.2384, + "step": 623 + }, + { + "epoch": 0.20382997835939734, + "grad_norm": 1.284587025642395, + "learning_rate": 9.129793951514449e-05, + "loss": 1.7821, + "step": 624 + }, + { + "epoch": 0.20415662896574252, + "grad_norm": 1.8239585161209106, + "learning_rate": 9.126850801463884e-05, + "loss": 2.5487, + "step": 625 + }, + { + "epoch": 0.2044832795720877, + "grad_norm": 0.1805727183818817, + "learning_rate": 9.123903158821487e-05, + "loss": 0.7256, + "step": 626 + }, + { + "epoch": 0.20480993017843288, + "grad_norm": 0.2200499176979065, + "learning_rate": 9.120951026796138e-05, + "loss": 0.796, + "step": 627 + }, + { + "epoch": 0.2051365807847781, + "grad_norm": 0.23434124886989594, + "learning_rate": 9.117994408601598e-05, + "loss": 0.9145, + "step": 628 + }, + { + "epoch": 0.20546323139112327, + "grad_norm": 0.23503413796424866, + "learning_rate": 9.115033307456515e-05, + "loss": 0.8078, + "step": 629 + }, + { + "epoch": 0.20578988199746845, + "grad_norm": 0.2527485489845276, + "learning_rate": 9.112067726584419e-05, + "loss": 0.8819, + "step": 630 + }, + { + "epoch": 0.20611653260381366, + "grad_norm": 0.27487868070602417, + "learning_rate": 9.109097669213713e-05, + "loss": 0.8779, + "step": 631 + }, + { + "epoch": 0.20644318321015884, + "grad_norm": 0.2560602128505707, + "learning_rate": 9.106123138577675e-05, + "loss": 0.8185, + "step": 632 + }, + { + "epoch": 0.20676983381650402, + "grad_norm": 0.2872486114501953, + "learning_rate": 9.103144137914454e-05, + "loss": 0.8721, + "step": 633 + }, + { + "epoch": 0.2070964844228492, + "grad_norm": 0.298370361328125, + "learning_rate": 9.100160670467064e-05, + "loss": 0.8647, + "step": 634 + }, + { + "epoch": 0.2074231350291944, + "grad_norm": 0.3106897473335266, + "learning_rate": 9.097172739483379e-05, + "loss": 0.968, + "step": 635 + }, + { + "epoch": 0.20774978563553959, + "grad_norm": 0.32926785945892334, + "learning_rate": 9.094180348216135e-05, + "loss": 0.9056, + "step": 636 + }, + { + "epoch": 0.20807643624188477, + "grad_norm": 0.35057681798934937, + "learning_rate": 9.091183499922924e-05, + "loss": 0.9649, + "step": 637 + }, + { + "epoch": 0.20840308684822997, + "grad_norm": 0.35323870182037354, + "learning_rate": 9.088182197866189e-05, + "loss": 0.867, + "step": 638 + }, + { + "epoch": 0.20872973745457515, + "grad_norm": 0.40710511803627014, + "learning_rate": 9.085176445313223e-05, + "loss": 0.9675, + "step": 639 + }, + { + "epoch": 0.20905638806092033, + "grad_norm": 0.38960814476013184, + "learning_rate": 9.08216624553616e-05, + "loss": 0.9669, + "step": 640 + }, + { + "epoch": 0.20938303866726551, + "grad_norm": 0.4638998806476593, + "learning_rate": 9.079151601811979e-05, + "loss": 1.136, + "step": 641 + }, + { + "epoch": 0.20970968927361072, + "grad_norm": 0.4785967171192169, + "learning_rate": 9.076132517422497e-05, + "loss": 1.1302, + "step": 642 + }, + { + "epoch": 0.2100363398799559, + "grad_norm": 0.5356094837188721, + "learning_rate": 9.073108995654362e-05, + "loss": 1.0974, + "step": 643 + }, + { + "epoch": 0.21036299048630108, + "grad_norm": 0.5519333481788635, + "learning_rate": 9.070081039799056e-05, + "loss": 1.0546, + "step": 644 + }, + { + "epoch": 0.2106896410926463, + "grad_norm": 0.6569791436195374, + "learning_rate": 9.067048653152885e-05, + "loss": 1.3072, + "step": 645 + }, + { + "epoch": 0.21101629169899147, + "grad_norm": 0.8130210638046265, + "learning_rate": 9.064011839016982e-05, + "loss": 1.3312, + "step": 646 + }, + { + "epoch": 0.21134294230533665, + "grad_norm": 1.0476133823394775, + "learning_rate": 9.060970600697296e-05, + "loss": 1.389, + "step": 647 + }, + { + "epoch": 0.21166959291168183, + "grad_norm": 1.3284598588943481, + "learning_rate": 9.057924941504596e-05, + "loss": 1.8065, + "step": 648 + }, + { + "epoch": 0.21199624351802704, + "grad_norm": 1.56967031955719, + "learning_rate": 9.05487486475446e-05, + "loss": 1.9164, + "step": 649 + }, + { + "epoch": 0.21232289412437222, + "grad_norm": 1.61257803440094, + "learning_rate": 9.05182037376728e-05, + "loss": 1.6794, + "step": 650 + }, + { + "epoch": 0.2126495447307174, + "grad_norm": 0.19036370515823364, + "learning_rate": 9.048761471868248e-05, + "loss": 0.7986, + "step": 651 + }, + { + "epoch": 0.2129761953370626, + "grad_norm": 0.2137875109910965, + "learning_rate": 9.04569816238736e-05, + "loss": 0.7879, + "step": 652 + }, + { + "epoch": 0.2133028459434078, + "grad_norm": 0.21933026611804962, + "learning_rate": 9.042630448659413e-05, + "loss": 0.8792, + "step": 653 + }, + { + "epoch": 0.21362949654975297, + "grad_norm": 0.24495473504066467, + "learning_rate": 9.039558334023991e-05, + "loss": 0.8169, + "step": 654 + }, + { + "epoch": 0.21395614715609815, + "grad_norm": 0.2623686194419861, + "learning_rate": 9.03648182182548e-05, + "loss": 0.8705, + "step": 655 + }, + { + "epoch": 0.21428279776244336, + "grad_norm": 0.26370468735694885, + "learning_rate": 9.033400915413044e-05, + "loss": 0.9156, + "step": 656 + }, + { + "epoch": 0.21460944836878854, + "grad_norm": 0.2694458067417145, + "learning_rate": 9.030315618140634e-05, + "loss": 0.9068, + "step": 657 + }, + { + "epoch": 0.21493609897513372, + "grad_norm": 0.2986109256744385, + "learning_rate": 9.027225933366982e-05, + "loss": 0.9385, + "step": 658 + }, + { + "epoch": 0.21526274958147892, + "grad_norm": 0.28405866026878357, + "learning_rate": 9.024131864455594e-05, + "loss": 0.8657, + "step": 659 + }, + { + "epoch": 0.2155894001878241, + "grad_norm": 0.31199824810028076, + "learning_rate": 9.02103341477475e-05, + "loss": 0.9322, + "step": 660 + }, + { + "epoch": 0.21591605079416928, + "grad_norm": 0.32993587851524353, + "learning_rate": 9.017930587697501e-05, + "loss": 0.9344, + "step": 661 + }, + { + "epoch": 0.21624270140051446, + "grad_norm": 0.3460935652256012, + "learning_rate": 9.014823386601658e-05, + "loss": 0.9538, + "step": 662 + }, + { + "epoch": 0.21656935200685967, + "grad_norm": 0.3695278763771057, + "learning_rate": 9.011711814869798e-05, + "loss": 0.9617, + "step": 663 + }, + { + "epoch": 0.21689600261320485, + "grad_norm": 0.38950344920158386, + "learning_rate": 9.008595875889258e-05, + "loss": 0.8396, + "step": 664 + }, + { + "epoch": 0.21722265321955003, + "grad_norm": 0.4560457468032837, + "learning_rate": 9.005475573052123e-05, + "loss": 1.1105, + "step": 665 + }, + { + "epoch": 0.21754930382589524, + "grad_norm": 0.4376985728740692, + "learning_rate": 9.002350909755231e-05, + "loss": 0.9999, + "step": 666 + }, + { + "epoch": 0.21787595443224042, + "grad_norm": 0.49906688928604126, + "learning_rate": 8.999221889400171e-05, + "loss": 1.1028, + "step": 667 + }, + { + "epoch": 0.2182026050385856, + "grad_norm": 0.5753675699234009, + "learning_rate": 8.996088515393268e-05, + "loss": 1.1046, + "step": 668 + }, + { + "epoch": 0.21852925564493078, + "grad_norm": 0.6120446920394897, + "learning_rate": 8.992950791145596e-05, + "loss": 1.1665, + "step": 669 + }, + { + "epoch": 0.218855906251276, + "grad_norm": 0.8368681073188782, + "learning_rate": 8.989808720072955e-05, + "loss": 1.4047, + "step": 670 + }, + { + "epoch": 0.21918255685762117, + "grad_norm": 0.9521639347076416, + "learning_rate": 8.98666230559588e-05, + "loss": 1.4259, + "step": 671 + }, + { + "epoch": 0.21950920746396635, + "grad_norm": 1.2514474391937256, + "learning_rate": 8.983511551139641e-05, + "loss": 1.5017, + "step": 672 + }, + { + "epoch": 0.21983585807031153, + "grad_norm": 1.1896357536315918, + "learning_rate": 8.980356460134222e-05, + "loss": 1.3464, + "step": 673 + }, + { + "epoch": 0.22016250867665674, + "grad_norm": 1.2468774318695068, + "learning_rate": 8.977197036014336e-05, + "loss": 1.3893, + "step": 674 + }, + { + "epoch": 0.22048915928300192, + "grad_norm": 2.550205707550049, + "learning_rate": 8.974033282219407e-05, + "loss": 2.4879, + "step": 675 + }, + { + "epoch": 0.2208158098893471, + "grad_norm": 0.22337143123149872, + "learning_rate": 8.970865202193581e-05, + "loss": 0.8812, + "step": 676 + }, + { + "epoch": 0.2211424604956923, + "grad_norm": 0.23587287962436676, + "learning_rate": 8.967692799385702e-05, + "loss": 0.9307, + "step": 677 + }, + { + "epoch": 0.22146911110203749, + "grad_norm": 0.23437225818634033, + "learning_rate": 8.964516077249331e-05, + "loss": 0.858, + "step": 678 + }, + { + "epoch": 0.22179576170838267, + "grad_norm": 0.24456767737865448, + "learning_rate": 8.961335039242727e-05, + "loss": 0.8753, + "step": 679 + }, + { + "epoch": 0.22212241231472785, + "grad_norm": 0.2503317892551422, + "learning_rate": 8.95814968882884e-05, + "loss": 0.8468, + "step": 680 + }, + { + "epoch": 0.22244906292107305, + "grad_norm": 0.24607540667057037, + "learning_rate": 8.954960029475328e-05, + "loss": 0.8255, + "step": 681 + }, + { + "epoch": 0.22277571352741823, + "grad_norm": 0.28483763337135315, + "learning_rate": 8.95176606465453e-05, + "loss": 0.8317, + "step": 682 + }, + { + "epoch": 0.22310236413376341, + "grad_norm": 0.2800973355770111, + "learning_rate": 8.948567797843476e-05, + "loss": 0.8759, + "step": 683 + }, + { + "epoch": 0.22342901474010862, + "grad_norm": 0.2901217043399811, + "learning_rate": 8.945365232523877e-05, + "loss": 0.9152, + "step": 684 + }, + { + "epoch": 0.2237556653464538, + "grad_norm": 0.2840677499771118, + "learning_rate": 8.942158372182126e-05, + "loss": 0.8851, + "step": 685 + }, + { + "epoch": 0.22408231595279898, + "grad_norm": 0.29865285754203796, + "learning_rate": 8.93894722030929e-05, + "loss": 0.894, + "step": 686 + }, + { + "epoch": 0.22440896655914416, + "grad_norm": 0.3416735827922821, + "learning_rate": 8.935731780401109e-05, + "loss": 0.9313, + "step": 687 + }, + { + "epoch": 0.22473561716548937, + "grad_norm": 0.35249149799346924, + "learning_rate": 8.93251205595799e-05, + "loss": 0.9602, + "step": 688 + }, + { + "epoch": 0.22506226777183455, + "grad_norm": 0.3721778094768524, + "learning_rate": 8.929288050485005e-05, + "loss": 0.9332, + "step": 689 + }, + { + "epoch": 0.22538891837817973, + "grad_norm": 0.39727091789245605, + "learning_rate": 8.926059767491884e-05, + "loss": 1.0504, + "step": 690 + }, + { + "epoch": 0.22571556898452494, + "grad_norm": 0.43423396348953247, + "learning_rate": 8.922827210493019e-05, + "loss": 1.1031, + "step": 691 + }, + { + "epoch": 0.22604221959087012, + "grad_norm": 0.4453451335430145, + "learning_rate": 8.919590383007448e-05, + "loss": 0.9969, + "step": 692 + }, + { + "epoch": 0.2263688701972153, + "grad_norm": 0.5197416543960571, + "learning_rate": 8.916349288558865e-05, + "loss": 1.1138, + "step": 693 + }, + { + "epoch": 0.22669552080356048, + "grad_norm": 0.539627194404602, + "learning_rate": 8.913103930675602e-05, + "loss": 1.1913, + "step": 694 + }, + { + "epoch": 0.2270221714099057, + "grad_norm": 0.670895516872406, + "learning_rate": 8.90985431289064e-05, + "loss": 1.2253, + "step": 695 + }, + { + "epoch": 0.22734882201625087, + "grad_norm": 0.8996394872665405, + "learning_rate": 8.906600438741589e-05, + "loss": 1.415, + "step": 696 + }, + { + "epoch": 0.22767547262259605, + "grad_norm": 0.964454174041748, + "learning_rate": 8.9033423117707e-05, + "loss": 1.4633, + "step": 697 + }, + { + "epoch": 0.22800212322894126, + "grad_norm": 1.2644301652908325, + "learning_rate": 8.900079935524849e-05, + "loss": 1.4658, + "step": 698 + }, + { + "epoch": 0.22832877383528644, + "grad_norm": 1.2433687448501587, + "learning_rate": 8.896813313555539e-05, + "loss": 1.7506, + "step": 699 + }, + { + "epoch": 0.22865542444163162, + "grad_norm": 1.4691505432128906, + "learning_rate": 8.893542449418897e-05, + "loss": 1.9687, + "step": 700 + }, + { + "epoch": 0.2289820750479768, + "grad_norm": 0.18912772834300995, + "learning_rate": 8.890267346675666e-05, + "loss": 0.811, + "step": 701 + }, + { + "epoch": 0.229308725654322, + "grad_norm": 0.23038050532341003, + "learning_rate": 8.886988008891205e-05, + "loss": 0.8602, + "step": 702 + }, + { + "epoch": 0.22963537626066718, + "grad_norm": 0.2359895408153534, + "learning_rate": 8.883704439635479e-05, + "loss": 0.8598, + "step": 703 + }, + { + "epoch": 0.22996202686701236, + "grad_norm": 0.23904529213905334, + "learning_rate": 8.880416642483063e-05, + "loss": 0.813, + "step": 704 + }, + { + "epoch": 0.23028867747335757, + "grad_norm": 0.25027212500572205, + "learning_rate": 8.877124621013139e-05, + "loss": 0.8003, + "step": 705 + }, + { + "epoch": 0.23061532807970275, + "grad_norm": 0.27593323588371277, + "learning_rate": 8.873828378809479e-05, + "loss": 0.9057, + "step": 706 + }, + { + "epoch": 0.23094197868604793, + "grad_norm": 0.2732234597206116, + "learning_rate": 8.870527919460454e-05, + "loss": 0.8575, + "step": 707 + }, + { + "epoch": 0.2312686292923931, + "grad_norm": 0.28355199098587036, + "learning_rate": 8.867223246559027e-05, + "loss": 0.8595, + "step": 708 + }, + { + "epoch": 0.23159527989873832, + "grad_norm": 0.2759125232696533, + "learning_rate": 8.863914363702746e-05, + "loss": 0.8502, + "step": 709 + }, + { + "epoch": 0.2319219305050835, + "grad_norm": 0.2850385010242462, + "learning_rate": 8.86060127449374e-05, + "loss": 0.9327, + "step": 710 + }, + { + "epoch": 0.23224858111142868, + "grad_norm": 0.29679635167121887, + "learning_rate": 8.857283982538727e-05, + "loss": 0.886, + "step": 711 + }, + { + "epoch": 0.2325752317177739, + "grad_norm": 0.32602760195732117, + "learning_rate": 8.853962491448985e-05, + "loss": 0.847, + "step": 712 + }, + { + "epoch": 0.23290188232411907, + "grad_norm": 0.35561874508857727, + "learning_rate": 8.850636804840375e-05, + "loss": 0.8877, + "step": 713 + }, + { + "epoch": 0.23322853293046425, + "grad_norm": 0.38955962657928467, + "learning_rate": 8.847306926333323e-05, + "loss": 1.0865, + "step": 714 + }, + { + "epoch": 0.23355518353680943, + "grad_norm": 0.4370516538619995, + "learning_rate": 8.843972859552816e-05, + "loss": 1.0979, + "step": 715 + }, + { + "epoch": 0.23388183414315464, + "grad_norm": 0.476929247379303, + "learning_rate": 8.8406346081284e-05, + "loss": 1.1663, + "step": 716 + }, + { + "epoch": 0.23420848474949982, + "grad_norm": 0.45586854219436646, + "learning_rate": 8.837292175694178e-05, + "loss": 1.1494, + "step": 717 + }, + { + "epoch": 0.234535135355845, + "grad_norm": 0.49512070417404175, + "learning_rate": 8.833945565888809e-05, + "loss": 1.0131, + "step": 718 + }, + { + "epoch": 0.2348617859621902, + "grad_norm": 0.5380144715309143, + "learning_rate": 8.830594782355489e-05, + "loss": 1.1249, + "step": 719 + }, + { + "epoch": 0.2351884365685354, + "grad_norm": 0.5472076535224915, + "learning_rate": 8.827239828741969e-05, + "loss": 1.0563, + "step": 720 + }, + { + "epoch": 0.23551508717488057, + "grad_norm": 0.7166975140571594, + "learning_rate": 8.82388070870053e-05, + "loss": 1.0809, + "step": 721 + }, + { + "epoch": 0.23584173778122575, + "grad_norm": 0.8227059245109558, + "learning_rate": 8.820517425887998e-05, + "loss": 1.6083, + "step": 722 + }, + { + "epoch": 0.23616838838757095, + "grad_norm": 1.0604398250579834, + "learning_rate": 8.81714998396572e-05, + "loss": 1.4259, + "step": 723 + }, + { + "epoch": 0.23649503899391613, + "grad_norm": 1.3100907802581787, + "learning_rate": 8.813778386599582e-05, + "loss": 1.5418, + "step": 724 + }, + { + "epoch": 0.23682168960026131, + "grad_norm": 1.9565879106521606, + "learning_rate": 8.810402637459987e-05, + "loss": 1.8868, + "step": 725 + }, + { + "epoch": 0.2371483402066065, + "grad_norm": 0.19288013875484467, + "learning_rate": 8.807022740221856e-05, + "loss": 0.7707, + "step": 726 + }, + { + "epoch": 0.2374749908129517, + "grad_norm": 0.2201738804578781, + "learning_rate": 8.803638698564631e-05, + "loss": 0.8566, + "step": 727 + }, + { + "epoch": 0.23780164141929688, + "grad_norm": 0.24070677161216736, + "learning_rate": 8.800250516172264e-05, + "loss": 0.8972, + "step": 728 + }, + { + "epoch": 0.23812829202564206, + "grad_norm": 0.25289085507392883, + "learning_rate": 8.796858196733214e-05, + "loss": 0.8939, + "step": 729 + }, + { + "epoch": 0.23845494263198727, + "grad_norm": 0.25093331933021545, + "learning_rate": 8.793461743940442e-05, + "loss": 0.8377, + "step": 730 + }, + { + "epoch": 0.23878159323833245, + "grad_norm": 0.2646547853946686, + "learning_rate": 8.790061161491409e-05, + "loss": 0.8811, + "step": 731 + }, + { + "epoch": 0.23910824384467763, + "grad_norm": 0.27221933007240295, + "learning_rate": 8.78665645308808e-05, + "loss": 0.8527, + "step": 732 + }, + { + "epoch": 0.2394348944510228, + "grad_norm": 0.2808360159397125, + "learning_rate": 8.783247622436899e-05, + "loss": 0.8598, + "step": 733 + }, + { + "epoch": 0.23976154505736802, + "grad_norm": 0.2827470302581787, + "learning_rate": 8.779834673248803e-05, + "loss": 0.8815, + "step": 734 + }, + { + "epoch": 0.2400881956637132, + "grad_norm": 0.3063890039920807, + "learning_rate": 8.776417609239218e-05, + "loss": 0.9354, + "step": 735 + }, + { + "epoch": 0.24041484627005838, + "grad_norm": 0.3075849711894989, + "learning_rate": 8.772996434128039e-05, + "loss": 0.8594, + "step": 736 + }, + { + "epoch": 0.2407414968764036, + "grad_norm": 0.3473495543003082, + "learning_rate": 8.769571151639644e-05, + "loss": 0.9897, + "step": 737 + }, + { + "epoch": 0.24106814748274877, + "grad_norm": 0.33075860142707825, + "learning_rate": 8.766141765502882e-05, + "loss": 1.0453, + "step": 738 + }, + { + "epoch": 0.24139479808909395, + "grad_norm": 0.40573230385780334, + "learning_rate": 8.762708279451063e-05, + "loss": 0.98, + "step": 739 + }, + { + "epoch": 0.24172144869543913, + "grad_norm": 0.36625126004219055, + "learning_rate": 8.759270697221971e-05, + "loss": 0.9599, + "step": 740 + }, + { + "epoch": 0.24204809930178434, + "grad_norm": 0.3789199888706207, + "learning_rate": 8.755829022557837e-05, + "loss": 0.9305, + "step": 741 + }, + { + "epoch": 0.24237474990812952, + "grad_norm": 0.41750094294548035, + "learning_rate": 8.752383259205355e-05, + "loss": 0.8474, + "step": 742 + }, + { + "epoch": 0.2427014005144747, + "grad_norm": 0.5106528401374817, + "learning_rate": 8.748933410915671e-05, + "loss": 1.1046, + "step": 743 + }, + { + "epoch": 0.2430280511208199, + "grad_norm": 0.5671947002410889, + "learning_rate": 8.745479481444372e-05, + "loss": 1.2206, + "step": 744 + }, + { + "epoch": 0.24335470172716508, + "grad_norm": 0.6028645634651184, + "learning_rate": 8.742021474551492e-05, + "loss": 1.2882, + "step": 745 + }, + { + "epoch": 0.24368135233351026, + "grad_norm": 0.7777064442634583, + "learning_rate": 8.738559394001503e-05, + "loss": 1.3427, + "step": 746 + }, + { + "epoch": 0.24400800293985545, + "grad_norm": 0.9058758616447449, + "learning_rate": 8.735093243563311e-05, + "loss": 1.0836, + "step": 747 + }, + { + "epoch": 0.24433465354620065, + "grad_norm": 1.1198101043701172, + "learning_rate": 8.731623027010254e-05, + "loss": 1.7559, + "step": 748 + }, + { + "epoch": 0.24466130415254583, + "grad_norm": 1.198984980583191, + "learning_rate": 8.728148748120095e-05, + "loss": 1.4122, + "step": 749 + }, + { + "epoch": 0.244987954758891, + "grad_norm": 1.8164176940917969, + "learning_rate": 8.72467041067502e-05, + "loss": 1.6278, + "step": 750 + }, + { + "epoch": 0.24531460536523622, + "grad_norm": 0.2089819461107254, + "learning_rate": 8.721188018461633e-05, + "loss": 0.7842, + "step": 751 + }, + { + "epoch": 0.2456412559715814, + "grad_norm": 0.2276405692100525, + "learning_rate": 8.717701575270953e-05, + "loss": 0.8506, + "step": 752 + }, + { + "epoch": 0.24596790657792658, + "grad_norm": 0.23214949667453766, + "learning_rate": 8.714211084898407e-05, + "loss": 0.7915, + "step": 753 + }, + { + "epoch": 0.24629455718427176, + "grad_norm": 0.2502540051937103, + "learning_rate": 8.71071655114383e-05, + "loss": 0.8058, + "step": 754 + }, + { + "epoch": 0.24662120779061697, + "grad_norm": 0.25934338569641113, + "learning_rate": 8.707217977811456e-05, + "loss": 0.9029, + "step": 755 + }, + { + "epoch": 0.24694785839696215, + "grad_norm": 0.2814617455005646, + "learning_rate": 8.703715368709922e-05, + "loss": 0.87, + "step": 756 + }, + { + "epoch": 0.24727450900330733, + "grad_norm": 0.27127784490585327, + "learning_rate": 8.700208727652252e-05, + "loss": 0.8441, + "step": 757 + }, + { + "epoch": 0.24760115960965254, + "grad_norm": 0.2742736041545868, + "learning_rate": 8.696698058455863e-05, + "loss": 0.8699, + "step": 758 + }, + { + "epoch": 0.24792781021599772, + "grad_norm": 0.2857213020324707, + "learning_rate": 8.693183364942556e-05, + "loss": 0.9757, + "step": 759 + }, + { + "epoch": 0.2482544608223429, + "grad_norm": 0.30193600058555603, + "learning_rate": 8.689664650938516e-05, + "loss": 0.9146, + "step": 760 + }, + { + "epoch": 0.24858111142868808, + "grad_norm": 0.33248287439346313, + "learning_rate": 8.686141920274297e-05, + "loss": 0.9358, + "step": 761 + }, + { + "epoch": 0.2489077620350333, + "grad_norm": 0.3253381848335266, + "learning_rate": 8.682615176784835e-05, + "loss": 0.8875, + "step": 762 + }, + { + "epoch": 0.24923441264137847, + "grad_norm": 0.35137176513671875, + "learning_rate": 8.679084424309428e-05, + "loss": 0.9901, + "step": 763 + }, + { + "epoch": 0.24956106324772365, + "grad_norm": 0.38631191849708557, + "learning_rate": 8.675549666691742e-05, + "loss": 0.9687, + "step": 764 + }, + { + "epoch": 0.24988771385406885, + "grad_norm": 0.3780824840068817, + "learning_rate": 8.672010907779799e-05, + "loss": 0.9143, + "step": 765 + }, + { + "epoch": 0.250214364460414, + "grad_norm": 0.439392626285553, + "learning_rate": 8.668468151425982e-05, + "loss": 1.0443, + "step": 766 + }, + { + "epoch": 0.250214364460414, + "eval_loss": 1.0732765197753906, + "eval_runtime": 499.2422, + "eval_samples_per_second": 5.164, + "eval_steps_per_second": 2.582, + "step": 766 + } + ], + "logging_steps": 1, + "max_steps": 3061, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 766, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.04481580677333e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}