diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,28053 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6809669731018045, + "eval_steps": 1000, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017024174327545113, + "grad_norm": 5.9435296058654785, + "learning_rate": 0.0, + "loss": 0.9299, + "step": 1 + }, + { + "epoch": 0.00034048348655090226, + "grad_norm": 4.946147918701172, + "learning_rate": 1.3391173292339812e-07, + "loss": 0.8753, + "step": 2 + }, + { + "epoch": 0.0005107252298263534, + "grad_norm": 4.654350280761719, + "learning_rate": 2.1224507509017273e-07, + "loss": 0.8961, + "step": 3 + }, + { + "epoch": 0.0006809669731018045, + "grad_norm": 4.624459266662598, + "learning_rate": 2.6782346584679625e-07, + "loss": 0.8702, + "step": 4 + }, + { + "epoch": 0.0008512087163772557, + "grad_norm": 4.840507984161377, + "learning_rate": 3.109334149098911e-07, + "loss": 0.9089, + "step": 5 + }, + { + "epoch": 0.0010214504596527069, + "grad_norm": 4.609108924865723, + "learning_rate": 3.4615680801357083e-07, + "loss": 0.8922, + "step": 6 + }, + { + "epoch": 0.001191692202928158, + "grad_norm": 4.153110980987549, + "learning_rate": 3.759377625437651e-07, + "loss": 0.8303, + "step": 7 + }, + { + "epoch": 0.001361933946203609, + "grad_norm": 3.97541880607605, + "learning_rate": 4.017351987701944e-07, + "loss": 0.8569, + "step": 8 + }, + { + "epoch": 0.0015321756894790602, + "grad_norm": 3.4270944595336914, + "learning_rate": 4.2449015018034546e-07, + "loss": 0.7777, + "step": 9 + }, + { + "epoch": 0.0017024174327545114, + "grad_norm": 3.244053840637207, + "learning_rate": 4.4484514783328935e-07, + "loss": 0.7431, + "step": 10 + }, + { + "epoch": 0.0018726591760299626, + "grad_norm": 3.140742778778076, + "learning_rate": 4.632584829817167e-07, + "loss": 0.7605, + "step": 11 + }, + { + "epoch": 0.0020429009193054137, + "grad_norm": 3.1321463584899902, + "learning_rate": 4.80068540936969e-07, + "loss": 0.7243, + "step": 12 + }, + { + "epoch": 0.0022131426625808647, + "grad_norm": 2.890293836593628, + "learning_rate": 4.955322952348447e-07, + "loss": 0.6453, + "step": 13 + }, + { + "epoch": 0.002383384405856316, + "grad_norm": 2.773414134979248, + "learning_rate": 5.098494954671633e-07, + "loss": 0.6187, + "step": 14 + }, + { + "epoch": 0.002553626149131767, + "grad_norm": 2.7988698482513428, + "learning_rate": 5.231784900000639e-07, + "loss": 0.6197, + "step": 15 + }, + { + "epoch": 0.002723867892407218, + "grad_norm": 2.7417197227478027, + "learning_rate": 5.356469316935925e-07, + "loss": 0.6139, + "step": 16 + }, + { + "epoch": 0.0028941096356826694, + "grad_norm": 2.771998167037964, + "learning_rate": 5.473592323318297e-07, + "loss": 0.6009, + "step": 17 + }, + { + "epoch": 0.0030643513789581204, + "grad_norm": 3.031147003173828, + "learning_rate": 5.584018831037436e-07, + "loss": 0.562, + "step": 18 + }, + { + "epoch": 0.003234593122233572, + "grad_norm": 2.6269404888153076, + "learning_rate": 5.688473346582122e-07, + "loss": 0.4996, + "step": 19 + }, + { + "epoch": 0.0034048348655090228, + "grad_norm": 2.2291667461395264, + "learning_rate": 5.787568807566874e-07, + "loss": 0.5131, + "step": 20 + }, + { + "epoch": 0.003575076608784474, + "grad_norm": 2.112966299057007, + "learning_rate": 5.881828376339378e-07, + "loss": 0.4314, + "step": 21 + }, + { + "epoch": 0.003745318352059925, + "grad_norm": 2.252114772796631, + "learning_rate": 5.971702159051149e-07, + "loss": 0.4716, + "step": 22 + }, + { + "epoch": 0.003915560095335376, + "grad_norm": 1.981942057609558, + "learning_rate": 6.057580205219512e-07, + "loss": 0.4206, + "step": 23 + }, + { + "epoch": 0.0040858018386108275, + "grad_norm": 1.7979234457015991, + "learning_rate": 6.139802738603672e-07, + "loss": 0.4061, + "step": 24 + }, + { + "epoch": 0.004256043581886279, + "grad_norm": 1.8551833629608154, + "learning_rate": 6.218668298197822e-07, + "loss": 0.427, + "step": 25 + }, + { + "epoch": 0.004426285325161729, + "grad_norm": 1.7879689931869507, + "learning_rate": 6.294440281582428e-07, + "loss": 0.4368, + "step": 26 + }, + { + "epoch": 0.004596527068437181, + "grad_norm": 1.7607426643371582, + "learning_rate": 6.367352252705181e-07, + "loss": 0.3931, + "step": 27 + }, + { + "epoch": 0.004766768811712632, + "grad_norm": 1.5092447996139526, + "learning_rate": 6.437612283905613e-07, + "loss": 0.3544, + "step": 28 + }, + { + "epoch": 0.004937010554988083, + "grad_norm": 2.1652700901031494, + "learning_rate": 6.505406535664675e-07, + "loss": 0.3799, + "step": 29 + }, + { + "epoch": 0.005107252298263534, + "grad_norm": 1.570780634880066, + "learning_rate": 6.57090222923462e-07, + "loss": 0.3741, + "step": 30 + }, + { + "epoch": 0.0052774940415389856, + "grad_norm": 1.539757251739502, + "learning_rate": 6.634250131666118e-07, + "loss": 0.3894, + "step": 31 + }, + { + "epoch": 0.005447735784814436, + "grad_norm": 1.5071591138839722, + "learning_rate": 6.695586646169908e-07, + "loss": 0.3559, + "step": 32 + }, + { + "epoch": 0.0056179775280898875, + "grad_norm": 1.2861617803573608, + "learning_rate": 6.755035580718894e-07, + "loss": 0.2831, + "step": 33 + }, + { + "epoch": 0.005788219271365339, + "grad_norm": 1.2855749130249023, + "learning_rate": 6.812709652552277e-07, + "loss": 0.3348, + "step": 34 + }, + { + "epoch": 0.00595846101464079, + "grad_norm": 1.649814248085022, + "learning_rate": 6.868711774536562e-07, + "loss": 0.3381, + "step": 35 + }, + { + "epoch": 0.006128702757916241, + "grad_norm": 1.4062033891677856, + "learning_rate": 6.923136160271417e-07, + "loss": 0.343, + "step": 36 + }, + { + "epoch": 0.006298944501191692, + "grad_norm": 1.3435696363449097, + "learning_rate": 6.976069277750015e-07, + "loss": 0.3239, + "step": 37 + }, + { + "epoch": 0.006469186244467144, + "grad_norm": 1.419482946395874, + "learning_rate": 7.027590675816104e-07, + "loss": 0.3034, + "step": 38 + }, + { + "epoch": 0.006639427987742594, + "grad_norm": 1.3096425533294678, + "learning_rate": 7.077773703250174e-07, + "loss": 0.3282, + "step": 39 + }, + { + "epoch": 0.0068096697310180455, + "grad_norm": 1.3494988679885864, + "learning_rate": 7.126686136800855e-07, + "loss": 0.3264, + "step": 40 + }, + { + "epoch": 0.006979911474293497, + "grad_norm": 1.2407349348068237, + "learning_rate": 7.174390731656332e-07, + "loss": 0.3306, + "step": 41 + }, + { + "epoch": 0.007150153217568948, + "grad_norm": 1.4115320444107056, + "learning_rate": 7.220945705573361e-07, + "loss": 0.3093, + "step": 42 + }, + { + "epoch": 0.007320394960844399, + "grad_norm": 1.4195406436920166, + "learning_rate": 7.266405166033159e-07, + "loss": 0.3102, + "step": 43 + }, + { + "epoch": 0.00749063670411985, + "grad_norm": 1.1780062913894653, + "learning_rate": 7.31081948828513e-07, + "loss": 0.3072, + "step": 44 + }, + { + "epoch": 0.007660878447395302, + "grad_norm": 1.3273332118988037, + "learning_rate": 7.354235650902366e-07, + "loss": 0.3161, + "step": 45 + }, + { + "epoch": 0.007831120190670752, + "grad_norm": 1.2345938682556152, + "learning_rate": 7.396697534453494e-07, + "loss": 0.312, + "step": 46 + }, + { + "epoch": 0.008001361933946204, + "grad_norm": 1.327564001083374, + "learning_rate": 7.438246188051406e-07, + "loss": 0.3177, + "step": 47 + }, + { + "epoch": 0.008171603677221655, + "grad_norm": 1.195912480354309, + "learning_rate": 7.478920067837654e-07, + "loss": 0.3031, + "step": 48 + }, + { + "epoch": 0.008341845420497106, + "grad_norm": 1.3030645847320557, + "learning_rate": 7.518755250875302e-07, + "loss": 0.3009, + "step": 49 + }, + { + "epoch": 0.008512087163772558, + "grad_norm": 1.2128993272781372, + "learning_rate": 7.557785627431804e-07, + "loss": 0.3139, + "step": 50 + }, + { + "epoch": 0.008682328907048007, + "grad_norm": 1.1837964057922363, + "learning_rate": 7.596043074220024e-07, + "loss": 0.2816, + "step": 51 + }, + { + "epoch": 0.008852570650323459, + "grad_norm": 1.1090530157089233, + "learning_rate": 7.633557610816411e-07, + "loss": 0.2451, + "step": 52 + }, + { + "epoch": 0.00902281239359891, + "grad_norm": 1.2700027227401733, + "learning_rate": 7.670357541179365e-07, + "loss": 0.2702, + "step": 53 + }, + { + "epoch": 0.009193054136874362, + "grad_norm": 1.1058295965194702, + "learning_rate": 7.706469581939163e-07, + "loss": 0.2632, + "step": 54 + }, + { + "epoch": 0.009363295880149813, + "grad_norm": 1.152600884437561, + "learning_rate": 7.741918978916079e-07, + "loss": 0.2693, + "step": 55 + }, + { + "epoch": 0.009533537623425264, + "grad_norm": 1.1038074493408203, + "learning_rate": 7.776729613139597e-07, + "loss": 0.2361, + "step": 56 + }, + { + "epoch": 0.009703779366700716, + "grad_norm": 1.298447847366333, + "learning_rate": 7.81092409748385e-07, + "loss": 0.2814, + "step": 57 + }, + { + "epoch": 0.009874021109976166, + "grad_norm": 1.1311403512954712, + "learning_rate": 7.844523864898656e-07, + "loss": 0.2706, + "step": 58 + }, + { + "epoch": 0.010044262853251617, + "grad_norm": 1.2915065288543701, + "learning_rate": 7.877549249098274e-07, + "loss": 0.2774, + "step": 59 + }, + { + "epoch": 0.010214504596527068, + "grad_norm": 1.0150870084762573, + "learning_rate": 7.910019558468602e-07, + "loss": 0.2325, + "step": 60 + }, + { + "epoch": 0.01038474633980252, + "grad_norm": 1.2452411651611328, + "learning_rate": 7.941953143865467e-07, + "loss": 0.2873, + "step": 61 + }, + { + "epoch": 0.010554988083077971, + "grad_norm": 1.3862929344177246, + "learning_rate": 7.973367460900099e-07, + "loss": 0.3041, + "step": 62 + }, + { + "epoch": 0.010725229826353423, + "grad_norm": 1.0656795501708984, + "learning_rate": 8.004279127241105e-07, + "loss": 0.2215, + "step": 63 + }, + { + "epoch": 0.010895471569628872, + "grad_norm": 1.3162792921066284, + "learning_rate": 8.034703975403888e-07, + "loss": 0.3019, + "step": 64 + }, + { + "epoch": 0.011065713312904324, + "grad_norm": 1.0575990676879883, + "learning_rate": 8.064657101447357e-07, + "loss": 0.2394, + "step": 65 + }, + { + "epoch": 0.011235955056179775, + "grad_norm": 1.0472103357315063, + "learning_rate": 8.094152909952874e-07, + "loss": 0.246, + "step": 66 + }, + { + "epoch": 0.011406196799455226, + "grad_norm": 1.1870509386062622, + "learning_rate": 8.123205155620937e-07, + "loss": 0.2323, + "step": 67 + }, + { + "epoch": 0.011576438542730678, + "grad_norm": 1.0447919368743896, + "learning_rate": 8.15182698178626e-07, + "loss": 0.2372, + "step": 68 + }, + { + "epoch": 0.01174668028600613, + "grad_norm": 1.3799355030059814, + "learning_rate": 8.18003095612124e-07, + "loss": 0.2981, + "step": 69 + }, + { + "epoch": 0.01191692202928158, + "grad_norm": 1.3175947666168213, + "learning_rate": 8.207829103770545e-07, + "loss": 0.2427, + "step": 70 + }, + { + "epoch": 0.01208716377255703, + "grad_norm": 1.8826605081558228, + "learning_rate": 8.235232938135481e-07, + "loss": 0.2829, + "step": 71 + }, + { + "epoch": 0.012257405515832482, + "grad_norm": 1.0391643047332764, + "learning_rate": 8.262253489505398e-07, + "loss": 0.2228, + "step": 72 + }, + { + "epoch": 0.012427647259107933, + "grad_norm": 0.9970806241035461, + "learning_rate": 8.288901331714316e-07, + "loss": 0.2154, + "step": 73 + }, + { + "epoch": 0.012597889002383384, + "grad_norm": 1.1637834310531616, + "learning_rate": 8.315186606983998e-07, + "loss": 0.2596, + "step": 74 + }, + { + "epoch": 0.012768130745658836, + "grad_norm": 1.374963402748108, + "learning_rate": 8.34111904909955e-07, + "loss": 0.2749, + "step": 75 + }, + { + "epoch": 0.012938372488934287, + "grad_norm": 1.0730386972427368, + "learning_rate": 8.366708005050085e-07, + "loss": 0.2227, + "step": 76 + }, + { + "epoch": 0.013108614232209739, + "grad_norm": 1.1448426246643066, + "learning_rate": 8.391962455254819e-07, + "loss": 0.2361, + "step": 77 + }, + { + "epoch": 0.013278855975485188, + "grad_norm": 1.2940086126327515, + "learning_rate": 8.416891032484155e-07, + "loss": 0.3044, + "step": 78 + }, + { + "epoch": 0.01344909771876064, + "grad_norm": 1.3453335762023926, + "learning_rate": 8.441502039575513e-07, + "loss": 0.3157, + "step": 79 + }, + { + "epoch": 0.013619339462036091, + "grad_norm": 1.1563291549682617, + "learning_rate": 8.465803466034837e-07, + "loss": 0.2501, + "step": 80 + }, + { + "epoch": 0.013789581205311542, + "grad_norm": 1.1329809427261353, + "learning_rate": 8.489803003606909e-07, + "loss": 0.2429, + "step": 81 + }, + { + "epoch": 0.013959822948586994, + "grad_norm": 1.0661827325820923, + "learning_rate": 8.513508060890314e-07, + "loss": 0.2232, + "step": 82 + }, + { + "epoch": 0.014130064691862445, + "grad_norm": 1.1219416856765747, + "learning_rate": 8.536925777066614e-07, + "loss": 0.2142, + "step": 83 + }, + { + "epoch": 0.014300306435137897, + "grad_norm": 1.2325749397277832, + "learning_rate": 8.560063034807341e-07, + "loss": 0.2122, + "step": 84 + }, + { + "epoch": 0.014470548178413346, + "grad_norm": 1.0120998620986938, + "learning_rate": 8.582926472417208e-07, + "loss": 0.2147, + "step": 85 + }, + { + "epoch": 0.014640789921688798, + "grad_norm": 1.0740280151367188, + "learning_rate": 8.605522495267141e-07, + "loss": 0.2271, + "step": 86 + }, + { + "epoch": 0.01481103166496425, + "grad_norm": 1.309480905532837, + "learning_rate": 8.627857286566401e-07, + "loss": 0.2563, + "step": 87 + }, + { + "epoch": 0.0149812734082397, + "grad_norm": 1.108237385749817, + "learning_rate": 8.649936817519112e-07, + "loss": 0.2231, + "step": 88 + }, + { + "epoch": 0.015151515151515152, + "grad_norm": 1.1064398288726807, + "learning_rate": 8.671766856906931e-07, + "loss": 0.2516, + "step": 89 + }, + { + "epoch": 0.015321756894790603, + "grad_norm": 1.2427948713302612, + "learning_rate": 8.693352980136347e-07, + "loss": 0.2604, + "step": 90 + }, + { + "epoch": 0.015491998638066053, + "grad_norm": 1.1325477361679077, + "learning_rate": 8.714700577786097e-07, + "loss": 0.2314, + "step": 91 + }, + { + "epoch": 0.015662240381341504, + "grad_norm": 0.9738770127296448, + "learning_rate": 8.735814863687475e-07, + "loss": 0.2126, + "step": 92 + }, + { + "epoch": 0.015832482124616958, + "grad_norm": 1.1820428371429443, + "learning_rate": 8.756700882567846e-07, + "loss": 0.2116, + "step": 93 + }, + { + "epoch": 0.016002723867892407, + "grad_norm": 1.0578908920288086, + "learning_rate": 8.777363517285388e-07, + "loss": 0.2284, + "step": 94 + }, + { + "epoch": 0.016172965611167857, + "grad_norm": 1.2253124713897705, + "learning_rate": 8.797807495681034e-07, + "loss": 0.2492, + "step": 95 + }, + { + "epoch": 0.01634320735444331, + "grad_norm": 1.4362291097640991, + "learning_rate": 8.818037397071634e-07, + "loss": 0.2468, + "step": 96 + }, + { + "epoch": 0.01651344909771876, + "grad_norm": 1.2666908502578735, + "learning_rate": 8.838057658406682e-07, + "loss": 0.2349, + "step": 97 + }, + { + "epoch": 0.016683690840994213, + "grad_norm": 1.0576752424240112, + "learning_rate": 8.857872580109284e-07, + "loss": 0.2142, + "step": 98 + }, + { + "epoch": 0.016853932584269662, + "grad_norm": 0.937912106513977, + "learning_rate": 8.877486331620622e-07, + "loss": 0.1829, + "step": 99 + }, + { + "epoch": 0.017024174327545116, + "grad_norm": 1.2614257335662842, + "learning_rate": 8.896902956665787e-07, + "loss": 0.2405, + "step": 100 + }, + { + "epoch": 0.017194416070820565, + "grad_norm": 1.2120734453201294, + "learning_rate": 8.916126378257612e-07, + "loss": 0.2347, + "step": 101 + }, + { + "epoch": 0.017364657814096015, + "grad_norm": 1.108526349067688, + "learning_rate": 8.935160403454004e-07, + "loss": 0.1874, + "step": 102 + }, + { + "epoch": 0.017534899557371468, + "grad_norm": 1.0109800100326538, + "learning_rate": 8.954008727883201e-07, + "loss": 0.2053, + "step": 103 + }, + { + "epoch": 0.017705141300646918, + "grad_norm": 1.0737659931182861, + "learning_rate": 8.972674940050391e-07, + "loss": 0.2015, + "step": 104 + }, + { + "epoch": 0.01787538304392237, + "grad_norm": 1.0084854364395142, + "learning_rate": 8.991162525438289e-07, + "loss": 0.178, + "step": 105 + }, + { + "epoch": 0.01804562478719782, + "grad_norm": 1.0313851833343506, + "learning_rate": 9.009474870413346e-07, + "loss": 0.1977, + "step": 106 + }, + { + "epoch": 0.018215866530473274, + "grad_norm": 1.1986593008041382, + "learning_rate": 9.02761526594856e-07, + "loss": 0.2481, + "step": 107 + }, + { + "epoch": 0.018386108273748723, + "grad_norm": 1.0787945985794067, + "learning_rate": 9.045586911173146e-07, + "loss": 0.2128, + "step": 108 + }, + { + "epoch": 0.018556350017024173, + "grad_norm": 1.36089289188385, + "learning_rate": 9.063392916758576e-07, + "loss": 0.2232, + "step": 109 + }, + { + "epoch": 0.018726591760299626, + "grad_norm": 1.1465134620666504, + "learning_rate": 9.081036308150061e-07, + "loss": 0.2174, + "step": 110 + }, + { + "epoch": 0.018896833503575076, + "grad_norm": 1.1831120252609253, + "learning_rate": 9.098520028651742e-07, + "loss": 0.2266, + "step": 111 + }, + { + "epoch": 0.01906707524685053, + "grad_norm": 1.2039859294891357, + "learning_rate": 9.115846942373576e-07, + "loss": 0.1919, + "step": 112 + }, + { + "epoch": 0.01923731699012598, + "grad_norm": 1.119031548500061, + "learning_rate": 9.133019837047214e-07, + "loss": 0.2067, + "step": 113 + }, + { + "epoch": 0.01940755873340143, + "grad_norm": 1.1545226573944092, + "learning_rate": 9.150041426717831e-07, + "loss": 0.2066, + "step": 114 + }, + { + "epoch": 0.01957780047667688, + "grad_norm": 1.141434669494629, + "learning_rate": 9.166914354318424e-07, + "loss": 0.2301, + "step": 115 + }, + { + "epoch": 0.01974804221995233, + "grad_norm": 0.9759832620620728, + "learning_rate": 9.183641194132636e-07, + "loss": 0.1774, + "step": 116 + }, + { + "epoch": 0.019918283963227784, + "grad_norm": 1.1361424922943115, + "learning_rate": 9.200224454151901e-07, + "loss": 0.2169, + "step": 117 + }, + { + "epoch": 0.020088525706503234, + "grad_norm": 1.0914556980133057, + "learning_rate": 9.216666578332256e-07, + "loss": 0.188, + "step": 118 + }, + { + "epoch": 0.020258767449778687, + "grad_norm": 1.4587888717651367, + "learning_rate": 9.232969948755948e-07, + "loss": 0.2582, + "step": 119 + }, + { + "epoch": 0.020429009193054137, + "grad_norm": 1.1200069189071655, + "learning_rate": 9.249136887702583e-07, + "loss": 0.2058, + "step": 120 + }, + { + "epoch": 0.020599250936329586, + "grad_norm": 1.232216715812683, + "learning_rate": 9.265169659634334e-07, + "loss": 0.2383, + "step": 121 + }, + { + "epoch": 0.02076949267960504, + "grad_norm": 1.1444164514541626, + "learning_rate": 9.281070473099448e-07, + "loss": 0.2153, + "step": 122 + }, + { + "epoch": 0.02093973442288049, + "grad_norm": 1.0245447158813477, + "learning_rate": 9.296841482558059e-07, + "loss": 0.1701, + "step": 123 + }, + { + "epoch": 0.021109976166155942, + "grad_norm": 1.1672013998031616, + "learning_rate": 9.312484790134081e-07, + "loss": 0.2171, + "step": 124 + }, + { + "epoch": 0.021280217909431392, + "grad_norm": 1.001262903213501, + "learning_rate": 9.328002447296736e-07, + "loss": 0.1813, + "step": 125 + }, + { + "epoch": 0.021450459652706845, + "grad_norm": 1.345151662826538, + "learning_rate": 9.343396456475086e-07, + "loss": 0.2214, + "step": 126 + }, + { + "epoch": 0.021620701395982295, + "grad_norm": 1.1059603691101074, + "learning_rate": 9.358668772608768e-07, + "loss": 0.2258, + "step": 127 + }, + { + "epoch": 0.021790943139257744, + "grad_norm": 1.2178932428359985, + "learning_rate": 9.37382130463787e-07, + "loss": 0.1938, + "step": 128 + }, + { + "epoch": 0.021961184882533197, + "grad_norm": 1.2222176790237427, + "learning_rate": 9.388855916934886e-07, + "loss": 0.2357, + "step": 129 + }, + { + "epoch": 0.022131426625808647, + "grad_norm": 1.1123160123825073, + "learning_rate": 9.403774430681339e-07, + "loss": 0.1898, + "step": 130 + }, + { + "epoch": 0.0223016683690841, + "grad_norm": 1.3277643918991089, + "learning_rate": 9.418578625191684e-07, + "loss": 0.2397, + "step": 131 + }, + { + "epoch": 0.02247191011235955, + "grad_norm": 1.0972932577133179, + "learning_rate": 9.433270239186857e-07, + "loss": 0.187, + "step": 132 + }, + { + "epoch": 0.022642151855635003, + "grad_norm": 1.1459786891937256, + "learning_rate": 9.447850972019773e-07, + "loss": 0.1931, + "step": 133 + }, + { + "epoch": 0.022812393598910453, + "grad_norm": 1.1958963871002197, + "learning_rate": 9.462322484854918e-07, + "loss": 0.2066, + "step": 134 + }, + { + "epoch": 0.022982635342185902, + "grad_norm": 1.410596251487732, + "learning_rate": 9.476686401804093e-07, + "loss": 0.1863, + "step": 135 + }, + { + "epoch": 0.023152877085461356, + "grad_norm": 1.1559172868728638, + "learning_rate": 9.490944311020241e-07, + "loss": 0.1947, + "step": 136 + }, + { + "epoch": 0.023323118828736805, + "grad_norm": 1.2141169309616089, + "learning_rate": 9.505097765751215e-07, + "loss": 0.2312, + "step": 137 + }, + { + "epoch": 0.02349336057201226, + "grad_norm": 1.081640362739563, + "learning_rate": 9.519148285355222e-07, + "loss": 0.204, + "step": 138 + }, + { + "epoch": 0.023663602315287708, + "grad_norm": 1.086338996887207, + "learning_rate": 9.533097356279598e-07, + "loss": 0.1895, + "step": 139 + }, + { + "epoch": 0.02383384405856316, + "grad_norm": 1.0459181070327759, + "learning_rate": 9.546946433004524e-07, + "loss": 0.1944, + "step": 140 + }, + { + "epoch": 0.02400408580183861, + "grad_norm": 1.192455530166626, + "learning_rate": 9.560696938953133e-07, + "loss": 0.1916, + "step": 141 + }, + { + "epoch": 0.02417432754511406, + "grad_norm": 1.0824813842773438, + "learning_rate": 9.574350267369461e-07, + "loss": 0.1585, + "step": 142 + }, + { + "epoch": 0.024344569288389514, + "grad_norm": 1.211179494857788, + "learning_rate": 9.587907782165614e-07, + "loss": 0.1878, + "step": 143 + }, + { + "epoch": 0.024514811031664963, + "grad_norm": 1.272672176361084, + "learning_rate": 9.60137081873938e-07, + "loss": 0.1899, + "step": 144 + }, + { + "epoch": 0.024685052774940416, + "grad_norm": 1.2296862602233887, + "learning_rate": 9.614740684763584e-07, + "loss": 0.1851, + "step": 145 + }, + { + "epoch": 0.024855294518215866, + "grad_norm": 1.2672345638275146, + "learning_rate": 9.628018660948297e-07, + "loss": 0.2202, + "step": 146 + }, + { + "epoch": 0.02502553626149132, + "grad_norm": 1.336018443107605, + "learning_rate": 9.641206001777028e-07, + "loss": 0.2084, + "step": 147 + }, + { + "epoch": 0.02519577800476677, + "grad_norm": 1.0895048379898071, + "learning_rate": 9.654303936217977e-07, + "loss": 0.1939, + "step": 148 + }, + { + "epoch": 0.02536601974804222, + "grad_norm": 1.124503493309021, + "learning_rate": 9.667313668411324e-07, + "loss": 0.186, + "step": 149 + }, + { + "epoch": 0.02553626149131767, + "grad_norm": 1.1752759218215942, + "learning_rate": 9.680236378333531e-07, + "loss": 0.1991, + "step": 150 + }, + { + "epoch": 0.02570650323459312, + "grad_norm": 1.1944782733917236, + "learning_rate": 9.693073222439593e-07, + "loss": 0.2166, + "step": 151 + }, + { + "epoch": 0.025876744977868574, + "grad_norm": 1.1967934370040894, + "learning_rate": 9.705825334284067e-07, + "loss": 0.2066, + "step": 152 + }, + { + "epoch": 0.026046986721144024, + "grad_norm": 1.2898709774017334, + "learning_rate": 9.71849382512175e-07, + "loss": 0.2167, + "step": 153 + }, + { + "epoch": 0.026217228464419477, + "grad_norm": 1.0959877967834473, + "learning_rate": 9.7310797844888e-07, + "loss": 0.1884, + "step": 154 + }, + { + "epoch": 0.026387470207694927, + "grad_norm": 1.192084789276123, + "learning_rate": 9.743584280765029e-07, + "loss": 0.2111, + "step": 155 + }, + { + "epoch": 0.026557711950970377, + "grad_norm": 1.4502787590026855, + "learning_rate": 9.756008361718137e-07, + "loss": 0.2387, + "step": 156 + }, + { + "epoch": 0.02672795369424583, + "grad_norm": 1.3559253215789795, + "learning_rate": 9.76835305503054e-07, + "loss": 0.2201, + "step": 157 + }, + { + "epoch": 0.02689819543752128, + "grad_norm": 1.1862843036651611, + "learning_rate": 9.780619368809492e-07, + "loss": 0.1881, + "step": 158 + }, + { + "epoch": 0.027068437180796732, + "grad_norm": 1.2608420848846436, + "learning_rate": 9.792808292081091e-07, + "loss": 0.2009, + "step": 159 + }, + { + "epoch": 0.027238678924072182, + "grad_norm": 1.3061561584472656, + "learning_rate": 9.804920795268817e-07, + "loss": 0.2324, + "step": 160 + }, + { + "epoch": 0.027408920667347635, + "grad_norm": 1.3358960151672363, + "learning_rate": 9.816957830657163e-07, + "loss": 0.2116, + "step": 161 + }, + { + "epoch": 0.027579162410623085, + "grad_norm": 1.0722428560256958, + "learning_rate": 9.828920332840889e-07, + "loss": 0.1641, + "step": 162 + }, + { + "epoch": 0.027749404153898535, + "grad_norm": 1.2872867584228516, + "learning_rate": 9.840809219160487e-07, + "loss": 0.1849, + "step": 163 + }, + { + "epoch": 0.027919645897173988, + "grad_norm": 1.3819077014923096, + "learning_rate": 9.852625390124294e-07, + "loss": 0.1933, + "step": 164 + }, + { + "epoch": 0.028089887640449437, + "grad_norm": 1.278441309928894, + "learning_rate": 9.864369729817805e-07, + "loss": 0.1835, + "step": 165 + }, + { + "epoch": 0.02826012938372489, + "grad_norm": 1.1808550357818604, + "learning_rate": 9.876043106300596e-07, + "loss": 0.166, + "step": 166 + }, + { + "epoch": 0.02843037112700034, + "grad_norm": 1.233665108680725, + "learning_rate": 9.887646371991337e-07, + "loss": 0.1918, + "step": 167 + }, + { + "epoch": 0.028600612870275793, + "grad_norm": 1.1117044687271118, + "learning_rate": 9.899180364041324e-07, + "loss": 0.1699, + "step": 168 + }, + { + "epoch": 0.028770854613551243, + "grad_norm": 1.112967848777771, + "learning_rate": 9.910645904696893e-07, + "loss": 0.1629, + "step": 169 + }, + { + "epoch": 0.028941096356826693, + "grad_norm": 1.1570054292678833, + "learning_rate": 9.92204380165119e-07, + "loss": 0.1729, + "step": 170 + }, + { + "epoch": 0.029111338100102146, + "grad_norm": 1.1414086818695068, + "learning_rate": 9.933374848385576e-07, + "loss": 0.1659, + "step": 171 + }, + { + "epoch": 0.029281579843377595, + "grad_norm": 1.2004859447479248, + "learning_rate": 9.944639824501122e-07, + "loss": 0.1835, + "step": 172 + }, + { + "epoch": 0.02945182158665305, + "grad_norm": 1.2365108728408813, + "learning_rate": 9.95583949604046e-07, + "loss": 0.2133, + "step": 173 + }, + { + "epoch": 0.0296220633299285, + "grad_norm": 1.2216784954071045, + "learning_rate": 9.966974615800383e-07, + "loss": 0.1848, + "step": 174 + }, + { + "epoch": 0.029792305073203948, + "grad_norm": 1.2305302619934082, + "learning_rate": 9.978045923635475e-07, + "loss": 0.2026, + "step": 175 + }, + { + "epoch": 0.0299625468164794, + "grad_norm": 1.3286123275756836, + "learning_rate": 9.989054146753091e-07, + "loss": 0.1817, + "step": 176 + }, + { + "epoch": 0.03013278855975485, + "grad_norm": 1.3320600986480713, + "learning_rate": 1e-06, + "loss": 0.2147, + "step": 177 + }, + { + "epoch": 0.030303030303030304, + "grad_norm": 1.2844750881195068, + "learning_rate": 1e-06, + "loss": 0.1893, + "step": 178 + }, + { + "epoch": 0.030473272046305754, + "grad_norm": 1.1248033046722412, + "learning_rate": 1e-06, + "loss": 0.1617, + "step": 179 + }, + { + "epoch": 0.030643513789581207, + "grad_norm": 1.3026221990585327, + "learning_rate": 1e-06, + "loss": 0.1815, + "step": 180 + }, + { + "epoch": 0.030813755532856656, + "grad_norm": 1.2536060810089111, + "learning_rate": 1e-06, + "loss": 0.1852, + "step": 181 + }, + { + "epoch": 0.030983997276132106, + "grad_norm": 1.1810585260391235, + "learning_rate": 1e-06, + "loss": 0.1712, + "step": 182 + }, + { + "epoch": 0.03115423901940756, + "grad_norm": 1.4999784231185913, + "learning_rate": 1e-06, + "loss": 0.2461, + "step": 183 + }, + { + "epoch": 0.03132448076268301, + "grad_norm": 1.1074563264846802, + "learning_rate": 1e-06, + "loss": 0.1796, + "step": 184 + }, + { + "epoch": 0.03149472250595846, + "grad_norm": 1.0583021640777588, + "learning_rate": 1e-06, + "loss": 0.1603, + "step": 185 + }, + { + "epoch": 0.031664964249233915, + "grad_norm": 1.1220263242721558, + "learning_rate": 1e-06, + "loss": 0.1732, + "step": 186 + }, + { + "epoch": 0.031835205992509365, + "grad_norm": 1.1038565635681152, + "learning_rate": 1e-06, + "loss": 0.1766, + "step": 187 + }, + { + "epoch": 0.032005447735784814, + "grad_norm": 1.2731438875198364, + "learning_rate": 1e-06, + "loss": 0.1993, + "step": 188 + }, + { + "epoch": 0.032175689479060264, + "grad_norm": 1.569389820098877, + "learning_rate": 1e-06, + "loss": 0.1862, + "step": 189 + }, + { + "epoch": 0.032345931222335714, + "grad_norm": 1.1800955533981323, + "learning_rate": 1e-06, + "loss": 0.1411, + "step": 190 + }, + { + "epoch": 0.03251617296561117, + "grad_norm": 1.2948437929153442, + "learning_rate": 1e-06, + "loss": 0.186, + "step": 191 + }, + { + "epoch": 0.03268641470888662, + "grad_norm": 1.4020929336547852, + "learning_rate": 1e-06, + "loss": 0.1837, + "step": 192 + }, + { + "epoch": 0.03285665645216207, + "grad_norm": 1.25234055519104, + "learning_rate": 1e-06, + "loss": 0.1645, + "step": 193 + }, + { + "epoch": 0.03302689819543752, + "grad_norm": 1.349990725517273, + "learning_rate": 1e-06, + "loss": 0.2217, + "step": 194 + }, + { + "epoch": 0.03319713993871297, + "grad_norm": 1.2276535034179688, + "learning_rate": 1e-06, + "loss": 0.1825, + "step": 195 + }, + { + "epoch": 0.033367381681988426, + "grad_norm": 1.2954577207565308, + "learning_rate": 1e-06, + "loss": 0.1811, + "step": 196 + }, + { + "epoch": 0.033537623425263875, + "grad_norm": 1.2191767692565918, + "learning_rate": 1e-06, + "loss": 0.195, + "step": 197 + }, + { + "epoch": 0.033707865168539325, + "grad_norm": 1.1188750267028809, + "learning_rate": 1e-06, + "loss": 0.1505, + "step": 198 + }, + { + "epoch": 0.033878106911814775, + "grad_norm": 1.246447205543518, + "learning_rate": 1e-06, + "loss": 0.1615, + "step": 199 + }, + { + "epoch": 0.03404834865509023, + "grad_norm": 1.1524237394332886, + "learning_rate": 1e-06, + "loss": 0.1858, + "step": 200 + }, + { + "epoch": 0.03421859039836568, + "grad_norm": 1.1863471269607544, + "learning_rate": 1e-06, + "loss": 0.142, + "step": 201 + }, + { + "epoch": 0.03438883214164113, + "grad_norm": 1.2720561027526855, + "learning_rate": 1e-06, + "loss": 0.1689, + "step": 202 + }, + { + "epoch": 0.03455907388491658, + "grad_norm": 1.3305280208587646, + "learning_rate": 1e-06, + "loss": 0.1679, + "step": 203 + }, + { + "epoch": 0.03472931562819203, + "grad_norm": 1.2643027305603027, + "learning_rate": 1e-06, + "loss": 0.1906, + "step": 204 + }, + { + "epoch": 0.034899557371467486, + "grad_norm": 1.2647238969802856, + "learning_rate": 1e-06, + "loss": 0.192, + "step": 205 + }, + { + "epoch": 0.035069799114742936, + "grad_norm": 1.4146193265914917, + "learning_rate": 1e-06, + "loss": 0.1747, + "step": 206 + }, + { + "epoch": 0.035240040858018386, + "grad_norm": 1.366545557975769, + "learning_rate": 1e-06, + "loss": 0.182, + "step": 207 + }, + { + "epoch": 0.035410282601293835, + "grad_norm": 1.150009036064148, + "learning_rate": 1e-06, + "loss": 0.1536, + "step": 208 + }, + { + "epoch": 0.035580524344569285, + "grad_norm": 1.2122458219528198, + "learning_rate": 1e-06, + "loss": 0.1717, + "step": 209 + }, + { + "epoch": 0.03575076608784474, + "grad_norm": 1.1708459854125977, + "learning_rate": 1e-06, + "loss": 0.1685, + "step": 210 + }, + { + "epoch": 0.03592100783112019, + "grad_norm": 1.1359509229660034, + "learning_rate": 1e-06, + "loss": 0.1628, + "step": 211 + }, + { + "epoch": 0.03609124957439564, + "grad_norm": 2.6132564544677734, + "learning_rate": 1e-06, + "loss": 0.2673, + "step": 212 + }, + { + "epoch": 0.03626149131767109, + "grad_norm": 1.2932703495025635, + "learning_rate": 1e-06, + "loss": 0.1568, + "step": 213 + }, + { + "epoch": 0.03643173306094655, + "grad_norm": 1.6617991924285889, + "learning_rate": 1e-06, + "loss": 0.1944, + "step": 214 + }, + { + "epoch": 0.036601974804222, + "grad_norm": 2.0360565185546875, + "learning_rate": 1e-06, + "loss": 0.2282, + "step": 215 + }, + { + "epoch": 0.03677221654749745, + "grad_norm": 1.231723666191101, + "learning_rate": 1e-06, + "loss": 0.1452, + "step": 216 + }, + { + "epoch": 0.036942458290772896, + "grad_norm": 1.3259527683258057, + "learning_rate": 1e-06, + "loss": 0.167, + "step": 217 + }, + { + "epoch": 0.037112700034048346, + "grad_norm": 1.054499626159668, + "learning_rate": 1e-06, + "loss": 0.1342, + "step": 218 + }, + { + "epoch": 0.0372829417773238, + "grad_norm": 1.4665144681930542, + "learning_rate": 1e-06, + "loss": 0.1978, + "step": 219 + }, + { + "epoch": 0.03745318352059925, + "grad_norm": 1.3940861225128174, + "learning_rate": 1e-06, + "loss": 0.1971, + "step": 220 + }, + { + "epoch": 0.0376234252638747, + "grad_norm": 1.164638638496399, + "learning_rate": 1e-06, + "loss": 0.1727, + "step": 221 + }, + { + "epoch": 0.03779366700715015, + "grad_norm": 1.1719894409179688, + "learning_rate": 1e-06, + "loss": 0.1426, + "step": 222 + }, + { + "epoch": 0.0379639087504256, + "grad_norm": 1.1395282745361328, + "learning_rate": 1e-06, + "loss": 0.1544, + "step": 223 + }, + { + "epoch": 0.03813415049370106, + "grad_norm": 1.2896770238876343, + "learning_rate": 1e-06, + "loss": 0.1709, + "step": 224 + }, + { + "epoch": 0.03830439223697651, + "grad_norm": 1.1964484453201294, + "learning_rate": 1e-06, + "loss": 0.1723, + "step": 225 + }, + { + "epoch": 0.03847463398025196, + "grad_norm": 1.2355952262878418, + "learning_rate": 1e-06, + "loss": 0.1388, + "step": 226 + }, + { + "epoch": 0.03864487572352741, + "grad_norm": 1.3132867813110352, + "learning_rate": 1e-06, + "loss": 0.174, + "step": 227 + }, + { + "epoch": 0.03881511746680286, + "grad_norm": 1.175132393836975, + "learning_rate": 1e-06, + "loss": 0.1533, + "step": 228 + }, + { + "epoch": 0.03898535921007831, + "grad_norm": 1.4419723749160767, + "learning_rate": 1e-06, + "loss": 0.1839, + "step": 229 + }, + { + "epoch": 0.03915560095335376, + "grad_norm": 1.1911609172821045, + "learning_rate": 1e-06, + "loss": 0.1473, + "step": 230 + }, + { + "epoch": 0.03932584269662921, + "grad_norm": 1.047421932220459, + "learning_rate": 1e-06, + "loss": 0.1437, + "step": 231 + }, + { + "epoch": 0.03949608443990466, + "grad_norm": 1.2604156732559204, + "learning_rate": 1e-06, + "loss": 0.1651, + "step": 232 + }, + { + "epoch": 0.03966632618318012, + "grad_norm": 1.2047826051712036, + "learning_rate": 1e-06, + "loss": 0.1587, + "step": 233 + }, + { + "epoch": 0.03983656792645557, + "grad_norm": 1.0610707998275757, + "learning_rate": 1e-06, + "loss": 0.1317, + "step": 234 + }, + { + "epoch": 0.04000680966973102, + "grad_norm": 1.1626613140106201, + "learning_rate": 1e-06, + "loss": 0.1543, + "step": 235 + }, + { + "epoch": 0.04017705141300647, + "grad_norm": 1.0833077430725098, + "learning_rate": 1e-06, + "loss": 0.1316, + "step": 236 + }, + { + "epoch": 0.04034729315628192, + "grad_norm": 1.4806079864501953, + "learning_rate": 1e-06, + "loss": 0.1699, + "step": 237 + }, + { + "epoch": 0.040517534899557374, + "grad_norm": 1.360552191734314, + "learning_rate": 1e-06, + "loss": 0.1559, + "step": 238 + }, + { + "epoch": 0.040687776642832824, + "grad_norm": 1.2109061479568481, + "learning_rate": 1e-06, + "loss": 0.1507, + "step": 239 + }, + { + "epoch": 0.04085801838610827, + "grad_norm": 1.1271363496780396, + "learning_rate": 1e-06, + "loss": 0.1522, + "step": 240 + }, + { + "epoch": 0.04102826012938372, + "grad_norm": 1.3026715517044067, + "learning_rate": 1e-06, + "loss": 0.1586, + "step": 241 + }, + { + "epoch": 0.04119850187265917, + "grad_norm": 1.589646816253662, + "learning_rate": 1e-06, + "loss": 0.2006, + "step": 242 + }, + { + "epoch": 0.04136874361593463, + "grad_norm": 1.1894760131835938, + "learning_rate": 1e-06, + "loss": 0.1547, + "step": 243 + }, + { + "epoch": 0.04153898535921008, + "grad_norm": 1.1556522846221924, + "learning_rate": 1e-06, + "loss": 0.1509, + "step": 244 + }, + { + "epoch": 0.04170922710248553, + "grad_norm": 2.4897043704986572, + "learning_rate": 1e-06, + "loss": 0.2587, + "step": 245 + }, + { + "epoch": 0.04187946884576098, + "grad_norm": 1.3815762996673584, + "learning_rate": 1e-06, + "loss": 0.1604, + "step": 246 + }, + { + "epoch": 0.042049710589036435, + "grad_norm": 1.4068046808242798, + "learning_rate": 1e-06, + "loss": 0.1628, + "step": 247 + }, + { + "epoch": 0.042219952332311884, + "grad_norm": 1.4070250988006592, + "learning_rate": 1e-06, + "loss": 0.1631, + "step": 248 + }, + { + "epoch": 0.042390194075587334, + "grad_norm": 1.1561863422393799, + "learning_rate": 1e-06, + "loss": 0.1488, + "step": 249 + }, + { + "epoch": 0.042560435818862784, + "grad_norm": 1.0689971446990967, + "learning_rate": 1e-06, + "loss": 0.1318, + "step": 250 + }, + { + "epoch": 0.04273067756213823, + "grad_norm": 1.1282958984375, + "learning_rate": 1e-06, + "loss": 0.1243, + "step": 251 + }, + { + "epoch": 0.04290091930541369, + "grad_norm": 1.2809712886810303, + "learning_rate": 1e-06, + "loss": 0.1609, + "step": 252 + }, + { + "epoch": 0.04307116104868914, + "grad_norm": 1.1898773908615112, + "learning_rate": 1e-06, + "loss": 0.1653, + "step": 253 + }, + { + "epoch": 0.04324140279196459, + "grad_norm": 1.215122938156128, + "learning_rate": 1e-06, + "loss": 0.1459, + "step": 254 + }, + { + "epoch": 0.04341164453524004, + "grad_norm": 1.4629710912704468, + "learning_rate": 1e-06, + "loss": 0.1945, + "step": 255 + }, + { + "epoch": 0.04358188627851549, + "grad_norm": 1.206394910812378, + "learning_rate": 1e-06, + "loss": 0.149, + "step": 256 + }, + { + "epoch": 0.043752128021790945, + "grad_norm": 1.3022671937942505, + "learning_rate": 1e-06, + "loss": 0.1529, + "step": 257 + }, + { + "epoch": 0.043922369765066395, + "grad_norm": 1.2597123384475708, + "learning_rate": 1e-06, + "loss": 0.1411, + "step": 258 + }, + { + "epoch": 0.044092611508341845, + "grad_norm": 1.3716976642608643, + "learning_rate": 1e-06, + "loss": 0.1552, + "step": 259 + }, + { + "epoch": 0.044262853251617294, + "grad_norm": 1.2082346677780151, + "learning_rate": 1e-06, + "loss": 0.1583, + "step": 260 + }, + { + "epoch": 0.04443309499489275, + "grad_norm": 1.1413241624832153, + "learning_rate": 1e-06, + "loss": 0.1197, + "step": 261 + }, + { + "epoch": 0.0446033367381682, + "grad_norm": 1.346548080444336, + "learning_rate": 1e-06, + "loss": 0.175, + "step": 262 + }, + { + "epoch": 0.04477357848144365, + "grad_norm": 1.326627254486084, + "learning_rate": 1e-06, + "loss": 0.168, + "step": 263 + }, + { + "epoch": 0.0449438202247191, + "grad_norm": 1.259684681892395, + "learning_rate": 1e-06, + "loss": 0.1541, + "step": 264 + }, + { + "epoch": 0.04511406196799455, + "grad_norm": 1.3728045225143433, + "learning_rate": 1e-06, + "loss": 0.1653, + "step": 265 + }, + { + "epoch": 0.045284303711270006, + "grad_norm": 1.1758908033370972, + "learning_rate": 1e-06, + "loss": 0.1519, + "step": 266 + }, + { + "epoch": 0.045454545454545456, + "grad_norm": 1.4522403478622437, + "learning_rate": 1e-06, + "loss": 0.1541, + "step": 267 + }, + { + "epoch": 0.045624787197820905, + "grad_norm": 1.3297243118286133, + "learning_rate": 1e-06, + "loss": 0.1647, + "step": 268 + }, + { + "epoch": 0.045795028941096355, + "grad_norm": 1.1865285634994507, + "learning_rate": 1e-06, + "loss": 0.1415, + "step": 269 + }, + { + "epoch": 0.045965270684371805, + "grad_norm": 1.2091939449310303, + "learning_rate": 1e-06, + "loss": 0.1386, + "step": 270 + }, + { + "epoch": 0.04613551242764726, + "grad_norm": 1.2446978092193604, + "learning_rate": 1e-06, + "loss": 0.1285, + "step": 271 + }, + { + "epoch": 0.04630575417092271, + "grad_norm": 1.1907007694244385, + "learning_rate": 1e-06, + "loss": 0.1607, + "step": 272 + }, + { + "epoch": 0.04647599591419816, + "grad_norm": 1.2765071392059326, + "learning_rate": 1e-06, + "loss": 0.1539, + "step": 273 + }, + { + "epoch": 0.04664623765747361, + "grad_norm": 1.364173173904419, + "learning_rate": 1e-06, + "loss": 0.1434, + "step": 274 + }, + { + "epoch": 0.04681647940074907, + "grad_norm": 1.212433934211731, + "learning_rate": 1e-06, + "loss": 0.1221, + "step": 275 + }, + { + "epoch": 0.04698672114402452, + "grad_norm": 1.203960657119751, + "learning_rate": 1e-06, + "loss": 0.1292, + "step": 276 + }, + { + "epoch": 0.047156962887299966, + "grad_norm": 1.4043571949005127, + "learning_rate": 1e-06, + "loss": 0.1496, + "step": 277 + }, + { + "epoch": 0.047327204630575416, + "grad_norm": 1.4482816457748413, + "learning_rate": 1e-06, + "loss": 0.1747, + "step": 278 + }, + { + "epoch": 0.047497446373850866, + "grad_norm": 1.1347603797912598, + "learning_rate": 1e-06, + "loss": 0.1268, + "step": 279 + }, + { + "epoch": 0.04766768811712632, + "grad_norm": 1.1415354013442993, + "learning_rate": 1e-06, + "loss": 0.1448, + "step": 280 + }, + { + "epoch": 0.04783792986040177, + "grad_norm": 1.4771915674209595, + "learning_rate": 1e-06, + "loss": 0.1596, + "step": 281 + }, + { + "epoch": 0.04800817160367722, + "grad_norm": 1.176045298576355, + "learning_rate": 1e-06, + "loss": 0.1207, + "step": 282 + }, + { + "epoch": 0.04817841334695267, + "grad_norm": 1.3680100440979004, + "learning_rate": 1e-06, + "loss": 0.1413, + "step": 283 + }, + { + "epoch": 0.04834865509022812, + "grad_norm": 1.3888130187988281, + "learning_rate": 1e-06, + "loss": 0.1395, + "step": 284 + }, + { + "epoch": 0.04851889683350358, + "grad_norm": 1.4310017824172974, + "learning_rate": 1e-06, + "loss": 0.1395, + "step": 285 + }, + { + "epoch": 0.04868913857677903, + "grad_norm": 1.4663360118865967, + "learning_rate": 1e-06, + "loss": 0.1674, + "step": 286 + }, + { + "epoch": 0.04885938032005448, + "grad_norm": 1.2074915170669556, + "learning_rate": 1e-06, + "loss": 0.1384, + "step": 287 + }, + { + "epoch": 0.049029622063329927, + "grad_norm": 1.3467768430709839, + "learning_rate": 1e-06, + "loss": 0.1702, + "step": 288 + }, + { + "epoch": 0.049199863806605376, + "grad_norm": 1.192862629890442, + "learning_rate": 1e-06, + "loss": 0.1347, + "step": 289 + }, + { + "epoch": 0.04937010554988083, + "grad_norm": 1.304807424545288, + "learning_rate": 1e-06, + "loss": 0.1459, + "step": 290 + }, + { + "epoch": 0.04954034729315628, + "grad_norm": 1.309431791305542, + "learning_rate": 1e-06, + "loss": 0.1651, + "step": 291 + }, + { + "epoch": 0.04971058903643173, + "grad_norm": 1.534616231918335, + "learning_rate": 1e-06, + "loss": 0.1481, + "step": 292 + }, + { + "epoch": 0.04988083077970718, + "grad_norm": 1.2383191585540771, + "learning_rate": 1e-06, + "loss": 0.1238, + "step": 293 + }, + { + "epoch": 0.05005107252298264, + "grad_norm": 1.1695579290390015, + "learning_rate": 1e-06, + "loss": 0.1144, + "step": 294 + }, + { + "epoch": 0.05022131426625809, + "grad_norm": 1.3590017557144165, + "learning_rate": 1e-06, + "loss": 0.1376, + "step": 295 + }, + { + "epoch": 0.05039155600953354, + "grad_norm": 1.186447024345398, + "learning_rate": 1e-06, + "loss": 0.1336, + "step": 296 + }, + { + "epoch": 0.05056179775280899, + "grad_norm": 1.2510392665863037, + "learning_rate": 1e-06, + "loss": 0.1363, + "step": 297 + }, + { + "epoch": 0.05073203949608444, + "grad_norm": 1.3094967603683472, + "learning_rate": 1e-06, + "loss": 0.1339, + "step": 298 + }, + { + "epoch": 0.050902281239359894, + "grad_norm": 1.4106348752975464, + "learning_rate": 1e-06, + "loss": 0.1527, + "step": 299 + }, + { + "epoch": 0.05107252298263534, + "grad_norm": 1.2612876892089844, + "learning_rate": 1e-06, + "loss": 0.1343, + "step": 300 + }, + { + "epoch": 0.05124276472591079, + "grad_norm": 1.3075928688049316, + "learning_rate": 1e-06, + "loss": 0.1572, + "step": 301 + }, + { + "epoch": 0.05141300646918624, + "grad_norm": 1.3252586126327515, + "learning_rate": 1e-06, + "loss": 0.1491, + "step": 302 + }, + { + "epoch": 0.05158324821246169, + "grad_norm": 1.3578503131866455, + "learning_rate": 1e-06, + "loss": 0.1526, + "step": 303 + }, + { + "epoch": 0.05175348995573715, + "grad_norm": 1.383754849433899, + "learning_rate": 1e-06, + "loss": 0.1547, + "step": 304 + }, + { + "epoch": 0.0519237316990126, + "grad_norm": 1.463248372077942, + "learning_rate": 1e-06, + "loss": 0.171, + "step": 305 + }, + { + "epoch": 0.05209397344228805, + "grad_norm": 1.3403490781784058, + "learning_rate": 1e-06, + "loss": 0.1738, + "step": 306 + }, + { + "epoch": 0.0522642151855635, + "grad_norm": 1.3475086688995361, + "learning_rate": 1e-06, + "loss": 0.1434, + "step": 307 + }, + { + "epoch": 0.052434456928838954, + "grad_norm": 1.1605323553085327, + "learning_rate": 1e-06, + "loss": 0.1362, + "step": 308 + }, + { + "epoch": 0.052604698672114404, + "grad_norm": 1.2388291358947754, + "learning_rate": 1e-06, + "loss": 0.1377, + "step": 309 + }, + { + "epoch": 0.052774940415389854, + "grad_norm": 1.6835265159606934, + "learning_rate": 1e-06, + "loss": 0.1976, + "step": 310 + }, + { + "epoch": 0.052945182158665303, + "grad_norm": 1.2333765029907227, + "learning_rate": 1e-06, + "loss": 0.1284, + "step": 311 + }, + { + "epoch": 0.05311542390194075, + "grad_norm": 1.581054925918579, + "learning_rate": 1e-06, + "loss": 0.1709, + "step": 312 + }, + { + "epoch": 0.05328566564521621, + "grad_norm": 1.165010690689087, + "learning_rate": 1e-06, + "loss": 0.1258, + "step": 313 + }, + { + "epoch": 0.05345590738849166, + "grad_norm": 1.209064245223999, + "learning_rate": 1e-06, + "loss": 0.1223, + "step": 314 + }, + { + "epoch": 0.05362614913176711, + "grad_norm": 1.3305888175964355, + "learning_rate": 1e-06, + "loss": 0.1425, + "step": 315 + }, + { + "epoch": 0.05379639087504256, + "grad_norm": 1.828088402748108, + "learning_rate": 1e-06, + "loss": 0.1634, + "step": 316 + }, + { + "epoch": 0.05396663261831801, + "grad_norm": 1.332383394241333, + "learning_rate": 1e-06, + "loss": 0.1268, + "step": 317 + }, + { + "epoch": 0.054136874361593465, + "grad_norm": 1.339270830154419, + "learning_rate": 1e-06, + "loss": 0.1428, + "step": 318 + }, + { + "epoch": 0.054307116104868915, + "grad_norm": 1.2108736038208008, + "learning_rate": 1e-06, + "loss": 0.1345, + "step": 319 + }, + { + "epoch": 0.054477357848144364, + "grad_norm": 1.2508689165115356, + "learning_rate": 1e-06, + "loss": 0.1251, + "step": 320 + }, + { + "epoch": 0.054647599591419814, + "grad_norm": 1.0894383192062378, + "learning_rate": 1e-06, + "loss": 0.1272, + "step": 321 + }, + { + "epoch": 0.05481784133469527, + "grad_norm": 1.2453547716140747, + "learning_rate": 1e-06, + "loss": 0.1315, + "step": 322 + }, + { + "epoch": 0.05498808307797072, + "grad_norm": 1.3076040744781494, + "learning_rate": 1e-06, + "loss": 0.1178, + "step": 323 + }, + { + "epoch": 0.05515832482124617, + "grad_norm": 2.6135568618774414, + "learning_rate": 1e-06, + "loss": 0.1613, + "step": 324 + }, + { + "epoch": 0.05532856656452162, + "grad_norm": 1.5091561079025269, + "learning_rate": 1e-06, + "loss": 0.1126, + "step": 325 + }, + { + "epoch": 0.05549880830779707, + "grad_norm": 1.7307822704315186, + "learning_rate": 1e-06, + "loss": 0.171, + "step": 326 + }, + { + "epoch": 0.055669050051072526, + "grad_norm": 1.2100858688354492, + "learning_rate": 1e-06, + "loss": 0.1094, + "step": 327 + }, + { + "epoch": 0.055839291794347976, + "grad_norm": 1.2510110139846802, + "learning_rate": 1e-06, + "loss": 0.1162, + "step": 328 + }, + { + "epoch": 0.056009533537623425, + "grad_norm": 1.3461787700653076, + "learning_rate": 1e-06, + "loss": 0.1427, + "step": 329 + }, + { + "epoch": 0.056179775280898875, + "grad_norm": 1.6088210344314575, + "learning_rate": 1e-06, + "loss": 0.1648, + "step": 330 + }, + { + "epoch": 0.056350017024174325, + "grad_norm": 1.4088263511657715, + "learning_rate": 1e-06, + "loss": 0.1334, + "step": 331 + }, + { + "epoch": 0.05652025876744978, + "grad_norm": 1.4224536418914795, + "learning_rate": 1e-06, + "loss": 0.1414, + "step": 332 + }, + { + "epoch": 0.05669050051072523, + "grad_norm": 1.1860085725784302, + "learning_rate": 1e-06, + "loss": 0.132, + "step": 333 + }, + { + "epoch": 0.05686074225400068, + "grad_norm": 1.3678163290023804, + "learning_rate": 1e-06, + "loss": 0.1496, + "step": 334 + }, + { + "epoch": 0.05703098399727613, + "grad_norm": 1.4010056257247925, + "learning_rate": 1e-06, + "loss": 0.1444, + "step": 335 + }, + { + "epoch": 0.05720122574055159, + "grad_norm": 1.2456624507904053, + "learning_rate": 1e-06, + "loss": 0.1222, + "step": 336 + }, + { + "epoch": 0.057371467483827036, + "grad_norm": 1.5460395812988281, + "learning_rate": 1e-06, + "loss": 0.1385, + "step": 337 + }, + { + "epoch": 0.057541709227102486, + "grad_norm": 1.2900766134262085, + "learning_rate": 1e-06, + "loss": 0.1096, + "step": 338 + }, + { + "epoch": 0.057711950970377936, + "grad_norm": 1.5039972066879272, + "learning_rate": 1e-06, + "loss": 0.1393, + "step": 339 + }, + { + "epoch": 0.057882192713653385, + "grad_norm": 1.2665271759033203, + "learning_rate": 1e-06, + "loss": 0.1264, + "step": 340 + }, + { + "epoch": 0.05805243445692884, + "grad_norm": 1.4474594593048096, + "learning_rate": 1e-06, + "loss": 0.1308, + "step": 341 + }, + { + "epoch": 0.05822267620020429, + "grad_norm": 1.3482589721679688, + "learning_rate": 1e-06, + "loss": 0.1473, + "step": 342 + }, + { + "epoch": 0.05839291794347974, + "grad_norm": 1.3561499118804932, + "learning_rate": 1e-06, + "loss": 0.1227, + "step": 343 + }, + { + "epoch": 0.05856315968675519, + "grad_norm": 1.194254755973816, + "learning_rate": 1e-06, + "loss": 0.116, + "step": 344 + }, + { + "epoch": 0.05873340143003064, + "grad_norm": 1.2828115224838257, + "learning_rate": 1e-06, + "loss": 0.1218, + "step": 345 + }, + { + "epoch": 0.0589036431733061, + "grad_norm": 2.596560001373291, + "learning_rate": 1e-06, + "loss": 0.2237, + "step": 346 + }, + { + "epoch": 0.05907388491658155, + "grad_norm": 1.3315943479537964, + "learning_rate": 1e-06, + "loss": 0.1271, + "step": 347 + }, + { + "epoch": 0.059244126659857, + "grad_norm": 1.2427936792373657, + "learning_rate": 1e-06, + "loss": 0.1181, + "step": 348 + }, + { + "epoch": 0.059414368403132446, + "grad_norm": 1.223074197769165, + "learning_rate": 1e-06, + "loss": 0.1073, + "step": 349 + }, + { + "epoch": 0.059584610146407896, + "grad_norm": 1.4834375381469727, + "learning_rate": 1e-06, + "loss": 0.1435, + "step": 350 + }, + { + "epoch": 0.05975485188968335, + "grad_norm": 1.4834375381469727, + "learning_rate": 1e-06, + "loss": 0.1392, + "step": 351 + }, + { + "epoch": 0.0599250936329588, + "grad_norm": 1.2152388095855713, + "learning_rate": 1e-06, + "loss": 0.1158, + "step": 352 + }, + { + "epoch": 0.06009533537623425, + "grad_norm": 1.3129734992980957, + "learning_rate": 1e-06, + "loss": 0.1272, + "step": 353 + }, + { + "epoch": 0.0602655771195097, + "grad_norm": 1.3754154443740845, + "learning_rate": 1e-06, + "loss": 0.1416, + "step": 354 + }, + { + "epoch": 0.06043581886278516, + "grad_norm": 1.2157810926437378, + "learning_rate": 1e-06, + "loss": 0.1207, + "step": 355 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 1.3933988809585571, + "learning_rate": 1e-06, + "loss": 0.1504, + "step": 356 + }, + { + "epoch": 0.06077630234933606, + "grad_norm": 1.3696870803833008, + "learning_rate": 1e-06, + "loss": 0.1303, + "step": 357 + }, + { + "epoch": 0.06094654409261151, + "grad_norm": 1.324859380722046, + "learning_rate": 1e-06, + "loss": 0.1344, + "step": 358 + }, + { + "epoch": 0.06111678583588696, + "grad_norm": 1.1997361183166504, + "learning_rate": 1e-06, + "loss": 0.1165, + "step": 359 + }, + { + "epoch": 0.06128702757916241, + "grad_norm": 1.3134782314300537, + "learning_rate": 1e-06, + "loss": 0.1269, + "step": 360 + }, + { + "epoch": 0.06145726932243786, + "grad_norm": 2.7401535511016846, + "learning_rate": 1e-06, + "loss": 0.253, + "step": 361 + }, + { + "epoch": 0.06162751106571331, + "grad_norm": 1.3712646961212158, + "learning_rate": 1e-06, + "loss": 0.138, + "step": 362 + }, + { + "epoch": 0.06179775280898876, + "grad_norm": 1.2538585662841797, + "learning_rate": 1e-06, + "loss": 0.1225, + "step": 363 + }, + { + "epoch": 0.06196799455226421, + "grad_norm": 1.2842880487442017, + "learning_rate": 1e-06, + "loss": 0.1301, + "step": 364 + }, + { + "epoch": 0.06213823629553967, + "grad_norm": 1.1870968341827393, + "learning_rate": 1e-06, + "loss": 0.1116, + "step": 365 + }, + { + "epoch": 0.06230847803881512, + "grad_norm": 1.3063998222351074, + "learning_rate": 1e-06, + "loss": 0.124, + "step": 366 + }, + { + "epoch": 0.06247871978209057, + "grad_norm": 1.3379669189453125, + "learning_rate": 1e-06, + "loss": 0.1396, + "step": 367 + }, + { + "epoch": 0.06264896152536602, + "grad_norm": 1.3882386684417725, + "learning_rate": 1e-06, + "loss": 0.1274, + "step": 368 + }, + { + "epoch": 0.06281920326864147, + "grad_norm": 1.38962984085083, + "learning_rate": 1e-06, + "loss": 0.1192, + "step": 369 + }, + { + "epoch": 0.06298944501191692, + "grad_norm": 1.5751211643218994, + "learning_rate": 1e-06, + "loss": 0.1465, + "step": 370 + }, + { + "epoch": 0.06315968675519237, + "grad_norm": 1.4705424308776855, + "learning_rate": 1e-06, + "loss": 0.1428, + "step": 371 + }, + { + "epoch": 0.06332992849846783, + "grad_norm": 1.27107834815979, + "learning_rate": 1e-06, + "loss": 0.1309, + "step": 372 + }, + { + "epoch": 0.06350017024174327, + "grad_norm": 1.3142486810684204, + "learning_rate": 1e-06, + "loss": 0.1407, + "step": 373 + }, + { + "epoch": 0.06367041198501873, + "grad_norm": 1.4944632053375244, + "learning_rate": 1e-06, + "loss": 0.139, + "step": 374 + }, + { + "epoch": 0.06384065372829417, + "grad_norm": 1.5329262018203735, + "learning_rate": 1e-06, + "loss": 0.1533, + "step": 375 + }, + { + "epoch": 0.06401089547156963, + "grad_norm": 1.682967185974121, + "learning_rate": 1e-06, + "loss": 0.1446, + "step": 376 + }, + { + "epoch": 0.06418113721484509, + "grad_norm": 1.3599567413330078, + "learning_rate": 1e-06, + "loss": 0.117, + "step": 377 + }, + { + "epoch": 0.06435137895812053, + "grad_norm": 1.41896653175354, + "learning_rate": 1e-06, + "loss": 0.1316, + "step": 378 + }, + { + "epoch": 0.06452162070139598, + "grad_norm": 1.2187316417694092, + "learning_rate": 1e-06, + "loss": 0.1104, + "step": 379 + }, + { + "epoch": 0.06469186244467143, + "grad_norm": 1.3118098974227905, + "learning_rate": 1e-06, + "loss": 0.1173, + "step": 380 + }, + { + "epoch": 0.06486210418794688, + "grad_norm": 1.2350298166275024, + "learning_rate": 1e-06, + "loss": 0.1188, + "step": 381 + }, + { + "epoch": 0.06503234593122234, + "grad_norm": 1.3306002616882324, + "learning_rate": 1e-06, + "loss": 0.1153, + "step": 382 + }, + { + "epoch": 0.06520258767449778, + "grad_norm": 2.9044129848480225, + "learning_rate": 1e-06, + "loss": 0.1784, + "step": 383 + }, + { + "epoch": 0.06537282941777324, + "grad_norm": 1.6895009279251099, + "learning_rate": 1e-06, + "loss": 0.1522, + "step": 384 + }, + { + "epoch": 0.06554307116104868, + "grad_norm": 1.2503180503845215, + "learning_rate": 1e-06, + "loss": 0.1048, + "step": 385 + }, + { + "epoch": 0.06571331290432414, + "grad_norm": 1.4215972423553467, + "learning_rate": 1e-06, + "loss": 0.135, + "step": 386 + }, + { + "epoch": 0.0658835546475996, + "grad_norm": 1.4079679250717163, + "learning_rate": 1e-06, + "loss": 0.1266, + "step": 387 + }, + { + "epoch": 0.06605379639087504, + "grad_norm": 1.29921293258667, + "learning_rate": 1e-06, + "loss": 0.1122, + "step": 388 + }, + { + "epoch": 0.0662240381341505, + "grad_norm": 1.442528486251831, + "learning_rate": 1e-06, + "loss": 0.1333, + "step": 389 + }, + { + "epoch": 0.06639427987742594, + "grad_norm": 1.8958500623703003, + "learning_rate": 1e-06, + "loss": 0.1928, + "step": 390 + }, + { + "epoch": 0.0665645216207014, + "grad_norm": 1.1162981986999512, + "learning_rate": 1e-06, + "loss": 0.0949, + "step": 391 + }, + { + "epoch": 0.06673476336397685, + "grad_norm": 1.3155403137207031, + "learning_rate": 1e-06, + "loss": 0.1148, + "step": 392 + }, + { + "epoch": 0.0669050051072523, + "grad_norm": 1.2109743356704712, + "learning_rate": 1e-06, + "loss": 0.0965, + "step": 393 + }, + { + "epoch": 0.06707524685052775, + "grad_norm": 1.2797634601593018, + "learning_rate": 1e-06, + "loss": 0.108, + "step": 394 + }, + { + "epoch": 0.06724548859380321, + "grad_norm": 1.2622194290161133, + "learning_rate": 1e-06, + "loss": 0.1205, + "step": 395 + }, + { + "epoch": 0.06741573033707865, + "grad_norm": 1.4360229969024658, + "learning_rate": 1e-06, + "loss": 0.1337, + "step": 396 + }, + { + "epoch": 0.0675859720803541, + "grad_norm": 1.1915457248687744, + "learning_rate": 1e-06, + "loss": 0.1186, + "step": 397 + }, + { + "epoch": 0.06775621382362955, + "grad_norm": 1.2633016109466553, + "learning_rate": 1e-06, + "loss": 0.1055, + "step": 398 + }, + { + "epoch": 0.067926455566905, + "grad_norm": 1.318787932395935, + "learning_rate": 1e-06, + "loss": 0.1247, + "step": 399 + }, + { + "epoch": 0.06809669731018046, + "grad_norm": 1.0899081230163574, + "learning_rate": 1e-06, + "loss": 0.0921, + "step": 400 + }, + { + "epoch": 0.0682669390534559, + "grad_norm": 1.4745516777038574, + "learning_rate": 1e-06, + "loss": 0.134, + "step": 401 + }, + { + "epoch": 0.06843718079673136, + "grad_norm": 1.2535583972930908, + "learning_rate": 1e-06, + "loss": 0.1183, + "step": 402 + }, + { + "epoch": 0.0686074225400068, + "grad_norm": 1.5067999362945557, + "learning_rate": 1e-06, + "loss": 0.1411, + "step": 403 + }, + { + "epoch": 0.06877766428328226, + "grad_norm": 1.5428158044815063, + "learning_rate": 1e-06, + "loss": 0.1325, + "step": 404 + }, + { + "epoch": 0.06894790602655772, + "grad_norm": 1.7841521501541138, + "learning_rate": 1e-06, + "loss": 0.1571, + "step": 405 + }, + { + "epoch": 0.06911814776983316, + "grad_norm": 1.1209527254104614, + "learning_rate": 1e-06, + "loss": 0.094, + "step": 406 + }, + { + "epoch": 0.06928838951310862, + "grad_norm": 1.2920805215835571, + "learning_rate": 1e-06, + "loss": 0.1211, + "step": 407 + }, + { + "epoch": 0.06945863125638406, + "grad_norm": 1.410902500152588, + "learning_rate": 1e-06, + "loss": 0.1163, + "step": 408 + }, + { + "epoch": 0.06962887299965952, + "grad_norm": 1.3287694454193115, + "learning_rate": 1e-06, + "loss": 0.1061, + "step": 409 + }, + { + "epoch": 0.06979911474293497, + "grad_norm": 1.3526452779769897, + "learning_rate": 1e-06, + "loss": 0.1176, + "step": 410 + }, + { + "epoch": 0.06996935648621042, + "grad_norm": 1.6113089323043823, + "learning_rate": 1e-06, + "loss": 0.1318, + "step": 411 + }, + { + "epoch": 0.07013959822948587, + "grad_norm": 1.4161698818206787, + "learning_rate": 1e-06, + "loss": 0.1344, + "step": 412 + }, + { + "epoch": 0.07030983997276131, + "grad_norm": 1.4419130086898804, + "learning_rate": 1e-06, + "loss": 0.137, + "step": 413 + }, + { + "epoch": 0.07048008171603677, + "grad_norm": 1.2640091180801392, + "learning_rate": 1e-06, + "loss": 0.1227, + "step": 414 + }, + { + "epoch": 0.07065032345931223, + "grad_norm": 1.23671555519104, + "learning_rate": 1e-06, + "loss": 0.121, + "step": 415 + }, + { + "epoch": 0.07082056520258767, + "grad_norm": 1.3423279523849487, + "learning_rate": 1e-06, + "loss": 0.0975, + "step": 416 + }, + { + "epoch": 0.07099080694586313, + "grad_norm": 1.2852997779846191, + "learning_rate": 1e-06, + "loss": 0.1246, + "step": 417 + }, + { + "epoch": 0.07116104868913857, + "grad_norm": 1.4518605470657349, + "learning_rate": 1e-06, + "loss": 0.1341, + "step": 418 + }, + { + "epoch": 0.07133129043241403, + "grad_norm": 1.4686856269836426, + "learning_rate": 1e-06, + "loss": 0.1259, + "step": 419 + }, + { + "epoch": 0.07150153217568948, + "grad_norm": 1.3495961427688599, + "learning_rate": 1e-06, + "loss": 0.122, + "step": 420 + }, + { + "epoch": 0.07167177391896493, + "grad_norm": 1.5383343696594238, + "learning_rate": 1e-06, + "loss": 0.1229, + "step": 421 + }, + { + "epoch": 0.07184201566224038, + "grad_norm": 1.2680972814559937, + "learning_rate": 1e-06, + "loss": 0.105, + "step": 422 + }, + { + "epoch": 0.07201225740551584, + "grad_norm": 1.4354145526885986, + "learning_rate": 1e-06, + "loss": 0.1247, + "step": 423 + }, + { + "epoch": 0.07218249914879128, + "grad_norm": 1.4354145526885986, + "learning_rate": 1e-06, + "loss": 0.2006, + "step": 424 + }, + { + "epoch": 0.07235274089206674, + "grad_norm": 1.7208505868911743, + "learning_rate": 1e-06, + "loss": 0.1541, + "step": 425 + }, + { + "epoch": 0.07252298263534218, + "grad_norm": 1.2794008255004883, + "learning_rate": 1e-06, + "loss": 0.1088, + "step": 426 + }, + { + "epoch": 0.07269322437861764, + "grad_norm": 1.3522515296936035, + "learning_rate": 1e-06, + "loss": 0.1139, + "step": 427 + }, + { + "epoch": 0.0728634661218931, + "grad_norm": 1.4536526203155518, + "learning_rate": 1e-06, + "loss": 0.1293, + "step": 428 + }, + { + "epoch": 0.07303370786516854, + "grad_norm": 1.2702877521514893, + "learning_rate": 1e-06, + "loss": 0.0999, + "step": 429 + }, + { + "epoch": 0.073203949608444, + "grad_norm": 1.3044495582580566, + "learning_rate": 1e-06, + "loss": 0.1225, + "step": 430 + }, + { + "epoch": 0.07337419135171944, + "grad_norm": 1.5265694856643677, + "learning_rate": 1e-06, + "loss": 0.1321, + "step": 431 + }, + { + "epoch": 0.0735444330949949, + "grad_norm": 1.3504886627197266, + "learning_rate": 1e-06, + "loss": 0.0999, + "step": 432 + }, + { + "epoch": 0.07371467483827035, + "grad_norm": 1.9963023662567139, + "learning_rate": 1e-06, + "loss": 0.1559, + "step": 433 + }, + { + "epoch": 0.07388491658154579, + "grad_norm": 1.6677991151809692, + "learning_rate": 1e-06, + "loss": 0.1331, + "step": 434 + }, + { + "epoch": 0.07405515832482125, + "grad_norm": 1.486445665359497, + "learning_rate": 1e-06, + "loss": 0.1224, + "step": 435 + }, + { + "epoch": 0.07422540006809669, + "grad_norm": 1.301155686378479, + "learning_rate": 1e-06, + "loss": 0.1045, + "step": 436 + }, + { + "epoch": 0.07439564181137215, + "grad_norm": 1.8198158740997314, + "learning_rate": 1e-06, + "loss": 0.1324, + "step": 437 + }, + { + "epoch": 0.0745658835546476, + "grad_norm": 1.3100183010101318, + "learning_rate": 1e-06, + "loss": 0.1107, + "step": 438 + }, + { + "epoch": 0.07473612529792305, + "grad_norm": 1.3502755165100098, + "learning_rate": 1e-06, + "loss": 0.1082, + "step": 439 + }, + { + "epoch": 0.0749063670411985, + "grad_norm": 1.5028979778289795, + "learning_rate": 1e-06, + "loss": 0.1325, + "step": 440 + }, + { + "epoch": 0.07507660878447395, + "grad_norm": 1.3118284940719604, + "learning_rate": 1e-06, + "loss": 0.0964, + "step": 441 + }, + { + "epoch": 0.0752468505277494, + "grad_norm": 1.4086387157440186, + "learning_rate": 1e-06, + "loss": 0.094, + "step": 442 + }, + { + "epoch": 0.07541709227102486, + "grad_norm": 1.5664148330688477, + "learning_rate": 1e-06, + "loss": 0.1195, + "step": 443 + }, + { + "epoch": 0.0755873340143003, + "grad_norm": 1.4086772203445435, + "learning_rate": 1e-06, + "loss": 0.1104, + "step": 444 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 1.4743391275405884, + "learning_rate": 1e-06, + "loss": 0.1163, + "step": 445 + }, + { + "epoch": 0.0759278175008512, + "grad_norm": 1.600606083869934, + "learning_rate": 1e-06, + "loss": 0.1199, + "step": 446 + }, + { + "epoch": 0.07609805924412666, + "grad_norm": 1.3386887311935425, + "learning_rate": 1e-06, + "loss": 0.1372, + "step": 447 + }, + { + "epoch": 0.07626830098740212, + "grad_norm": 1.4845508337020874, + "learning_rate": 1e-06, + "loss": 0.1201, + "step": 448 + }, + { + "epoch": 0.07643854273067756, + "grad_norm": 1.503592610359192, + "learning_rate": 1e-06, + "loss": 0.1243, + "step": 449 + }, + { + "epoch": 0.07660878447395301, + "grad_norm": 1.3089518547058105, + "learning_rate": 1e-06, + "loss": 0.0884, + "step": 450 + }, + { + "epoch": 0.07677902621722846, + "grad_norm": 1.3585859537124634, + "learning_rate": 1e-06, + "loss": 0.0991, + "step": 451 + }, + { + "epoch": 0.07694926796050391, + "grad_norm": 3.5047550201416016, + "learning_rate": 1e-06, + "loss": 0.2257, + "step": 452 + }, + { + "epoch": 0.07711950970377937, + "grad_norm": 2.3448007106781006, + "learning_rate": 1e-06, + "loss": 0.1543, + "step": 453 + }, + { + "epoch": 0.07728975144705481, + "grad_norm": 1.8489949703216553, + "learning_rate": 1e-06, + "loss": 0.1335, + "step": 454 + }, + { + "epoch": 0.07745999319033027, + "grad_norm": 1.5891618728637695, + "learning_rate": 1e-06, + "loss": 0.108, + "step": 455 + }, + { + "epoch": 0.07763023493360573, + "grad_norm": 1.9977794885635376, + "learning_rate": 1e-06, + "loss": 0.1622, + "step": 456 + }, + { + "epoch": 0.07780047667688117, + "grad_norm": 1.131685495376587, + "learning_rate": 1e-06, + "loss": 0.0874, + "step": 457 + }, + { + "epoch": 0.07797071842015663, + "grad_norm": 1.4111319780349731, + "learning_rate": 1e-06, + "loss": 0.122, + "step": 458 + }, + { + "epoch": 0.07814096016343207, + "grad_norm": 1.3373247385025024, + "learning_rate": 1e-06, + "loss": 0.0998, + "step": 459 + }, + { + "epoch": 0.07831120190670753, + "grad_norm": 1.3671199083328247, + "learning_rate": 1e-06, + "loss": 0.1006, + "step": 460 + }, + { + "epoch": 0.07848144364998298, + "grad_norm": 1.4334213733673096, + "learning_rate": 1e-06, + "loss": 0.1115, + "step": 461 + }, + { + "epoch": 0.07865168539325842, + "grad_norm": 1.6302123069763184, + "learning_rate": 1e-06, + "loss": 0.1225, + "step": 462 + }, + { + "epoch": 0.07882192713653388, + "grad_norm": 1.5899648666381836, + "learning_rate": 1e-06, + "loss": 0.1196, + "step": 463 + }, + { + "epoch": 0.07899216887980932, + "grad_norm": 1.3022098541259766, + "learning_rate": 1e-06, + "loss": 0.0863, + "step": 464 + }, + { + "epoch": 0.07916241062308478, + "grad_norm": 1.5681778192520142, + "learning_rate": 1e-06, + "loss": 0.1, + "step": 465 + }, + { + "epoch": 0.07933265236636024, + "grad_norm": 1.5576342344284058, + "learning_rate": 1e-06, + "loss": 0.1098, + "step": 466 + }, + { + "epoch": 0.07950289410963568, + "grad_norm": 1.1382691860198975, + "learning_rate": 1e-06, + "loss": 0.0896, + "step": 467 + }, + { + "epoch": 0.07967313585291114, + "grad_norm": 1.551830768585205, + "learning_rate": 1e-06, + "loss": 0.1475, + "step": 468 + }, + { + "epoch": 0.07984337759618658, + "grad_norm": 1.4834508895874023, + "learning_rate": 1e-06, + "loss": 0.1331, + "step": 469 + }, + { + "epoch": 0.08001361933946204, + "grad_norm": 1.7695828676223755, + "learning_rate": 1e-06, + "loss": 0.1544, + "step": 470 + }, + { + "epoch": 0.08018386108273749, + "grad_norm": 2.0113699436187744, + "learning_rate": 1e-06, + "loss": 0.1643, + "step": 471 + }, + { + "epoch": 0.08035410282601294, + "grad_norm": 1.7527496814727783, + "learning_rate": 1e-06, + "loss": 0.1529, + "step": 472 + }, + { + "epoch": 0.08052434456928839, + "grad_norm": 1.2186193466186523, + "learning_rate": 1e-06, + "loss": 0.0892, + "step": 473 + }, + { + "epoch": 0.08069458631256383, + "grad_norm": 1.2762824296951294, + "learning_rate": 1e-06, + "loss": 0.0839, + "step": 474 + }, + { + "epoch": 0.08086482805583929, + "grad_norm": 1.3743529319763184, + "learning_rate": 1e-06, + "loss": 0.1033, + "step": 475 + }, + { + "epoch": 0.08103506979911475, + "grad_norm": 1.5724990367889404, + "learning_rate": 1e-06, + "loss": 0.104, + "step": 476 + }, + { + "epoch": 0.08120531154239019, + "grad_norm": 1.5919311046600342, + "learning_rate": 1e-06, + "loss": 0.0943, + "step": 477 + }, + { + "epoch": 0.08137555328566565, + "grad_norm": 1.5781210660934448, + "learning_rate": 1e-06, + "loss": 0.1012, + "step": 478 + }, + { + "epoch": 0.08154579502894109, + "grad_norm": 1.8504365682601929, + "learning_rate": 1e-06, + "loss": 0.1361, + "step": 479 + }, + { + "epoch": 0.08171603677221655, + "grad_norm": 1.4634361267089844, + "learning_rate": 1e-06, + "loss": 0.1075, + "step": 480 + }, + { + "epoch": 0.081886278515492, + "grad_norm": 1.2929086685180664, + "learning_rate": 1e-06, + "loss": 0.0966, + "step": 481 + }, + { + "epoch": 0.08205652025876745, + "grad_norm": 3.413325786590576, + "learning_rate": 1e-06, + "loss": 0.2179, + "step": 482 + }, + { + "epoch": 0.0822267620020429, + "grad_norm": 1.2101235389709473, + "learning_rate": 1e-06, + "loss": 0.0915, + "step": 483 + }, + { + "epoch": 0.08239700374531835, + "grad_norm": 1.7361688613891602, + "learning_rate": 1e-06, + "loss": 0.121, + "step": 484 + }, + { + "epoch": 0.0825672454885938, + "grad_norm": 1.4495158195495605, + "learning_rate": 1e-06, + "loss": 0.1387, + "step": 485 + }, + { + "epoch": 0.08273748723186926, + "grad_norm": 1.3097033500671387, + "learning_rate": 1e-06, + "loss": 0.0994, + "step": 486 + }, + { + "epoch": 0.0829077289751447, + "grad_norm": 1.6425195932388306, + "learning_rate": 1e-06, + "loss": 0.1233, + "step": 487 + }, + { + "epoch": 0.08307797071842016, + "grad_norm": 1.344718098640442, + "learning_rate": 1e-06, + "loss": 0.107, + "step": 488 + }, + { + "epoch": 0.08324821246169561, + "grad_norm": 1.1989573240280151, + "learning_rate": 1e-06, + "loss": 0.0888, + "step": 489 + }, + { + "epoch": 0.08341845420497106, + "grad_norm": 1.4948607683181763, + "learning_rate": 1e-06, + "loss": 0.1105, + "step": 490 + }, + { + "epoch": 0.08358869594824651, + "grad_norm": 1.4378379583358765, + "learning_rate": 1e-06, + "loss": 0.106, + "step": 491 + }, + { + "epoch": 0.08375893769152196, + "grad_norm": 1.2525907754898071, + "learning_rate": 1e-06, + "loss": 0.0968, + "step": 492 + }, + { + "epoch": 0.08392917943479741, + "grad_norm": 1.479597806930542, + "learning_rate": 1e-06, + "loss": 0.1217, + "step": 493 + }, + { + "epoch": 0.08409942117807287, + "grad_norm": 1.3180419206619263, + "learning_rate": 1e-06, + "loss": 0.1011, + "step": 494 + }, + { + "epoch": 0.08426966292134831, + "grad_norm": 1.3772739171981812, + "learning_rate": 1e-06, + "loss": 0.1049, + "step": 495 + }, + { + "epoch": 0.08443990466462377, + "grad_norm": 1.814520001411438, + "learning_rate": 1e-06, + "loss": 0.1293, + "step": 496 + }, + { + "epoch": 0.08461014640789921, + "grad_norm": 1.5137838125228882, + "learning_rate": 1e-06, + "loss": 0.0999, + "step": 497 + }, + { + "epoch": 0.08478038815117467, + "grad_norm": 1.4192203283309937, + "learning_rate": 1e-06, + "loss": 0.114, + "step": 498 + }, + { + "epoch": 0.08495062989445012, + "grad_norm": 1.3078948259353638, + "learning_rate": 1e-06, + "loss": 0.1158, + "step": 499 + }, + { + "epoch": 0.08512087163772557, + "grad_norm": 1.5848225355148315, + "learning_rate": 1e-06, + "loss": 0.1163, + "step": 500 + }, + { + "epoch": 0.08529111338100102, + "grad_norm": 1.3920193910598755, + "learning_rate": 1e-06, + "loss": 0.0995, + "step": 501 + }, + { + "epoch": 0.08546135512427647, + "grad_norm": 1.3783040046691895, + "learning_rate": 1e-06, + "loss": 0.0955, + "step": 502 + }, + { + "epoch": 0.08563159686755192, + "grad_norm": 1.8510295152664185, + "learning_rate": 1e-06, + "loss": 0.1328, + "step": 503 + }, + { + "epoch": 0.08580183861082738, + "grad_norm": 3.0699658393859863, + "learning_rate": 1e-06, + "loss": 0.1774, + "step": 504 + }, + { + "epoch": 0.08597208035410282, + "grad_norm": 1.5914885997772217, + "learning_rate": 1e-06, + "loss": 0.1055, + "step": 505 + }, + { + "epoch": 0.08614232209737828, + "grad_norm": 1.7265233993530273, + "learning_rate": 1e-06, + "loss": 0.1271, + "step": 506 + }, + { + "epoch": 0.08631256384065372, + "grad_norm": 1.5482544898986816, + "learning_rate": 1e-06, + "loss": 0.1279, + "step": 507 + }, + { + "epoch": 0.08648280558392918, + "grad_norm": 1.321733832359314, + "learning_rate": 1e-06, + "loss": 0.0937, + "step": 508 + }, + { + "epoch": 0.08665304732720464, + "grad_norm": 1.4876199960708618, + "learning_rate": 1e-06, + "loss": 0.1192, + "step": 509 + }, + { + "epoch": 0.08682328907048008, + "grad_norm": 1.8378245830535889, + "learning_rate": 1e-06, + "loss": 0.1157, + "step": 510 + }, + { + "epoch": 0.08699353081375553, + "grad_norm": 1.625184178352356, + "learning_rate": 1e-06, + "loss": 0.1206, + "step": 511 + }, + { + "epoch": 0.08716377255703098, + "grad_norm": 1.4779977798461914, + "learning_rate": 1e-06, + "loss": 0.1103, + "step": 512 + }, + { + "epoch": 0.08733401430030643, + "grad_norm": 1.4066017866134644, + "learning_rate": 1e-06, + "loss": 0.1011, + "step": 513 + }, + { + "epoch": 0.08750425604358189, + "grad_norm": 1.525978922843933, + "learning_rate": 1e-06, + "loss": 0.096, + "step": 514 + }, + { + "epoch": 0.08767449778685733, + "grad_norm": 1.49380362033844, + "learning_rate": 1e-06, + "loss": 0.1135, + "step": 515 + }, + { + "epoch": 0.08784473953013279, + "grad_norm": 1.74233078956604, + "learning_rate": 1e-06, + "loss": 0.1114, + "step": 516 + }, + { + "epoch": 0.08801498127340825, + "grad_norm": 1.3099770545959473, + "learning_rate": 1e-06, + "loss": 0.0907, + "step": 517 + }, + { + "epoch": 0.08818522301668369, + "grad_norm": 1.7866302728652954, + "learning_rate": 1e-06, + "loss": 0.1178, + "step": 518 + }, + { + "epoch": 0.08835546475995915, + "grad_norm": 1.6818584203720093, + "learning_rate": 1e-06, + "loss": 0.1252, + "step": 519 + }, + { + "epoch": 0.08852570650323459, + "grad_norm": 1.628466248512268, + "learning_rate": 1e-06, + "loss": 0.1105, + "step": 520 + }, + { + "epoch": 0.08869594824651005, + "grad_norm": 1.681505799293518, + "learning_rate": 1e-06, + "loss": 0.1177, + "step": 521 + }, + { + "epoch": 0.0888661899897855, + "grad_norm": 1.2766761779785156, + "learning_rate": 1e-06, + "loss": 0.0799, + "step": 522 + }, + { + "epoch": 0.08903643173306094, + "grad_norm": 1.575498342514038, + "learning_rate": 1e-06, + "loss": 0.1068, + "step": 523 + }, + { + "epoch": 0.0892066734763364, + "grad_norm": 1.4087481498718262, + "learning_rate": 1e-06, + "loss": 0.0935, + "step": 524 + }, + { + "epoch": 0.08937691521961184, + "grad_norm": 1.4377367496490479, + "learning_rate": 1e-06, + "loss": 0.0968, + "step": 525 + }, + { + "epoch": 0.0895471569628873, + "grad_norm": 1.7580372095108032, + "learning_rate": 1e-06, + "loss": 0.1101, + "step": 526 + }, + { + "epoch": 0.08971739870616276, + "grad_norm": 1.5135084390640259, + "learning_rate": 1e-06, + "loss": 0.1005, + "step": 527 + }, + { + "epoch": 0.0898876404494382, + "grad_norm": 1.4128092527389526, + "learning_rate": 1e-06, + "loss": 0.0858, + "step": 528 + }, + { + "epoch": 0.09005788219271366, + "grad_norm": 1.4026193618774414, + "learning_rate": 1e-06, + "loss": 0.1002, + "step": 529 + }, + { + "epoch": 0.0902281239359891, + "grad_norm": 1.3924407958984375, + "learning_rate": 1e-06, + "loss": 0.0895, + "step": 530 + }, + { + "epoch": 0.09039836567926456, + "grad_norm": 1.4779433012008667, + "learning_rate": 1e-06, + "loss": 0.1033, + "step": 531 + }, + { + "epoch": 0.09056860742254001, + "grad_norm": 1.9175746440887451, + "learning_rate": 1e-06, + "loss": 0.1302, + "step": 532 + }, + { + "epoch": 0.09073884916581546, + "grad_norm": 1.7925680875778198, + "learning_rate": 1e-06, + "loss": 0.1162, + "step": 533 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 1.3707115650177002, + "learning_rate": 1e-06, + "loss": 0.0872, + "step": 534 + }, + { + "epoch": 0.09107933265236635, + "grad_norm": 1.3779429197311401, + "learning_rate": 1e-06, + "loss": 0.0982, + "step": 535 + }, + { + "epoch": 0.09124957439564181, + "grad_norm": 1.5113381147384644, + "learning_rate": 1e-06, + "loss": 0.1036, + "step": 536 + }, + { + "epoch": 0.09141981613891727, + "grad_norm": 1.5571949481964111, + "learning_rate": 1e-06, + "loss": 0.0933, + "step": 537 + }, + { + "epoch": 0.09159005788219271, + "grad_norm": 1.3230814933776855, + "learning_rate": 1e-06, + "loss": 0.1019, + "step": 538 + }, + { + "epoch": 0.09176029962546817, + "grad_norm": 1.257027506828308, + "learning_rate": 1e-06, + "loss": 0.0775, + "step": 539 + }, + { + "epoch": 0.09193054136874361, + "grad_norm": 1.3978290557861328, + "learning_rate": 1e-06, + "loss": 0.0942, + "step": 540 + }, + { + "epoch": 0.09210078311201907, + "grad_norm": 1.290602207183838, + "learning_rate": 1e-06, + "loss": 0.0809, + "step": 541 + }, + { + "epoch": 0.09227102485529452, + "grad_norm": 1.3179094791412354, + "learning_rate": 1e-06, + "loss": 0.0963, + "step": 542 + }, + { + "epoch": 0.09244126659856997, + "grad_norm": 1.6563512086868286, + "learning_rate": 1e-06, + "loss": 0.111, + "step": 543 + }, + { + "epoch": 0.09261150834184542, + "grad_norm": 1.4956837892532349, + "learning_rate": 1e-06, + "loss": 0.1165, + "step": 544 + }, + { + "epoch": 0.09278175008512086, + "grad_norm": 1.3757632970809937, + "learning_rate": 1e-06, + "loss": 0.0897, + "step": 545 + }, + { + "epoch": 0.09295199182839632, + "grad_norm": 1.205857276916504, + "learning_rate": 1e-06, + "loss": 0.0787, + "step": 546 + }, + { + "epoch": 0.09312223357167178, + "grad_norm": 1.7391396760940552, + "learning_rate": 1e-06, + "loss": 0.1286, + "step": 547 + }, + { + "epoch": 0.09329247531494722, + "grad_norm": 1.492804765701294, + "learning_rate": 1e-06, + "loss": 0.0916, + "step": 548 + }, + { + "epoch": 0.09346271705822268, + "grad_norm": 1.6261539459228516, + "learning_rate": 1e-06, + "loss": 0.0995, + "step": 549 + }, + { + "epoch": 0.09363295880149813, + "grad_norm": 1.5201900005340576, + "learning_rate": 1e-06, + "loss": 0.0987, + "step": 550 + }, + { + "epoch": 0.09380320054477358, + "grad_norm": 1.3725306987762451, + "learning_rate": 1e-06, + "loss": 0.0958, + "step": 551 + }, + { + "epoch": 0.09397344228804903, + "grad_norm": 1.601542592048645, + "learning_rate": 1e-06, + "loss": 0.1168, + "step": 552 + }, + { + "epoch": 0.09414368403132448, + "grad_norm": 1.4362895488739014, + "learning_rate": 1e-06, + "loss": 0.0896, + "step": 553 + }, + { + "epoch": 0.09431392577459993, + "grad_norm": 1.3668473958969116, + "learning_rate": 1e-06, + "loss": 0.1029, + "step": 554 + }, + { + "epoch": 0.09448416751787539, + "grad_norm": 1.6272999048233032, + "learning_rate": 1e-06, + "loss": 0.0938, + "step": 555 + }, + { + "epoch": 0.09465440926115083, + "grad_norm": 1.8201227188110352, + "learning_rate": 1e-06, + "loss": 0.1186, + "step": 556 + }, + { + "epoch": 0.09482465100442629, + "grad_norm": 1.4684869050979614, + "learning_rate": 1e-06, + "loss": 0.1029, + "step": 557 + }, + { + "epoch": 0.09499489274770173, + "grad_norm": 1.4213141202926636, + "learning_rate": 1e-06, + "loss": 0.0875, + "step": 558 + }, + { + "epoch": 0.09516513449097719, + "grad_norm": 1.4483574628829956, + "learning_rate": 1e-06, + "loss": 0.0996, + "step": 559 + }, + { + "epoch": 0.09533537623425264, + "grad_norm": 1.4041225910186768, + "learning_rate": 1e-06, + "loss": 0.0769, + "step": 560 + }, + { + "epoch": 0.09550561797752809, + "grad_norm": 1.3655953407287598, + "learning_rate": 1e-06, + "loss": 0.0927, + "step": 561 + }, + { + "epoch": 0.09567585972080354, + "grad_norm": 1.2655277252197266, + "learning_rate": 1e-06, + "loss": 0.078, + "step": 562 + }, + { + "epoch": 0.09584610146407899, + "grad_norm": 1.6605805158615112, + "learning_rate": 1e-06, + "loss": 0.1101, + "step": 563 + }, + { + "epoch": 0.09601634320735444, + "grad_norm": 1.6565370559692383, + "learning_rate": 1e-06, + "loss": 0.1107, + "step": 564 + }, + { + "epoch": 0.0961865849506299, + "grad_norm": 1.5182013511657715, + "learning_rate": 1e-06, + "loss": 0.0992, + "step": 565 + }, + { + "epoch": 0.09635682669390534, + "grad_norm": 1.3900082111358643, + "learning_rate": 1e-06, + "loss": 0.0999, + "step": 566 + }, + { + "epoch": 0.0965270684371808, + "grad_norm": 1.2898858785629272, + "learning_rate": 1e-06, + "loss": 0.0906, + "step": 567 + }, + { + "epoch": 0.09669731018045624, + "grad_norm": 3.67594838142395, + "learning_rate": 1e-06, + "loss": 0.1686, + "step": 568 + }, + { + "epoch": 0.0968675519237317, + "grad_norm": 1.313101887702942, + "learning_rate": 1e-06, + "loss": 0.0799, + "step": 569 + }, + { + "epoch": 0.09703779366700716, + "grad_norm": 3.672680139541626, + "learning_rate": 1e-06, + "loss": 0.2146, + "step": 570 + }, + { + "epoch": 0.0972080354102826, + "grad_norm": 1.770923376083374, + "learning_rate": 1e-06, + "loss": 0.0999, + "step": 571 + }, + { + "epoch": 0.09737827715355805, + "grad_norm": 1.533212661743164, + "learning_rate": 1e-06, + "loss": 0.1004, + "step": 572 + }, + { + "epoch": 0.0975485188968335, + "grad_norm": 1.6698973178863525, + "learning_rate": 1e-06, + "loss": 0.0945, + "step": 573 + }, + { + "epoch": 0.09771876064010895, + "grad_norm": 1.4515860080718994, + "learning_rate": 1e-06, + "loss": 0.1065, + "step": 574 + }, + { + "epoch": 0.09788900238338441, + "grad_norm": 1.332733154296875, + "learning_rate": 1e-06, + "loss": 0.0875, + "step": 575 + }, + { + "epoch": 0.09805924412665985, + "grad_norm": 1.3322087526321411, + "learning_rate": 1e-06, + "loss": 0.0872, + "step": 576 + }, + { + "epoch": 0.09822948586993531, + "grad_norm": 1.320599913597107, + "learning_rate": 1e-06, + "loss": 0.0832, + "step": 577 + }, + { + "epoch": 0.09839972761321075, + "grad_norm": 1.4648524522781372, + "learning_rate": 1e-06, + "loss": 0.0963, + "step": 578 + }, + { + "epoch": 0.09856996935648621, + "grad_norm": 1.6072378158569336, + "learning_rate": 1e-06, + "loss": 0.09, + "step": 579 + }, + { + "epoch": 0.09874021109976167, + "grad_norm": 2.066091537475586, + "learning_rate": 1e-06, + "loss": 0.1273, + "step": 580 + }, + { + "epoch": 0.09891045284303711, + "grad_norm": 1.3935089111328125, + "learning_rate": 1e-06, + "loss": 0.0938, + "step": 581 + }, + { + "epoch": 0.09908069458631256, + "grad_norm": 1.4537439346313477, + "learning_rate": 1e-06, + "loss": 0.0908, + "step": 582 + }, + { + "epoch": 0.09925093632958802, + "grad_norm": 1.4111661911010742, + "learning_rate": 1e-06, + "loss": 0.0826, + "step": 583 + }, + { + "epoch": 0.09942117807286346, + "grad_norm": 1.417158603668213, + "learning_rate": 1e-06, + "loss": 0.0916, + "step": 584 + }, + { + "epoch": 0.09959141981613892, + "grad_norm": 1.5110013484954834, + "learning_rate": 1e-06, + "loss": 0.0842, + "step": 585 + }, + { + "epoch": 0.09976166155941436, + "grad_norm": 1.3743090629577637, + "learning_rate": 1e-06, + "loss": 0.0891, + "step": 586 + }, + { + "epoch": 0.09993190330268982, + "grad_norm": 1.4649263620376587, + "learning_rate": 1e-06, + "loss": 0.0791, + "step": 587 + }, + { + "epoch": 0.10010214504596528, + "grad_norm": 1.6102653741836548, + "learning_rate": 1e-06, + "loss": 0.0856, + "step": 588 + }, + { + "epoch": 0.10027238678924072, + "grad_norm": 1.7673293352127075, + "learning_rate": 1e-06, + "loss": 0.1013, + "step": 589 + }, + { + "epoch": 0.10044262853251618, + "grad_norm": 1.4860748052597046, + "learning_rate": 1e-06, + "loss": 0.0854, + "step": 590 + }, + { + "epoch": 0.10061287027579162, + "grad_norm": 1.5368369817733765, + "learning_rate": 1e-06, + "loss": 0.089, + "step": 591 + }, + { + "epoch": 0.10078311201906708, + "grad_norm": 1.6211143732070923, + "learning_rate": 1e-06, + "loss": 0.1087, + "step": 592 + }, + { + "epoch": 0.10095335376234253, + "grad_norm": 1.432572603225708, + "learning_rate": 1e-06, + "loss": 0.0934, + "step": 593 + }, + { + "epoch": 0.10112359550561797, + "grad_norm": 1.351190209388733, + "learning_rate": 1e-06, + "loss": 0.0816, + "step": 594 + }, + { + "epoch": 0.10129383724889343, + "grad_norm": 1.5220497846603394, + "learning_rate": 1e-06, + "loss": 0.1014, + "step": 595 + }, + { + "epoch": 0.10146407899216887, + "grad_norm": 3.933367967605591, + "learning_rate": 1e-06, + "loss": 0.2032, + "step": 596 + }, + { + "epoch": 0.10163432073544433, + "grad_norm": 1.5819555521011353, + "learning_rate": 1e-06, + "loss": 0.1125, + "step": 597 + }, + { + "epoch": 0.10180456247871979, + "grad_norm": 1.5007481575012207, + "learning_rate": 1e-06, + "loss": 0.0991, + "step": 598 + }, + { + "epoch": 0.10197480422199523, + "grad_norm": 1.6869536638259888, + "learning_rate": 1e-06, + "loss": 0.1113, + "step": 599 + }, + { + "epoch": 0.10214504596527069, + "grad_norm": 1.5548107624053955, + "learning_rate": 1e-06, + "loss": 0.1018, + "step": 600 + }, + { + "epoch": 0.10231528770854613, + "grad_norm": 1.7281802892684937, + "learning_rate": 1e-06, + "loss": 0.0962, + "step": 601 + }, + { + "epoch": 0.10248552945182159, + "grad_norm": 1.603119969367981, + "learning_rate": 1e-06, + "loss": 0.0764, + "step": 602 + }, + { + "epoch": 0.10265577119509704, + "grad_norm": 1.6384819746017456, + "learning_rate": 1e-06, + "loss": 0.0842, + "step": 603 + }, + { + "epoch": 0.10282601293837249, + "grad_norm": 1.245998501777649, + "learning_rate": 1e-06, + "loss": 0.086, + "step": 604 + }, + { + "epoch": 0.10299625468164794, + "grad_norm": 1.5214192867279053, + "learning_rate": 1e-06, + "loss": 0.0834, + "step": 605 + }, + { + "epoch": 0.10316649642492338, + "grad_norm": 1.345001459121704, + "learning_rate": 1e-06, + "loss": 0.0869, + "step": 606 + }, + { + "epoch": 0.10333673816819884, + "grad_norm": 3.7370526790618896, + "learning_rate": 1e-06, + "loss": 0.1794, + "step": 607 + }, + { + "epoch": 0.1035069799114743, + "grad_norm": 1.3892709016799927, + "learning_rate": 1e-06, + "loss": 0.0912, + "step": 608 + }, + { + "epoch": 0.10367722165474974, + "grad_norm": 1.4373610019683838, + "learning_rate": 1e-06, + "loss": 0.0914, + "step": 609 + }, + { + "epoch": 0.1038474633980252, + "grad_norm": 1.4336578845977783, + "learning_rate": 1e-06, + "loss": 0.0732, + "step": 610 + }, + { + "epoch": 0.10401770514130065, + "grad_norm": 1.7176496982574463, + "learning_rate": 1e-06, + "loss": 0.1008, + "step": 611 + }, + { + "epoch": 0.1041879468845761, + "grad_norm": 1.9851466417312622, + "learning_rate": 1e-06, + "loss": 0.1067, + "step": 612 + }, + { + "epoch": 0.10435818862785155, + "grad_norm": 1.8875466585159302, + "learning_rate": 1e-06, + "loss": 0.1106, + "step": 613 + }, + { + "epoch": 0.104528430371127, + "grad_norm": 1.458789348602295, + "learning_rate": 1e-06, + "loss": 0.0843, + "step": 614 + }, + { + "epoch": 0.10469867211440245, + "grad_norm": 1.6216390132904053, + "learning_rate": 1e-06, + "loss": 0.09, + "step": 615 + }, + { + "epoch": 0.10486891385767791, + "grad_norm": 1.2169405221939087, + "learning_rate": 1e-06, + "loss": 0.0626, + "step": 616 + }, + { + "epoch": 0.10503915560095335, + "grad_norm": 1.3988337516784668, + "learning_rate": 1e-06, + "loss": 0.0891, + "step": 617 + }, + { + "epoch": 0.10520939734422881, + "grad_norm": 1.5023198127746582, + "learning_rate": 1e-06, + "loss": 0.0968, + "step": 618 + }, + { + "epoch": 0.10537963908750425, + "grad_norm": 1.4495683908462524, + "learning_rate": 1e-06, + "loss": 0.0865, + "step": 619 + }, + { + "epoch": 0.10554988083077971, + "grad_norm": 1.256415843963623, + "learning_rate": 1e-06, + "loss": 0.0713, + "step": 620 + }, + { + "epoch": 0.10572012257405516, + "grad_norm": 1.7719178199768066, + "learning_rate": 1e-06, + "loss": 0.0998, + "step": 621 + }, + { + "epoch": 0.10589036431733061, + "grad_norm": 1.5151079893112183, + "learning_rate": 1e-06, + "loss": 0.0812, + "step": 622 + }, + { + "epoch": 0.10606060606060606, + "grad_norm": 4.284844875335693, + "learning_rate": 1e-06, + "loss": 0.178, + "step": 623 + }, + { + "epoch": 0.1062308478038815, + "grad_norm": 1.978816032409668, + "learning_rate": 1e-06, + "loss": 0.0872, + "step": 624 + }, + { + "epoch": 0.10640108954715696, + "grad_norm": 1.5746681690216064, + "learning_rate": 1e-06, + "loss": 0.0849, + "step": 625 + }, + { + "epoch": 0.10657133129043242, + "grad_norm": 1.4485297203063965, + "learning_rate": 1e-06, + "loss": 0.081, + "step": 626 + }, + { + "epoch": 0.10674157303370786, + "grad_norm": 1.581984281539917, + "learning_rate": 1e-06, + "loss": 0.1037, + "step": 627 + }, + { + "epoch": 0.10691181477698332, + "grad_norm": 1.3287845849990845, + "learning_rate": 1e-06, + "loss": 0.0722, + "step": 628 + }, + { + "epoch": 0.10708205652025876, + "grad_norm": 1.2169119119644165, + "learning_rate": 1e-06, + "loss": 0.0588, + "step": 629 + }, + { + "epoch": 0.10725229826353422, + "grad_norm": 1.2675771713256836, + "learning_rate": 1e-06, + "loss": 0.0792, + "step": 630 + }, + { + "epoch": 0.10742254000680967, + "grad_norm": 1.3391927480697632, + "learning_rate": 1e-06, + "loss": 0.0884, + "step": 631 + }, + { + "epoch": 0.10759278175008512, + "grad_norm": 1.7200751304626465, + "learning_rate": 1e-06, + "loss": 0.0922, + "step": 632 + }, + { + "epoch": 0.10776302349336057, + "grad_norm": 1.4246052503585815, + "learning_rate": 1e-06, + "loss": 0.08, + "step": 633 + }, + { + "epoch": 0.10793326523663602, + "grad_norm": 1.4974547624588013, + "learning_rate": 1e-06, + "loss": 0.0738, + "step": 634 + }, + { + "epoch": 0.10810350697991147, + "grad_norm": 1.7298548221588135, + "learning_rate": 1e-06, + "loss": 0.0869, + "step": 635 + }, + { + "epoch": 0.10827374872318693, + "grad_norm": 1.4626245498657227, + "learning_rate": 1e-06, + "loss": 0.0745, + "step": 636 + }, + { + "epoch": 0.10844399046646237, + "grad_norm": 1.5561753511428833, + "learning_rate": 1e-06, + "loss": 0.0807, + "step": 637 + }, + { + "epoch": 0.10861423220973783, + "grad_norm": 1.6584053039550781, + "learning_rate": 1e-06, + "loss": 0.0951, + "step": 638 + }, + { + "epoch": 0.10878447395301327, + "grad_norm": 1.3954371213912964, + "learning_rate": 1e-06, + "loss": 0.0754, + "step": 639 + }, + { + "epoch": 0.10895471569628873, + "grad_norm": 3.7084250450134277, + "learning_rate": 1e-06, + "loss": 0.1604, + "step": 640 + }, + { + "epoch": 0.10912495743956419, + "grad_norm": 1.594537377357483, + "learning_rate": 1e-06, + "loss": 0.1014, + "step": 641 + }, + { + "epoch": 0.10929519918283963, + "grad_norm": 3.0633435249328613, + "learning_rate": 1e-06, + "loss": 0.1342, + "step": 642 + }, + { + "epoch": 0.10946544092611508, + "grad_norm": 1.6951473951339722, + "learning_rate": 1e-06, + "loss": 0.0925, + "step": 643 + }, + { + "epoch": 0.10963568266939054, + "grad_norm": 1.4243108034133911, + "learning_rate": 1e-06, + "loss": 0.0696, + "step": 644 + }, + { + "epoch": 0.10980592441266598, + "grad_norm": 1.480381965637207, + "learning_rate": 1e-06, + "loss": 0.0768, + "step": 645 + }, + { + "epoch": 0.10997616615594144, + "grad_norm": 1.3805941343307495, + "learning_rate": 1e-06, + "loss": 0.0848, + "step": 646 + }, + { + "epoch": 0.11014640789921688, + "grad_norm": 1.4682902097702026, + "learning_rate": 1e-06, + "loss": 0.0722, + "step": 647 + }, + { + "epoch": 0.11031664964249234, + "grad_norm": 1.408152461051941, + "learning_rate": 1e-06, + "loss": 0.0734, + "step": 648 + }, + { + "epoch": 0.1104868913857678, + "grad_norm": 1.8382225036621094, + "learning_rate": 1e-06, + "loss": 0.1047, + "step": 649 + }, + { + "epoch": 0.11065713312904324, + "grad_norm": 1.4985864162445068, + "learning_rate": 1e-06, + "loss": 0.0845, + "step": 650 + }, + { + "epoch": 0.1108273748723187, + "grad_norm": 1.340008020401001, + "learning_rate": 1e-06, + "loss": 0.0741, + "step": 651 + }, + { + "epoch": 0.11099761661559414, + "grad_norm": 1.65091073513031, + "learning_rate": 1e-06, + "loss": 0.0914, + "step": 652 + }, + { + "epoch": 0.1111678583588696, + "grad_norm": 1.6151150465011597, + "learning_rate": 1e-06, + "loss": 0.084, + "step": 653 + }, + { + "epoch": 0.11133810010214505, + "grad_norm": 1.8154164552688599, + "learning_rate": 1e-06, + "loss": 0.0988, + "step": 654 + }, + { + "epoch": 0.1115083418454205, + "grad_norm": 1.4415311813354492, + "learning_rate": 1e-06, + "loss": 0.0841, + "step": 655 + }, + { + "epoch": 0.11167858358869595, + "grad_norm": 2.2858686447143555, + "learning_rate": 1e-06, + "loss": 0.1037, + "step": 656 + }, + { + "epoch": 0.1118488253319714, + "grad_norm": 1.6448941230773926, + "learning_rate": 1e-06, + "loss": 0.082, + "step": 657 + }, + { + "epoch": 0.11201906707524685, + "grad_norm": 1.4810398817062378, + "learning_rate": 1e-06, + "loss": 0.0806, + "step": 658 + }, + { + "epoch": 0.11218930881852231, + "grad_norm": 1.6105705499649048, + "learning_rate": 1e-06, + "loss": 0.0962, + "step": 659 + }, + { + "epoch": 0.11235955056179775, + "grad_norm": 1.5040578842163086, + "learning_rate": 1e-06, + "loss": 0.0842, + "step": 660 + }, + { + "epoch": 0.1125297923050732, + "grad_norm": 1.6204015016555786, + "learning_rate": 1e-06, + "loss": 0.082, + "step": 661 + }, + { + "epoch": 0.11270003404834865, + "grad_norm": 1.3563892841339111, + "learning_rate": 1e-06, + "loss": 0.061, + "step": 662 + }, + { + "epoch": 0.1128702757916241, + "grad_norm": 2.079106330871582, + "learning_rate": 1e-06, + "loss": 0.0947, + "step": 663 + }, + { + "epoch": 0.11304051753489956, + "grad_norm": 1.3969217538833618, + "learning_rate": 1e-06, + "loss": 0.0723, + "step": 664 + }, + { + "epoch": 0.113210759278175, + "grad_norm": 1.3509082794189453, + "learning_rate": 1e-06, + "loss": 0.0811, + "step": 665 + }, + { + "epoch": 0.11338100102145046, + "grad_norm": 1.4950233697891235, + "learning_rate": 1e-06, + "loss": 0.089, + "step": 666 + }, + { + "epoch": 0.1135512427647259, + "grad_norm": 1.6716737747192383, + "learning_rate": 1e-06, + "loss": 0.0861, + "step": 667 + }, + { + "epoch": 0.11372148450800136, + "grad_norm": 1.8707127571105957, + "learning_rate": 1e-06, + "loss": 0.0924, + "step": 668 + }, + { + "epoch": 0.11389172625127682, + "grad_norm": 1.5277262926101685, + "learning_rate": 1e-06, + "loss": 0.079, + "step": 669 + }, + { + "epoch": 0.11406196799455226, + "grad_norm": 1.7219480276107788, + "learning_rate": 1e-06, + "loss": 0.0928, + "step": 670 + }, + { + "epoch": 0.11423220973782772, + "grad_norm": 1.4405816793441772, + "learning_rate": 1e-06, + "loss": 0.0767, + "step": 671 + }, + { + "epoch": 0.11440245148110317, + "grad_norm": 1.7040899991989136, + "learning_rate": 1e-06, + "loss": 0.0855, + "step": 672 + }, + { + "epoch": 0.11457269322437862, + "grad_norm": 1.588266134262085, + "learning_rate": 1e-06, + "loss": 0.0772, + "step": 673 + }, + { + "epoch": 0.11474293496765407, + "grad_norm": 1.8362873792648315, + "learning_rate": 1e-06, + "loss": 0.0978, + "step": 674 + }, + { + "epoch": 0.11491317671092952, + "grad_norm": 1.6758869886398315, + "learning_rate": 1e-06, + "loss": 0.0928, + "step": 675 + }, + { + "epoch": 0.11508341845420497, + "grad_norm": 1.7752368450164795, + "learning_rate": 1e-06, + "loss": 0.0879, + "step": 676 + }, + { + "epoch": 0.11525366019748043, + "grad_norm": 1.6252251863479614, + "learning_rate": 1e-06, + "loss": 0.096, + "step": 677 + }, + { + "epoch": 0.11542390194075587, + "grad_norm": 1.7671598196029663, + "learning_rate": 1e-06, + "loss": 0.0975, + "step": 678 + }, + { + "epoch": 0.11559414368403133, + "grad_norm": 1.3313266038894653, + "learning_rate": 1e-06, + "loss": 0.0782, + "step": 679 + }, + { + "epoch": 0.11576438542730677, + "grad_norm": 1.298072338104248, + "learning_rate": 1e-06, + "loss": 0.0788, + "step": 680 + }, + { + "epoch": 0.11593462717058223, + "grad_norm": 1.4549241065979004, + "learning_rate": 1e-06, + "loss": 0.0827, + "step": 681 + }, + { + "epoch": 0.11610486891385768, + "grad_norm": 1.5572149753570557, + "learning_rate": 1e-06, + "loss": 0.0866, + "step": 682 + }, + { + "epoch": 0.11627511065713313, + "grad_norm": 1.3730863332748413, + "learning_rate": 1e-06, + "loss": 0.0714, + "step": 683 + }, + { + "epoch": 0.11644535240040858, + "grad_norm": 1.7244811058044434, + "learning_rate": 1e-06, + "loss": 0.0768, + "step": 684 + }, + { + "epoch": 0.11661559414368403, + "grad_norm": 1.879199743270874, + "learning_rate": 1e-06, + "loss": 0.0863, + "step": 685 + }, + { + "epoch": 0.11678583588695948, + "grad_norm": 1.3578203916549683, + "learning_rate": 1e-06, + "loss": 0.0623, + "step": 686 + }, + { + "epoch": 0.11695607763023494, + "grad_norm": 1.5125126838684082, + "learning_rate": 1e-06, + "loss": 0.0872, + "step": 687 + }, + { + "epoch": 0.11712631937351038, + "grad_norm": 2.28521728515625, + "learning_rate": 1e-06, + "loss": 0.1136, + "step": 688 + }, + { + "epoch": 0.11729656111678584, + "grad_norm": 1.5524616241455078, + "learning_rate": 1e-06, + "loss": 0.0815, + "step": 689 + }, + { + "epoch": 0.11746680286006128, + "grad_norm": 1.5951274633407593, + "learning_rate": 1e-06, + "loss": 0.0779, + "step": 690 + }, + { + "epoch": 0.11763704460333674, + "grad_norm": 1.1863946914672852, + "learning_rate": 1e-06, + "loss": 0.0611, + "step": 691 + }, + { + "epoch": 0.1178072863466122, + "grad_norm": 1.413318395614624, + "learning_rate": 1e-06, + "loss": 0.0735, + "step": 692 + }, + { + "epoch": 0.11797752808988764, + "grad_norm": 1.5461151599884033, + "learning_rate": 1e-06, + "loss": 0.0794, + "step": 693 + }, + { + "epoch": 0.1181477698331631, + "grad_norm": 1.5801036357879639, + "learning_rate": 1e-06, + "loss": 0.0805, + "step": 694 + }, + { + "epoch": 0.11831801157643854, + "grad_norm": 1.341295838356018, + "learning_rate": 1e-06, + "loss": 0.07, + "step": 695 + }, + { + "epoch": 0.118488253319714, + "grad_norm": 1.561644196510315, + "learning_rate": 1e-06, + "loss": 0.0756, + "step": 696 + }, + { + "epoch": 0.11865849506298945, + "grad_norm": 1.3806345462799072, + "learning_rate": 1e-06, + "loss": 0.0646, + "step": 697 + }, + { + "epoch": 0.11882873680626489, + "grad_norm": 1.5367329120635986, + "learning_rate": 1e-06, + "loss": 0.0775, + "step": 698 + }, + { + "epoch": 0.11899897854954035, + "grad_norm": 1.8795855045318604, + "learning_rate": 1e-06, + "loss": 0.0936, + "step": 699 + }, + { + "epoch": 0.11916922029281579, + "grad_norm": 1.3554261922836304, + "learning_rate": 1e-06, + "loss": 0.0624, + "step": 700 + }, + { + "epoch": 0.11933946203609125, + "grad_norm": 1.4295129776000977, + "learning_rate": 1e-06, + "loss": 0.0817, + "step": 701 + }, + { + "epoch": 0.1195097037793667, + "grad_norm": 2.1465654373168945, + "learning_rate": 1e-06, + "loss": 0.1037, + "step": 702 + }, + { + "epoch": 0.11967994552264215, + "grad_norm": 1.2775219678878784, + "learning_rate": 1e-06, + "loss": 0.0698, + "step": 703 + }, + { + "epoch": 0.1198501872659176, + "grad_norm": 1.6362903118133545, + "learning_rate": 1e-06, + "loss": 0.0867, + "step": 704 + }, + { + "epoch": 0.12002042900919306, + "grad_norm": 1.531864047050476, + "learning_rate": 1e-06, + "loss": 0.075, + "step": 705 + }, + { + "epoch": 0.1201906707524685, + "grad_norm": 1.4992605447769165, + "learning_rate": 1e-06, + "loss": 0.0909, + "step": 706 + }, + { + "epoch": 0.12036091249574396, + "grad_norm": 1.376063346862793, + "learning_rate": 1e-06, + "loss": 0.0639, + "step": 707 + }, + { + "epoch": 0.1205311542390194, + "grad_norm": 1.4128174781799316, + "learning_rate": 1e-06, + "loss": 0.0725, + "step": 708 + }, + { + "epoch": 0.12070139598229486, + "grad_norm": 1.9290549755096436, + "learning_rate": 1e-06, + "loss": 0.1059, + "step": 709 + }, + { + "epoch": 0.12087163772557032, + "grad_norm": 1.6471437215805054, + "learning_rate": 1e-06, + "loss": 0.0768, + "step": 710 + }, + { + "epoch": 0.12104187946884576, + "grad_norm": 1.5400367975234985, + "learning_rate": 1e-06, + "loss": 0.0828, + "step": 711 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 1.3457374572753906, + "learning_rate": 1e-06, + "loss": 0.0738, + "step": 712 + }, + { + "epoch": 0.12138236295539666, + "grad_norm": 1.8186038732528687, + "learning_rate": 1e-06, + "loss": 0.0968, + "step": 713 + }, + { + "epoch": 0.12155260469867211, + "grad_norm": 1.650877833366394, + "learning_rate": 1e-06, + "loss": 0.076, + "step": 714 + }, + { + "epoch": 0.12172284644194757, + "grad_norm": 1.4798765182495117, + "learning_rate": 1e-06, + "loss": 0.0787, + "step": 715 + }, + { + "epoch": 0.12189308818522301, + "grad_norm": 1.2624869346618652, + "learning_rate": 1e-06, + "loss": 0.0592, + "step": 716 + }, + { + "epoch": 0.12206332992849847, + "grad_norm": 1.596134901046753, + "learning_rate": 1e-06, + "loss": 0.0761, + "step": 717 + }, + { + "epoch": 0.12223357167177391, + "grad_norm": 1.538567304611206, + "learning_rate": 1e-06, + "loss": 0.0723, + "step": 718 + }, + { + "epoch": 0.12240381341504937, + "grad_norm": 5.429737567901611, + "learning_rate": 1e-06, + "loss": 0.1379, + "step": 719 + }, + { + "epoch": 0.12257405515832483, + "grad_norm": 1.6454511880874634, + "learning_rate": 1e-06, + "loss": 0.0871, + "step": 720 + }, + { + "epoch": 0.12274429690160027, + "grad_norm": 1.4540003538131714, + "learning_rate": 1e-06, + "loss": 0.0788, + "step": 721 + }, + { + "epoch": 0.12291453864487573, + "grad_norm": 1.774742603302002, + "learning_rate": 1e-06, + "loss": 0.0824, + "step": 722 + }, + { + "epoch": 0.12308478038815117, + "grad_norm": 1.881834626197815, + "learning_rate": 1e-06, + "loss": 0.1062, + "step": 723 + }, + { + "epoch": 0.12325502213142663, + "grad_norm": 1.543564796447754, + "learning_rate": 1e-06, + "loss": 0.0949, + "step": 724 + }, + { + "epoch": 0.12342526387470208, + "grad_norm": 1.6501184701919556, + "learning_rate": 1e-06, + "loss": 0.0775, + "step": 725 + }, + { + "epoch": 0.12359550561797752, + "grad_norm": 1.3438044786453247, + "learning_rate": 1e-06, + "loss": 0.0764, + "step": 726 + }, + { + "epoch": 0.12376574736125298, + "grad_norm": 1.2549158334732056, + "learning_rate": 1e-06, + "loss": 0.0575, + "step": 727 + }, + { + "epoch": 0.12393598910452842, + "grad_norm": 1.8060505390167236, + "learning_rate": 1e-06, + "loss": 0.0829, + "step": 728 + }, + { + "epoch": 0.12410623084780388, + "grad_norm": 1.6881535053253174, + "learning_rate": 1e-06, + "loss": 0.0849, + "step": 729 + }, + { + "epoch": 0.12427647259107934, + "grad_norm": 1.4681129455566406, + "learning_rate": 1e-06, + "loss": 0.054, + "step": 730 + }, + { + "epoch": 0.12444671433435478, + "grad_norm": 1.806591510772705, + "learning_rate": 1e-06, + "loss": 0.081, + "step": 731 + }, + { + "epoch": 0.12461695607763024, + "grad_norm": 1.6089766025543213, + "learning_rate": 1e-06, + "loss": 0.0773, + "step": 732 + }, + { + "epoch": 0.12478719782090568, + "grad_norm": 1.6091641187667847, + "learning_rate": 1e-06, + "loss": 0.0804, + "step": 733 + }, + { + "epoch": 0.12495743956418114, + "grad_norm": 1.669281005859375, + "learning_rate": 1e-06, + "loss": 0.0771, + "step": 734 + }, + { + "epoch": 0.12512768130745658, + "grad_norm": 1.7492468357086182, + "learning_rate": 1e-06, + "loss": 0.0777, + "step": 735 + }, + { + "epoch": 0.12529792305073204, + "grad_norm": 1.7161555290222168, + "learning_rate": 1e-06, + "loss": 0.0898, + "step": 736 + }, + { + "epoch": 0.1254681647940075, + "grad_norm": 1.7574137449264526, + "learning_rate": 1e-06, + "loss": 0.107, + "step": 737 + }, + { + "epoch": 0.12563840653728295, + "grad_norm": 1.3838268518447876, + "learning_rate": 1e-06, + "loss": 0.0699, + "step": 738 + }, + { + "epoch": 0.1258086482805584, + "grad_norm": 1.6737573146820068, + "learning_rate": 1e-06, + "loss": 0.0846, + "step": 739 + }, + { + "epoch": 0.12597889002383383, + "grad_norm": 2.306833028793335, + "learning_rate": 1e-06, + "loss": 0.1018, + "step": 740 + }, + { + "epoch": 0.1261491317671093, + "grad_norm": 2.311218738555908, + "learning_rate": 1e-06, + "loss": 0.1155, + "step": 741 + }, + { + "epoch": 0.12631937351038475, + "grad_norm": 1.696584701538086, + "learning_rate": 1e-06, + "loss": 0.069, + "step": 742 + }, + { + "epoch": 0.1264896152536602, + "grad_norm": 1.3152414560317993, + "learning_rate": 1e-06, + "loss": 0.0606, + "step": 743 + }, + { + "epoch": 0.12665985699693566, + "grad_norm": 1.6591945886611938, + "learning_rate": 1e-06, + "loss": 0.0859, + "step": 744 + }, + { + "epoch": 0.1268300987402111, + "grad_norm": 1.4438961744308472, + "learning_rate": 1e-06, + "loss": 0.0702, + "step": 745 + }, + { + "epoch": 0.12700034048348655, + "grad_norm": 1.289958119392395, + "learning_rate": 1e-06, + "loss": 0.067, + "step": 746 + }, + { + "epoch": 0.127170582226762, + "grad_norm": 2.529083251953125, + "learning_rate": 1e-06, + "loss": 0.1006, + "step": 747 + }, + { + "epoch": 0.12734082397003746, + "grad_norm": 1.5549131631851196, + "learning_rate": 1e-06, + "loss": 0.0815, + "step": 748 + }, + { + "epoch": 0.12751106571331292, + "grad_norm": 1.5461150407791138, + "learning_rate": 1e-06, + "loss": 0.0835, + "step": 749 + }, + { + "epoch": 0.12768130745658834, + "grad_norm": 1.4177753925323486, + "learning_rate": 1e-06, + "loss": 0.072, + "step": 750 + }, + { + "epoch": 0.1278515491998638, + "grad_norm": 1.6254005432128906, + "learning_rate": 1e-06, + "loss": 0.0788, + "step": 751 + }, + { + "epoch": 0.12802179094313926, + "grad_norm": 1.7862988710403442, + "learning_rate": 1e-06, + "loss": 0.0813, + "step": 752 + }, + { + "epoch": 0.12819203268641471, + "grad_norm": 1.7907599210739136, + "learning_rate": 1e-06, + "loss": 0.0889, + "step": 753 + }, + { + "epoch": 0.12836227442969017, + "grad_norm": 1.9308052062988281, + "learning_rate": 1e-06, + "loss": 0.0851, + "step": 754 + }, + { + "epoch": 0.1285325161729656, + "grad_norm": 1.512603998184204, + "learning_rate": 1e-06, + "loss": 0.0737, + "step": 755 + }, + { + "epoch": 0.12870275791624106, + "grad_norm": 1.6706660985946655, + "learning_rate": 1e-06, + "loss": 0.0857, + "step": 756 + }, + { + "epoch": 0.1288729996595165, + "grad_norm": 1.6297451257705688, + "learning_rate": 1e-06, + "loss": 0.0796, + "step": 757 + }, + { + "epoch": 0.12904324140279197, + "grad_norm": 1.4156213998794556, + "learning_rate": 1e-06, + "loss": 0.0762, + "step": 758 + }, + { + "epoch": 0.12921348314606743, + "grad_norm": 1.7496767044067383, + "learning_rate": 1e-06, + "loss": 0.0802, + "step": 759 + }, + { + "epoch": 0.12938372488934285, + "grad_norm": 1.380969524383545, + "learning_rate": 1e-06, + "loss": 0.0757, + "step": 760 + }, + { + "epoch": 0.1295539666326183, + "grad_norm": 1.8817622661590576, + "learning_rate": 1e-06, + "loss": 0.0812, + "step": 761 + }, + { + "epoch": 0.12972420837589377, + "grad_norm": 1.3597335815429688, + "learning_rate": 1e-06, + "loss": 0.063, + "step": 762 + }, + { + "epoch": 0.12989445011916922, + "grad_norm": 1.5356440544128418, + "learning_rate": 1e-06, + "loss": 0.077, + "step": 763 + }, + { + "epoch": 0.13006469186244468, + "grad_norm": 1.3353973627090454, + "learning_rate": 1e-06, + "loss": 0.0552, + "step": 764 + }, + { + "epoch": 0.1302349336057201, + "grad_norm": 1.738538384437561, + "learning_rate": 1e-06, + "loss": 0.066, + "step": 765 + }, + { + "epoch": 0.13040517534899557, + "grad_norm": 1.3719581365585327, + "learning_rate": 1e-06, + "loss": 0.0606, + "step": 766 + }, + { + "epoch": 0.13057541709227102, + "grad_norm": 2.158234119415283, + "learning_rate": 1e-06, + "loss": 0.0843, + "step": 767 + }, + { + "epoch": 0.13074565883554648, + "grad_norm": 1.713789701461792, + "learning_rate": 1e-06, + "loss": 0.0808, + "step": 768 + }, + { + "epoch": 0.13091590057882194, + "grad_norm": 1.5063124895095825, + "learning_rate": 1e-06, + "loss": 0.0769, + "step": 769 + }, + { + "epoch": 0.13108614232209737, + "grad_norm": 1.3353931903839111, + "learning_rate": 1e-06, + "loss": 0.0698, + "step": 770 + }, + { + "epoch": 0.13125638406537282, + "grad_norm": 1.8870093822479248, + "learning_rate": 1e-06, + "loss": 0.091, + "step": 771 + }, + { + "epoch": 0.13142662580864828, + "grad_norm": 1.8463548421859741, + "learning_rate": 1e-06, + "loss": 0.0668, + "step": 772 + }, + { + "epoch": 0.13159686755192374, + "grad_norm": 1.3892488479614258, + "learning_rate": 1e-06, + "loss": 0.0718, + "step": 773 + }, + { + "epoch": 0.1317671092951992, + "grad_norm": 1.5559097528457642, + "learning_rate": 1e-06, + "loss": 0.0706, + "step": 774 + }, + { + "epoch": 0.13193735103847462, + "grad_norm": 2.101165533065796, + "learning_rate": 1e-06, + "loss": 0.1133, + "step": 775 + }, + { + "epoch": 0.13210759278175008, + "grad_norm": 1.7307274341583252, + "learning_rate": 1e-06, + "loss": 0.0663, + "step": 776 + }, + { + "epoch": 0.13227783452502553, + "grad_norm": 1.909165382385254, + "learning_rate": 1e-06, + "loss": 0.0817, + "step": 777 + }, + { + "epoch": 0.132448076268301, + "grad_norm": 1.6024906635284424, + "learning_rate": 1e-06, + "loss": 0.0757, + "step": 778 + }, + { + "epoch": 0.13261831801157645, + "grad_norm": 1.6666183471679688, + "learning_rate": 1e-06, + "loss": 0.0841, + "step": 779 + }, + { + "epoch": 0.13278855975485188, + "grad_norm": 2.6020991802215576, + "learning_rate": 1e-06, + "loss": 0.1077, + "step": 780 + }, + { + "epoch": 0.13295880149812733, + "grad_norm": 1.5541698932647705, + "learning_rate": 1e-06, + "loss": 0.0802, + "step": 781 + }, + { + "epoch": 0.1331290432414028, + "grad_norm": 1.538192868232727, + "learning_rate": 1e-06, + "loss": 0.0685, + "step": 782 + }, + { + "epoch": 0.13329928498467825, + "grad_norm": 1.704047679901123, + "learning_rate": 1e-06, + "loss": 0.0652, + "step": 783 + }, + { + "epoch": 0.1334695267279537, + "grad_norm": 1.5509108304977417, + "learning_rate": 1e-06, + "loss": 0.0736, + "step": 784 + }, + { + "epoch": 0.13363976847122916, + "grad_norm": 1.4850906133651733, + "learning_rate": 1e-06, + "loss": 0.0599, + "step": 785 + }, + { + "epoch": 0.1338100102145046, + "grad_norm": 1.46503484249115, + "learning_rate": 1e-06, + "loss": 0.0594, + "step": 786 + }, + { + "epoch": 0.13398025195778004, + "grad_norm": 1.633912205696106, + "learning_rate": 1e-06, + "loss": 0.0699, + "step": 787 + }, + { + "epoch": 0.1341504937010555, + "grad_norm": 1.7929328680038452, + "learning_rate": 1e-06, + "loss": 0.0616, + "step": 788 + }, + { + "epoch": 0.13432073544433096, + "grad_norm": 1.710610270500183, + "learning_rate": 1e-06, + "loss": 0.0724, + "step": 789 + }, + { + "epoch": 0.13449097718760641, + "grad_norm": 1.7763170003890991, + "learning_rate": 1e-06, + "loss": 0.0858, + "step": 790 + }, + { + "epoch": 0.13466121893088184, + "grad_norm": 1.5168254375457764, + "learning_rate": 1e-06, + "loss": 0.0769, + "step": 791 + }, + { + "epoch": 0.1348314606741573, + "grad_norm": 1.3891587257385254, + "learning_rate": 1e-06, + "loss": 0.0615, + "step": 792 + }, + { + "epoch": 0.13500170241743276, + "grad_norm": 1.804847002029419, + "learning_rate": 1e-06, + "loss": 0.0925, + "step": 793 + }, + { + "epoch": 0.1351719441607082, + "grad_norm": 1.5551631450653076, + "learning_rate": 1e-06, + "loss": 0.0702, + "step": 794 + }, + { + "epoch": 0.13534218590398367, + "grad_norm": 1.4586671590805054, + "learning_rate": 1e-06, + "loss": 0.0745, + "step": 795 + }, + { + "epoch": 0.1355124276472591, + "grad_norm": 1.7194534540176392, + "learning_rate": 1e-06, + "loss": 0.0944, + "step": 796 + }, + { + "epoch": 0.13568266939053455, + "grad_norm": 1.5851624011993408, + "learning_rate": 1e-06, + "loss": 0.0713, + "step": 797 + }, + { + "epoch": 0.13585291113381, + "grad_norm": 2.052922010421753, + "learning_rate": 1e-06, + "loss": 0.0922, + "step": 798 + }, + { + "epoch": 0.13602315287708547, + "grad_norm": 1.4071455001831055, + "learning_rate": 1e-06, + "loss": 0.0655, + "step": 799 + }, + { + "epoch": 0.13619339462036092, + "grad_norm": 1.8842096328735352, + "learning_rate": 1e-06, + "loss": 0.0725, + "step": 800 + }, + { + "epoch": 0.13636363636363635, + "grad_norm": 1.692454218864441, + "learning_rate": 1e-06, + "loss": 0.0718, + "step": 801 + }, + { + "epoch": 0.1365338781069118, + "grad_norm": 1.4987190961837769, + "learning_rate": 1e-06, + "loss": 0.0563, + "step": 802 + }, + { + "epoch": 0.13670411985018727, + "grad_norm": 1.3979241847991943, + "learning_rate": 1e-06, + "loss": 0.0658, + "step": 803 + }, + { + "epoch": 0.13687436159346272, + "grad_norm": 1.5798490047454834, + "learning_rate": 1e-06, + "loss": 0.0739, + "step": 804 + }, + { + "epoch": 0.13704460333673818, + "grad_norm": 1.990831732749939, + "learning_rate": 1e-06, + "loss": 0.0759, + "step": 805 + }, + { + "epoch": 0.1372148450800136, + "grad_norm": 1.9039353132247925, + "learning_rate": 1e-06, + "loss": 0.0583, + "step": 806 + }, + { + "epoch": 0.13738508682328907, + "grad_norm": 1.76191246509552, + "learning_rate": 1e-06, + "loss": 0.0836, + "step": 807 + }, + { + "epoch": 0.13755532856656452, + "grad_norm": 1.5676465034484863, + "learning_rate": 1e-06, + "loss": 0.062, + "step": 808 + }, + { + "epoch": 0.13772557030983998, + "grad_norm": 1.9201511144638062, + "learning_rate": 1e-06, + "loss": 0.0705, + "step": 809 + }, + { + "epoch": 0.13789581205311544, + "grad_norm": 1.5441217422485352, + "learning_rate": 1e-06, + "loss": 0.0594, + "step": 810 + }, + { + "epoch": 0.13806605379639086, + "grad_norm": 1.9137436151504517, + "learning_rate": 1e-06, + "loss": 0.0745, + "step": 811 + }, + { + "epoch": 0.13823629553966632, + "grad_norm": 1.3507276773452759, + "learning_rate": 1e-06, + "loss": 0.0656, + "step": 812 + }, + { + "epoch": 0.13840653728294178, + "grad_norm": 1.6629807949066162, + "learning_rate": 1e-06, + "loss": 0.0721, + "step": 813 + }, + { + "epoch": 0.13857677902621723, + "grad_norm": 1.6247719526290894, + "learning_rate": 1e-06, + "loss": 0.0717, + "step": 814 + }, + { + "epoch": 0.1387470207694927, + "grad_norm": 1.6012877225875854, + "learning_rate": 1e-06, + "loss": 0.072, + "step": 815 + }, + { + "epoch": 0.13891726251276812, + "grad_norm": 1.6676996946334839, + "learning_rate": 1e-06, + "loss": 0.0812, + "step": 816 + }, + { + "epoch": 0.13908750425604358, + "grad_norm": 1.6991463899612427, + "learning_rate": 1e-06, + "loss": 0.0729, + "step": 817 + }, + { + "epoch": 0.13925774599931903, + "grad_norm": 1.4762986898422241, + "learning_rate": 1e-06, + "loss": 0.0686, + "step": 818 + }, + { + "epoch": 0.1394279877425945, + "grad_norm": 1.65053391456604, + "learning_rate": 1e-06, + "loss": 0.0795, + "step": 819 + }, + { + "epoch": 0.13959822948586995, + "grad_norm": 1.437187910079956, + "learning_rate": 1e-06, + "loss": 0.0638, + "step": 820 + }, + { + "epoch": 0.13976847122914537, + "grad_norm": 1.4831326007843018, + "learning_rate": 1e-06, + "loss": 0.075, + "step": 821 + }, + { + "epoch": 0.13993871297242083, + "grad_norm": 1.3216532468795776, + "learning_rate": 1e-06, + "loss": 0.0532, + "step": 822 + }, + { + "epoch": 0.1401089547156963, + "grad_norm": 1.9568979740142822, + "learning_rate": 1e-06, + "loss": 0.0817, + "step": 823 + }, + { + "epoch": 0.14027919645897174, + "grad_norm": 1.7128760814666748, + "learning_rate": 1e-06, + "loss": 0.0717, + "step": 824 + }, + { + "epoch": 0.1404494382022472, + "grad_norm": 1.5816516876220703, + "learning_rate": 1e-06, + "loss": 0.0662, + "step": 825 + }, + { + "epoch": 0.14061967994552263, + "grad_norm": 1.5961077213287354, + "learning_rate": 1e-06, + "loss": 0.0748, + "step": 826 + }, + { + "epoch": 0.1407899216887981, + "grad_norm": 1.8750134706497192, + "learning_rate": 1e-06, + "loss": 0.0588, + "step": 827 + }, + { + "epoch": 0.14096016343207354, + "grad_norm": 1.8028770685195923, + "learning_rate": 1e-06, + "loss": 0.0777, + "step": 828 + }, + { + "epoch": 0.141130405175349, + "grad_norm": 1.7617617845535278, + "learning_rate": 1e-06, + "loss": 0.0753, + "step": 829 + }, + { + "epoch": 0.14130064691862446, + "grad_norm": 1.6493316888809204, + "learning_rate": 1e-06, + "loss": 0.0638, + "step": 830 + }, + { + "epoch": 0.14147088866189989, + "grad_norm": 4.191693305969238, + "learning_rate": 1e-06, + "loss": 0.1091, + "step": 831 + }, + { + "epoch": 0.14164113040517534, + "grad_norm": 1.787387728691101, + "learning_rate": 1e-06, + "loss": 0.0766, + "step": 832 + }, + { + "epoch": 0.1418113721484508, + "grad_norm": 1.9080184698104858, + "learning_rate": 1e-06, + "loss": 0.0744, + "step": 833 + }, + { + "epoch": 0.14198161389172625, + "grad_norm": 2.04909086227417, + "learning_rate": 1e-06, + "loss": 0.0637, + "step": 834 + }, + { + "epoch": 0.1421518556350017, + "grad_norm": 1.588629126548767, + "learning_rate": 1e-06, + "loss": 0.0513, + "step": 835 + }, + { + "epoch": 0.14232209737827714, + "grad_norm": 1.3033459186553955, + "learning_rate": 1e-06, + "loss": 0.0605, + "step": 836 + }, + { + "epoch": 0.1424923391215526, + "grad_norm": 1.4227380752563477, + "learning_rate": 1e-06, + "loss": 0.0583, + "step": 837 + }, + { + "epoch": 0.14266258086482805, + "grad_norm": 2.39058780670166, + "learning_rate": 1e-06, + "loss": 0.0931, + "step": 838 + }, + { + "epoch": 0.1428328226081035, + "grad_norm": 1.6543277502059937, + "learning_rate": 1e-06, + "loss": 0.0752, + "step": 839 + }, + { + "epoch": 0.14300306435137897, + "grad_norm": 1.3659149408340454, + "learning_rate": 1e-06, + "loss": 0.0439, + "step": 840 + }, + { + "epoch": 0.1431733060946544, + "grad_norm": 1.9133633375167847, + "learning_rate": 1e-06, + "loss": 0.0874, + "step": 841 + }, + { + "epoch": 0.14334354783792985, + "grad_norm": 2.2252583503723145, + "learning_rate": 1e-06, + "loss": 0.0994, + "step": 842 + }, + { + "epoch": 0.1435137895812053, + "grad_norm": 1.5215318202972412, + "learning_rate": 1e-06, + "loss": 0.065, + "step": 843 + }, + { + "epoch": 0.14368403132448077, + "grad_norm": 1.6508617401123047, + "learning_rate": 1e-06, + "loss": 0.0641, + "step": 844 + }, + { + "epoch": 0.14385427306775622, + "grad_norm": 1.4455782175064087, + "learning_rate": 1e-06, + "loss": 0.0459, + "step": 845 + }, + { + "epoch": 0.14402451481103168, + "grad_norm": 1.5651944875717163, + "learning_rate": 1e-06, + "loss": 0.0639, + "step": 846 + }, + { + "epoch": 0.1441947565543071, + "grad_norm": 1.6415917873382568, + "learning_rate": 1e-06, + "loss": 0.057, + "step": 847 + }, + { + "epoch": 0.14436499829758256, + "grad_norm": 1.7259793281555176, + "learning_rate": 1e-06, + "loss": 0.0627, + "step": 848 + }, + { + "epoch": 0.14453524004085802, + "grad_norm": 1.8741511106491089, + "learning_rate": 1e-06, + "loss": 0.0695, + "step": 849 + }, + { + "epoch": 0.14470548178413348, + "grad_norm": 1.6153074502944946, + "learning_rate": 1e-06, + "loss": 0.0753, + "step": 850 + }, + { + "epoch": 0.14487572352740893, + "grad_norm": 1.6454153060913086, + "learning_rate": 1e-06, + "loss": 0.0708, + "step": 851 + }, + { + "epoch": 0.14504596527068436, + "grad_norm": 2.058832883834839, + "learning_rate": 1e-06, + "loss": 0.0723, + "step": 852 + }, + { + "epoch": 0.14521620701395982, + "grad_norm": 1.3613487482070923, + "learning_rate": 1e-06, + "loss": 0.0551, + "step": 853 + }, + { + "epoch": 0.14538644875723528, + "grad_norm": 1.6068713665008545, + "learning_rate": 1e-06, + "loss": 0.0554, + "step": 854 + }, + { + "epoch": 0.14555669050051073, + "grad_norm": 1.7198082208633423, + "learning_rate": 1e-06, + "loss": 0.071, + "step": 855 + }, + { + "epoch": 0.1457269322437862, + "grad_norm": 1.3624064922332764, + "learning_rate": 1e-06, + "loss": 0.0504, + "step": 856 + }, + { + "epoch": 0.14589717398706162, + "grad_norm": 1.5122796297073364, + "learning_rate": 1e-06, + "loss": 0.0606, + "step": 857 + }, + { + "epoch": 0.14606741573033707, + "grad_norm": 1.4191750288009644, + "learning_rate": 1e-06, + "loss": 0.0609, + "step": 858 + }, + { + "epoch": 0.14623765747361253, + "grad_norm": 1.983462929725647, + "learning_rate": 1e-06, + "loss": 0.0647, + "step": 859 + }, + { + "epoch": 0.146407899216888, + "grad_norm": 1.5732680559158325, + "learning_rate": 1e-06, + "loss": 0.0594, + "step": 860 + }, + { + "epoch": 0.14657814096016344, + "grad_norm": 1.5888450145721436, + "learning_rate": 1e-06, + "loss": 0.0609, + "step": 861 + }, + { + "epoch": 0.14674838270343887, + "grad_norm": 1.727030873298645, + "learning_rate": 1e-06, + "loss": 0.0762, + "step": 862 + }, + { + "epoch": 0.14691862444671433, + "grad_norm": 1.599660038948059, + "learning_rate": 1e-06, + "loss": 0.0706, + "step": 863 + }, + { + "epoch": 0.1470888661899898, + "grad_norm": 1.5314996242523193, + "learning_rate": 1e-06, + "loss": 0.0706, + "step": 864 + }, + { + "epoch": 0.14725910793326524, + "grad_norm": 1.532114863395691, + "learning_rate": 1e-06, + "loss": 0.0531, + "step": 865 + }, + { + "epoch": 0.1474293496765407, + "grad_norm": 1.5699235200881958, + "learning_rate": 1e-06, + "loss": 0.0575, + "step": 866 + }, + { + "epoch": 0.14759959141981613, + "grad_norm": 1.4962087869644165, + "learning_rate": 1e-06, + "loss": 0.048, + "step": 867 + }, + { + "epoch": 0.14776983316309159, + "grad_norm": 1.6223524808883667, + "learning_rate": 1e-06, + "loss": 0.0557, + "step": 868 + }, + { + "epoch": 0.14794007490636704, + "grad_norm": 1.379461407661438, + "learning_rate": 1e-06, + "loss": 0.0488, + "step": 869 + }, + { + "epoch": 0.1481103166496425, + "grad_norm": 1.6299303770065308, + "learning_rate": 1e-06, + "loss": 0.0712, + "step": 870 + }, + { + "epoch": 0.14828055839291795, + "grad_norm": 1.5951275825500488, + "learning_rate": 1e-06, + "loss": 0.0633, + "step": 871 + }, + { + "epoch": 0.14845080013619338, + "grad_norm": 1.6737635135650635, + "learning_rate": 1e-06, + "loss": 0.0576, + "step": 872 + }, + { + "epoch": 0.14862104187946884, + "grad_norm": 2.7894175052642822, + "learning_rate": 1e-06, + "loss": 0.0916, + "step": 873 + }, + { + "epoch": 0.1487912836227443, + "grad_norm": 1.7583144903182983, + "learning_rate": 1e-06, + "loss": 0.0556, + "step": 874 + }, + { + "epoch": 0.14896152536601975, + "grad_norm": 1.6645101308822632, + "learning_rate": 1e-06, + "loss": 0.0673, + "step": 875 + }, + { + "epoch": 0.1491317671092952, + "grad_norm": 1.6808711290359497, + "learning_rate": 1e-06, + "loss": 0.0541, + "step": 876 + }, + { + "epoch": 0.14930200885257064, + "grad_norm": 1.5654128789901733, + "learning_rate": 1e-06, + "loss": 0.0636, + "step": 877 + }, + { + "epoch": 0.1494722505958461, + "grad_norm": 1.5253329277038574, + "learning_rate": 1e-06, + "loss": 0.0561, + "step": 878 + }, + { + "epoch": 0.14964249233912155, + "grad_norm": 1.4865022897720337, + "learning_rate": 1e-06, + "loss": 0.0564, + "step": 879 + }, + { + "epoch": 0.149812734082397, + "grad_norm": 1.4917443990707397, + "learning_rate": 1e-06, + "loss": 0.0575, + "step": 880 + }, + { + "epoch": 0.14998297582567247, + "grad_norm": 1.7758963108062744, + "learning_rate": 1e-06, + "loss": 0.0681, + "step": 881 + }, + { + "epoch": 0.1501532175689479, + "grad_norm": 1.369234323501587, + "learning_rate": 1e-06, + "loss": 0.0596, + "step": 882 + }, + { + "epoch": 0.15032345931222335, + "grad_norm": 2.3506531715393066, + "learning_rate": 1e-06, + "loss": 0.083, + "step": 883 + }, + { + "epoch": 0.1504937010554988, + "grad_norm": 1.8270442485809326, + "learning_rate": 1e-06, + "loss": 0.0722, + "step": 884 + }, + { + "epoch": 0.15066394279877426, + "grad_norm": 1.7770593166351318, + "learning_rate": 1e-06, + "loss": 0.0672, + "step": 885 + }, + { + "epoch": 0.15083418454204972, + "grad_norm": 1.7954052686691284, + "learning_rate": 1e-06, + "loss": 0.068, + "step": 886 + }, + { + "epoch": 0.15100442628532515, + "grad_norm": 1.5579285621643066, + "learning_rate": 1e-06, + "loss": 0.0665, + "step": 887 + }, + { + "epoch": 0.1511746680286006, + "grad_norm": 1.6267443895339966, + "learning_rate": 1e-06, + "loss": 0.0616, + "step": 888 + }, + { + "epoch": 0.15134490977187606, + "grad_norm": 1.9561134576797485, + "learning_rate": 1e-06, + "loss": 0.0672, + "step": 889 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 1.592724084854126, + "learning_rate": 1e-06, + "loss": 0.0641, + "step": 890 + }, + { + "epoch": 0.15168539325842698, + "grad_norm": 1.448255181312561, + "learning_rate": 1e-06, + "loss": 0.0499, + "step": 891 + }, + { + "epoch": 0.1518556350017024, + "grad_norm": 1.7166950702667236, + "learning_rate": 1e-06, + "loss": 0.0724, + "step": 892 + }, + { + "epoch": 0.15202587674497786, + "grad_norm": 2.7402710914611816, + "learning_rate": 1e-06, + "loss": 0.0981, + "step": 893 + }, + { + "epoch": 0.15219611848825332, + "grad_norm": 1.9034618139266968, + "learning_rate": 1e-06, + "loss": 0.0655, + "step": 894 + }, + { + "epoch": 0.15236636023152877, + "grad_norm": 1.6078972816467285, + "learning_rate": 1e-06, + "loss": 0.0545, + "step": 895 + }, + { + "epoch": 0.15253660197480423, + "grad_norm": 3.4988811016082764, + "learning_rate": 1e-06, + "loss": 0.0725, + "step": 896 + }, + { + "epoch": 0.15270684371807966, + "grad_norm": 1.6912193298339844, + "learning_rate": 1e-06, + "loss": 0.0617, + "step": 897 + }, + { + "epoch": 0.15287708546135512, + "grad_norm": 1.5782009363174438, + "learning_rate": 1e-06, + "loss": 0.0454, + "step": 898 + }, + { + "epoch": 0.15304732720463057, + "grad_norm": 2.1376571655273438, + "learning_rate": 1e-06, + "loss": 0.1007, + "step": 899 + }, + { + "epoch": 0.15321756894790603, + "grad_norm": 1.8803651332855225, + "learning_rate": 1e-06, + "loss": 0.0655, + "step": 900 + }, + { + "epoch": 0.1533878106911815, + "grad_norm": 1.6837856769561768, + "learning_rate": 1e-06, + "loss": 0.07, + "step": 901 + }, + { + "epoch": 0.15355805243445692, + "grad_norm": 1.5627678632736206, + "learning_rate": 1e-06, + "loss": 0.0583, + "step": 902 + }, + { + "epoch": 0.15372829417773237, + "grad_norm": 1.413833498954773, + "learning_rate": 1e-06, + "loss": 0.05, + "step": 903 + }, + { + "epoch": 0.15389853592100783, + "grad_norm": 2.4595329761505127, + "learning_rate": 1e-06, + "loss": 0.1166, + "step": 904 + }, + { + "epoch": 0.15406877766428329, + "grad_norm": 4.622979640960693, + "learning_rate": 1e-06, + "loss": 0.102, + "step": 905 + }, + { + "epoch": 0.15423901940755874, + "grad_norm": 1.9842865467071533, + "learning_rate": 1e-06, + "loss": 0.0704, + "step": 906 + }, + { + "epoch": 0.1544092611508342, + "grad_norm": 1.6348425149917603, + "learning_rate": 1e-06, + "loss": 0.0519, + "step": 907 + }, + { + "epoch": 0.15457950289410963, + "grad_norm": 1.920792579650879, + "learning_rate": 1e-06, + "loss": 0.0644, + "step": 908 + }, + { + "epoch": 0.15474974463738508, + "grad_norm": 2.1553070545196533, + "learning_rate": 1e-06, + "loss": 0.0698, + "step": 909 + }, + { + "epoch": 0.15491998638066054, + "grad_norm": 1.422676920890808, + "learning_rate": 1e-06, + "loss": 0.0542, + "step": 910 + }, + { + "epoch": 0.155090228123936, + "grad_norm": 1.8286123275756836, + "learning_rate": 1e-06, + "loss": 0.0608, + "step": 911 + }, + { + "epoch": 0.15526046986721145, + "grad_norm": 1.6634122133255005, + "learning_rate": 1e-06, + "loss": 0.0581, + "step": 912 + }, + { + "epoch": 0.15543071161048688, + "grad_norm": 1.5610778331756592, + "learning_rate": 1e-06, + "loss": 0.0667, + "step": 913 + }, + { + "epoch": 0.15560095335376234, + "grad_norm": 1.7075679302215576, + "learning_rate": 1e-06, + "loss": 0.0587, + "step": 914 + }, + { + "epoch": 0.1557711950970378, + "grad_norm": 1.6121772527694702, + "learning_rate": 1e-06, + "loss": 0.0511, + "step": 915 + }, + { + "epoch": 0.15594143684031325, + "grad_norm": 1.7952123880386353, + "learning_rate": 1e-06, + "loss": 0.0716, + "step": 916 + }, + { + "epoch": 0.1561116785835887, + "grad_norm": 1.8294349908828735, + "learning_rate": 1e-06, + "loss": 0.0611, + "step": 917 + }, + { + "epoch": 0.15628192032686414, + "grad_norm": 1.8930115699768066, + "learning_rate": 1e-06, + "loss": 0.083, + "step": 918 + }, + { + "epoch": 0.1564521620701396, + "grad_norm": 1.7695558071136475, + "learning_rate": 1e-06, + "loss": 0.0718, + "step": 919 + }, + { + "epoch": 0.15662240381341505, + "grad_norm": 1.6812986135482788, + "learning_rate": 1e-06, + "loss": 0.0709, + "step": 920 + }, + { + "epoch": 0.1567926455566905, + "grad_norm": 1.6664036512374878, + "learning_rate": 1e-06, + "loss": 0.0609, + "step": 921 + }, + { + "epoch": 0.15696288729996596, + "grad_norm": 1.7315361499786377, + "learning_rate": 1e-06, + "loss": 0.0638, + "step": 922 + }, + { + "epoch": 0.1571331290432414, + "grad_norm": 1.693820595741272, + "learning_rate": 1e-06, + "loss": 0.0604, + "step": 923 + }, + { + "epoch": 0.15730337078651685, + "grad_norm": 1.9926408529281616, + "learning_rate": 1e-06, + "loss": 0.065, + "step": 924 + }, + { + "epoch": 0.1574736125297923, + "grad_norm": 1.3008970022201538, + "learning_rate": 1e-06, + "loss": 0.0424, + "step": 925 + }, + { + "epoch": 0.15764385427306776, + "grad_norm": 1.7588164806365967, + "learning_rate": 1e-06, + "loss": 0.0569, + "step": 926 + }, + { + "epoch": 0.15781409601634322, + "grad_norm": 1.7521356344223022, + "learning_rate": 1e-06, + "loss": 0.057, + "step": 927 + }, + { + "epoch": 0.15798433775961865, + "grad_norm": 1.6399370431900024, + "learning_rate": 1e-06, + "loss": 0.055, + "step": 928 + }, + { + "epoch": 0.1581545795028941, + "grad_norm": 1.6132686138153076, + "learning_rate": 1e-06, + "loss": 0.062, + "step": 929 + }, + { + "epoch": 0.15832482124616956, + "grad_norm": 1.8911019563674927, + "learning_rate": 1e-06, + "loss": 0.0673, + "step": 930 + }, + { + "epoch": 0.15849506298944502, + "grad_norm": 1.8688396215438843, + "learning_rate": 1e-06, + "loss": 0.0593, + "step": 931 + }, + { + "epoch": 0.15866530473272047, + "grad_norm": 1.948891282081604, + "learning_rate": 1e-06, + "loss": 0.0717, + "step": 932 + }, + { + "epoch": 0.1588355464759959, + "grad_norm": 1.5135903358459473, + "learning_rate": 1e-06, + "loss": 0.0538, + "step": 933 + }, + { + "epoch": 0.15900578821927136, + "grad_norm": 1.983309030532837, + "learning_rate": 1e-06, + "loss": 0.0656, + "step": 934 + }, + { + "epoch": 0.15917602996254682, + "grad_norm": 2.004861354827881, + "learning_rate": 1e-06, + "loss": 0.0662, + "step": 935 + }, + { + "epoch": 0.15934627170582227, + "grad_norm": 1.3474713563919067, + "learning_rate": 1e-06, + "loss": 0.0607, + "step": 936 + }, + { + "epoch": 0.15951651344909773, + "grad_norm": 1.5540618896484375, + "learning_rate": 1e-06, + "loss": 0.0607, + "step": 937 + }, + { + "epoch": 0.15968675519237316, + "grad_norm": 1.8663222789764404, + "learning_rate": 1e-06, + "loss": 0.0694, + "step": 938 + }, + { + "epoch": 0.15985699693564862, + "grad_norm": 1.5832575559616089, + "learning_rate": 1e-06, + "loss": 0.0722, + "step": 939 + }, + { + "epoch": 0.16002723867892407, + "grad_norm": 1.6708149909973145, + "learning_rate": 1e-06, + "loss": 0.0715, + "step": 940 + }, + { + "epoch": 0.16019748042219953, + "grad_norm": 1.7680753469467163, + "learning_rate": 1e-06, + "loss": 0.0586, + "step": 941 + }, + { + "epoch": 0.16036772216547499, + "grad_norm": 1.8743613958358765, + "learning_rate": 1e-06, + "loss": 0.0568, + "step": 942 + }, + { + "epoch": 0.16053796390875041, + "grad_norm": 4.353957176208496, + "learning_rate": 1e-06, + "loss": 0.1204, + "step": 943 + }, + { + "epoch": 0.16070820565202587, + "grad_norm": 1.36697518825531, + "learning_rate": 1e-06, + "loss": 0.0462, + "step": 944 + }, + { + "epoch": 0.16087844739530133, + "grad_norm": 1.474086046218872, + "learning_rate": 1e-06, + "loss": 0.0508, + "step": 945 + }, + { + "epoch": 0.16104868913857678, + "grad_norm": 1.7142692804336548, + "learning_rate": 1e-06, + "loss": 0.0528, + "step": 946 + }, + { + "epoch": 0.16121893088185224, + "grad_norm": 2.086660623550415, + "learning_rate": 1e-06, + "loss": 0.055, + "step": 947 + }, + { + "epoch": 0.16138917262512767, + "grad_norm": 2.088574171066284, + "learning_rate": 1e-06, + "loss": 0.0755, + "step": 948 + }, + { + "epoch": 0.16155941436840313, + "grad_norm": 1.7473136186599731, + "learning_rate": 1e-06, + "loss": 0.0598, + "step": 949 + }, + { + "epoch": 0.16172965611167858, + "grad_norm": 1.3327255249023438, + "learning_rate": 1e-06, + "loss": 0.049, + "step": 950 + }, + { + "epoch": 0.16189989785495404, + "grad_norm": 3.631272792816162, + "learning_rate": 1e-06, + "loss": 0.084, + "step": 951 + }, + { + "epoch": 0.1620701395982295, + "grad_norm": 1.5777565240859985, + "learning_rate": 1e-06, + "loss": 0.0448, + "step": 952 + }, + { + "epoch": 0.16224038134150492, + "grad_norm": 2.02554988861084, + "learning_rate": 1e-06, + "loss": 0.0661, + "step": 953 + }, + { + "epoch": 0.16241062308478038, + "grad_norm": 1.5341172218322754, + "learning_rate": 1e-06, + "loss": 0.0508, + "step": 954 + }, + { + "epoch": 0.16258086482805584, + "grad_norm": 1.34352707862854, + "learning_rate": 1e-06, + "loss": 0.0397, + "step": 955 + }, + { + "epoch": 0.1627511065713313, + "grad_norm": 1.5781581401824951, + "learning_rate": 1e-06, + "loss": 0.0613, + "step": 956 + }, + { + "epoch": 0.16292134831460675, + "grad_norm": 1.7862120866775513, + "learning_rate": 1e-06, + "loss": 0.0512, + "step": 957 + }, + { + "epoch": 0.16309159005788218, + "grad_norm": 2.01899790763855, + "learning_rate": 1e-06, + "loss": 0.0689, + "step": 958 + }, + { + "epoch": 0.16326183180115764, + "grad_norm": 1.6229596138000488, + "learning_rate": 1e-06, + "loss": 0.0539, + "step": 959 + }, + { + "epoch": 0.1634320735444331, + "grad_norm": 1.663812279701233, + "learning_rate": 1e-06, + "loss": 0.0629, + "step": 960 + }, + { + "epoch": 0.16360231528770855, + "grad_norm": 1.2906622886657715, + "learning_rate": 1e-06, + "loss": 0.0541, + "step": 961 + }, + { + "epoch": 0.163772557030984, + "grad_norm": 1.674870491027832, + "learning_rate": 1e-06, + "loss": 0.0665, + "step": 962 + }, + { + "epoch": 0.16394279877425944, + "grad_norm": 1.584000587463379, + "learning_rate": 1e-06, + "loss": 0.0559, + "step": 963 + }, + { + "epoch": 0.1641130405175349, + "grad_norm": 1.8097106218338013, + "learning_rate": 1e-06, + "loss": 0.0663, + "step": 964 + }, + { + "epoch": 0.16428328226081035, + "grad_norm": 1.6824593544006348, + "learning_rate": 1e-06, + "loss": 0.0592, + "step": 965 + }, + { + "epoch": 0.1644535240040858, + "grad_norm": 1.6757420301437378, + "learning_rate": 1e-06, + "loss": 0.0565, + "step": 966 + }, + { + "epoch": 0.16462376574736126, + "grad_norm": 1.91703462600708, + "learning_rate": 1e-06, + "loss": 0.0617, + "step": 967 + }, + { + "epoch": 0.1647940074906367, + "grad_norm": 1.8160761594772339, + "learning_rate": 1e-06, + "loss": 0.0579, + "step": 968 + }, + { + "epoch": 0.16496424923391215, + "grad_norm": 1.3976374864578247, + "learning_rate": 1e-06, + "loss": 0.0428, + "step": 969 + }, + { + "epoch": 0.1651344909771876, + "grad_norm": 2.016111373901367, + "learning_rate": 1e-06, + "loss": 0.054, + "step": 970 + }, + { + "epoch": 0.16530473272046306, + "grad_norm": 1.7259122133255005, + "learning_rate": 1e-06, + "loss": 0.0488, + "step": 971 + }, + { + "epoch": 0.16547497446373852, + "grad_norm": 1.5745218992233276, + "learning_rate": 1e-06, + "loss": 0.0569, + "step": 972 + }, + { + "epoch": 0.16564521620701397, + "grad_norm": 1.3785955905914307, + "learning_rate": 1e-06, + "loss": 0.0412, + "step": 973 + }, + { + "epoch": 0.1658154579502894, + "grad_norm": 1.76412034034729, + "learning_rate": 1e-06, + "loss": 0.0533, + "step": 974 + }, + { + "epoch": 0.16598569969356486, + "grad_norm": 1.6779992580413818, + "learning_rate": 1e-06, + "loss": 0.0636, + "step": 975 + }, + { + "epoch": 0.16615594143684032, + "grad_norm": 1.5417025089263916, + "learning_rate": 1e-06, + "loss": 0.0477, + "step": 976 + }, + { + "epoch": 0.16632618318011577, + "grad_norm": 1.7664419412612915, + "learning_rate": 1e-06, + "loss": 0.0628, + "step": 977 + }, + { + "epoch": 0.16649642492339123, + "grad_norm": 1.905269980430603, + "learning_rate": 1e-06, + "loss": 0.0582, + "step": 978 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 2.259796619415283, + "learning_rate": 1e-06, + "loss": 0.0721, + "step": 979 + }, + { + "epoch": 0.16683690840994211, + "grad_norm": 1.6967008113861084, + "learning_rate": 1e-06, + "loss": 0.0517, + "step": 980 + }, + { + "epoch": 0.16700715015321757, + "grad_norm": 2.18546986579895, + "learning_rate": 1e-06, + "loss": 0.0645, + "step": 981 + }, + { + "epoch": 0.16717739189649303, + "grad_norm": 1.5567885637283325, + "learning_rate": 1e-06, + "loss": 0.0471, + "step": 982 + }, + { + "epoch": 0.16734763363976848, + "grad_norm": 1.516837477684021, + "learning_rate": 1e-06, + "loss": 0.0442, + "step": 983 + }, + { + "epoch": 0.1675178753830439, + "grad_norm": 1.4908742904663086, + "learning_rate": 1e-06, + "loss": 0.0524, + "step": 984 + }, + { + "epoch": 0.16768811712631937, + "grad_norm": 1.570236086845398, + "learning_rate": 1e-06, + "loss": 0.054, + "step": 985 + }, + { + "epoch": 0.16785835886959483, + "grad_norm": 1.672652006149292, + "learning_rate": 1e-06, + "loss": 0.0613, + "step": 986 + }, + { + "epoch": 0.16802860061287028, + "grad_norm": 1.6871075630187988, + "learning_rate": 1e-06, + "loss": 0.0683, + "step": 987 + }, + { + "epoch": 0.16819884235614574, + "grad_norm": 1.6366732120513916, + "learning_rate": 1e-06, + "loss": 0.0542, + "step": 988 + }, + { + "epoch": 0.16836908409942117, + "grad_norm": 1.6381199359893799, + "learning_rate": 1e-06, + "loss": 0.0522, + "step": 989 + }, + { + "epoch": 0.16853932584269662, + "grad_norm": 1.6033238172531128, + "learning_rate": 1e-06, + "loss": 0.0519, + "step": 990 + }, + { + "epoch": 0.16870956758597208, + "grad_norm": 2.074885606765747, + "learning_rate": 1e-06, + "loss": 0.0668, + "step": 991 + }, + { + "epoch": 0.16887980932924754, + "grad_norm": 1.8530831336975098, + "learning_rate": 1e-06, + "loss": 0.0569, + "step": 992 + }, + { + "epoch": 0.169050051072523, + "grad_norm": 1.8304858207702637, + "learning_rate": 1e-06, + "loss": 0.0613, + "step": 993 + }, + { + "epoch": 0.16922029281579842, + "grad_norm": 2.9935953617095947, + "learning_rate": 1e-06, + "loss": 0.0868, + "step": 994 + }, + { + "epoch": 0.16939053455907388, + "grad_norm": 1.593682050704956, + "learning_rate": 1e-06, + "loss": 0.0525, + "step": 995 + }, + { + "epoch": 0.16956077630234934, + "grad_norm": 1.4647144079208374, + "learning_rate": 1e-06, + "loss": 0.0539, + "step": 996 + }, + { + "epoch": 0.1697310180456248, + "grad_norm": 1.6193076372146606, + "learning_rate": 1e-06, + "loss": 0.0556, + "step": 997 + }, + { + "epoch": 0.16990125978890025, + "grad_norm": 1.3825421333312988, + "learning_rate": 1e-06, + "loss": 0.0485, + "step": 998 + }, + { + "epoch": 0.17007150153217568, + "grad_norm": 1.6311777830123901, + "learning_rate": 1e-06, + "loss": 0.0604, + "step": 999 + }, + { + "epoch": 0.17024174327545114, + "grad_norm": 1.859731912612915, + "learning_rate": 1e-06, + "loss": 0.0722, + "step": 1000 + }, + { + "epoch": 0.17024174327545114, + "eval_loss": 0.262949138879776, + "eval_runtime": 21.3587, + "eval_samples_per_second": 14.046, + "eval_steps_per_second": 0.375, + "step": 1000 + }, + { + "epoch": 0.1704119850187266, + "grad_norm": 1.5789490938186646, + "learning_rate": 1e-06, + "loss": 0.0589, + "step": 1001 + }, + { + "epoch": 0.17058222676200205, + "grad_norm": 1.7059580087661743, + "learning_rate": 1e-06, + "loss": 0.0505, + "step": 1002 + }, + { + "epoch": 0.1707524685052775, + "grad_norm": 1.5325723886489868, + "learning_rate": 1e-06, + "loss": 0.0557, + "step": 1003 + }, + { + "epoch": 0.17092271024855293, + "grad_norm": 1.3633655309677124, + "learning_rate": 1e-06, + "loss": 0.0467, + "step": 1004 + }, + { + "epoch": 0.1710929519918284, + "grad_norm": 2.3447840213775635, + "learning_rate": 1e-06, + "loss": 0.0671, + "step": 1005 + }, + { + "epoch": 0.17126319373510385, + "grad_norm": 1.9885766506195068, + "learning_rate": 1e-06, + "loss": 0.0615, + "step": 1006 + }, + { + "epoch": 0.1714334354783793, + "grad_norm": 1.5275179147720337, + "learning_rate": 1e-06, + "loss": 0.0505, + "step": 1007 + }, + { + "epoch": 0.17160367722165476, + "grad_norm": 1.6165765523910522, + "learning_rate": 1e-06, + "loss": 0.0465, + "step": 1008 + }, + { + "epoch": 0.1717739189649302, + "grad_norm": 2.4540200233459473, + "learning_rate": 1e-06, + "loss": 0.0682, + "step": 1009 + }, + { + "epoch": 0.17194416070820565, + "grad_norm": 1.5132991075515747, + "learning_rate": 1e-06, + "loss": 0.0498, + "step": 1010 + }, + { + "epoch": 0.1721144024514811, + "grad_norm": 1.4574193954467773, + "learning_rate": 1e-06, + "loss": 0.0515, + "step": 1011 + }, + { + "epoch": 0.17228464419475656, + "grad_norm": 2.0926284790039062, + "learning_rate": 1e-06, + "loss": 0.0676, + "step": 1012 + }, + { + "epoch": 0.17245488593803202, + "grad_norm": 1.316934585571289, + "learning_rate": 1e-06, + "loss": 0.049, + "step": 1013 + }, + { + "epoch": 0.17262512768130744, + "grad_norm": 1.4470518827438354, + "learning_rate": 1e-06, + "loss": 0.0527, + "step": 1014 + }, + { + "epoch": 0.1727953694245829, + "grad_norm": 1.254742980003357, + "learning_rate": 1e-06, + "loss": 0.04, + "step": 1015 + }, + { + "epoch": 0.17296561116785836, + "grad_norm": 1.6093502044677734, + "learning_rate": 1e-06, + "loss": 0.0575, + "step": 1016 + }, + { + "epoch": 0.17313585291113381, + "grad_norm": 1.9037551879882812, + "learning_rate": 1e-06, + "loss": 0.0602, + "step": 1017 + }, + { + "epoch": 0.17330609465440927, + "grad_norm": 1.4383158683776855, + "learning_rate": 1e-06, + "loss": 0.0481, + "step": 1018 + }, + { + "epoch": 0.1734763363976847, + "grad_norm": 1.7174170017242432, + "learning_rate": 1e-06, + "loss": 0.0677, + "step": 1019 + }, + { + "epoch": 0.17364657814096016, + "grad_norm": 1.838952660560608, + "learning_rate": 1e-06, + "loss": 0.0555, + "step": 1020 + }, + { + "epoch": 0.1738168198842356, + "grad_norm": 1.9162826538085938, + "learning_rate": 1e-06, + "loss": 0.0619, + "step": 1021 + }, + { + "epoch": 0.17398706162751107, + "grad_norm": 1.5161962509155273, + "learning_rate": 1e-06, + "loss": 0.048, + "step": 1022 + }, + { + "epoch": 0.17415730337078653, + "grad_norm": 1.8026893138885498, + "learning_rate": 1e-06, + "loss": 0.0615, + "step": 1023 + }, + { + "epoch": 0.17432754511406195, + "grad_norm": 1.637364387512207, + "learning_rate": 1e-06, + "loss": 0.0497, + "step": 1024 + }, + { + "epoch": 0.1744977868573374, + "grad_norm": 1.7427301406860352, + "learning_rate": 1e-06, + "loss": 0.0467, + "step": 1025 + }, + { + "epoch": 0.17466802860061287, + "grad_norm": 1.6015710830688477, + "learning_rate": 1e-06, + "loss": 0.0456, + "step": 1026 + }, + { + "epoch": 0.17483827034388832, + "grad_norm": 1.5106914043426514, + "learning_rate": 1e-06, + "loss": 0.0583, + "step": 1027 + }, + { + "epoch": 0.17500851208716378, + "grad_norm": 1.7485862970352173, + "learning_rate": 1e-06, + "loss": 0.0743, + "step": 1028 + }, + { + "epoch": 0.1751787538304392, + "grad_norm": 1.5311837196350098, + "learning_rate": 1e-06, + "loss": 0.0555, + "step": 1029 + }, + { + "epoch": 0.17534899557371467, + "grad_norm": 1.6525341272354126, + "learning_rate": 1e-06, + "loss": 0.0566, + "step": 1030 + }, + { + "epoch": 0.17551923731699012, + "grad_norm": 1.3050777912139893, + "learning_rate": 1e-06, + "loss": 0.0454, + "step": 1031 + }, + { + "epoch": 0.17568947906026558, + "grad_norm": 1.8630820512771606, + "learning_rate": 1e-06, + "loss": 0.0615, + "step": 1032 + }, + { + "epoch": 0.17585972080354104, + "grad_norm": 1.974867582321167, + "learning_rate": 1e-06, + "loss": 0.0574, + "step": 1033 + }, + { + "epoch": 0.1760299625468165, + "grad_norm": 1.6042343378067017, + "learning_rate": 1e-06, + "loss": 0.0469, + "step": 1034 + }, + { + "epoch": 0.17620020429009192, + "grad_norm": 1.5605472326278687, + "learning_rate": 1e-06, + "loss": 0.0607, + "step": 1035 + }, + { + "epoch": 0.17637044603336738, + "grad_norm": 1.6991108655929565, + "learning_rate": 1e-06, + "loss": 0.0458, + "step": 1036 + }, + { + "epoch": 0.17654068777664284, + "grad_norm": 2.3888437747955322, + "learning_rate": 1e-06, + "loss": 0.0646, + "step": 1037 + }, + { + "epoch": 0.1767109295199183, + "grad_norm": 1.9789478778839111, + "learning_rate": 1e-06, + "loss": 0.0657, + "step": 1038 + }, + { + "epoch": 0.17688117126319375, + "grad_norm": 1.5386762619018555, + "learning_rate": 1e-06, + "loss": 0.0591, + "step": 1039 + }, + { + "epoch": 0.17705141300646918, + "grad_norm": 1.657543659210205, + "learning_rate": 1e-06, + "loss": 0.0504, + "step": 1040 + }, + { + "epoch": 0.17722165474974463, + "grad_norm": 1.5504556894302368, + "learning_rate": 1e-06, + "loss": 0.0467, + "step": 1041 + }, + { + "epoch": 0.1773918964930201, + "grad_norm": 1.718420147895813, + "learning_rate": 1e-06, + "loss": 0.0559, + "step": 1042 + }, + { + "epoch": 0.17756213823629555, + "grad_norm": 2.331953525543213, + "learning_rate": 1e-06, + "loss": 0.056, + "step": 1043 + }, + { + "epoch": 0.177732379979571, + "grad_norm": 1.6991020441055298, + "learning_rate": 1e-06, + "loss": 0.0474, + "step": 1044 + }, + { + "epoch": 0.17790262172284643, + "grad_norm": 1.8949966430664062, + "learning_rate": 1e-06, + "loss": 0.043, + "step": 1045 + }, + { + "epoch": 0.1780728634661219, + "grad_norm": 1.8064547777175903, + "learning_rate": 1e-06, + "loss": 0.0436, + "step": 1046 + }, + { + "epoch": 0.17824310520939735, + "grad_norm": 1.4450627565383911, + "learning_rate": 1e-06, + "loss": 0.0453, + "step": 1047 + }, + { + "epoch": 0.1784133469526728, + "grad_norm": 1.9606854915618896, + "learning_rate": 1e-06, + "loss": 0.052, + "step": 1048 + }, + { + "epoch": 0.17858358869594826, + "grad_norm": 1.8185573816299438, + "learning_rate": 1e-06, + "loss": 0.0438, + "step": 1049 + }, + { + "epoch": 0.1787538304392237, + "grad_norm": 1.8913239240646362, + "learning_rate": 1e-06, + "loss": 0.0456, + "step": 1050 + }, + { + "epoch": 0.17892407218249914, + "grad_norm": 1.6773005723953247, + "learning_rate": 1e-06, + "loss": 0.0502, + "step": 1051 + }, + { + "epoch": 0.1790943139257746, + "grad_norm": 1.6672918796539307, + "learning_rate": 1e-06, + "loss": 0.0471, + "step": 1052 + }, + { + "epoch": 0.17926455566905006, + "grad_norm": 1.8581830263137817, + "learning_rate": 1e-06, + "loss": 0.0539, + "step": 1053 + }, + { + "epoch": 0.17943479741232551, + "grad_norm": 1.4864681959152222, + "learning_rate": 1e-06, + "loss": 0.0546, + "step": 1054 + }, + { + "epoch": 0.17960503915560094, + "grad_norm": 1.527038812637329, + "learning_rate": 1e-06, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.1797752808988764, + "grad_norm": 1.6635370254516602, + "learning_rate": 1e-06, + "loss": 0.0457, + "step": 1056 + }, + { + "epoch": 0.17994552264215186, + "grad_norm": 1.5412622690200806, + "learning_rate": 1e-06, + "loss": 0.0437, + "step": 1057 + }, + { + "epoch": 0.1801157643854273, + "grad_norm": 1.4829204082489014, + "learning_rate": 1e-06, + "loss": 0.0493, + "step": 1058 + }, + { + "epoch": 0.18028600612870277, + "grad_norm": 1.4343328475952148, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 1059 + }, + { + "epoch": 0.1804562478719782, + "grad_norm": 1.7797609567642212, + "learning_rate": 1e-06, + "loss": 0.0464, + "step": 1060 + }, + { + "epoch": 0.18062648961525365, + "grad_norm": 1.488383412361145, + "learning_rate": 1e-06, + "loss": 0.0355, + "step": 1061 + }, + { + "epoch": 0.1807967313585291, + "grad_norm": 1.3695396184921265, + "learning_rate": 1e-06, + "loss": 0.048, + "step": 1062 + }, + { + "epoch": 0.18096697310180457, + "grad_norm": 1.8657076358795166, + "learning_rate": 1e-06, + "loss": 0.0478, + "step": 1063 + }, + { + "epoch": 0.18113721484508002, + "grad_norm": 1.7366667985916138, + "learning_rate": 1e-06, + "loss": 0.0429, + "step": 1064 + }, + { + "epoch": 0.18130745658835545, + "grad_norm": 1.4432430267333984, + "learning_rate": 1e-06, + "loss": 0.0451, + "step": 1065 + }, + { + "epoch": 0.1814776983316309, + "grad_norm": 1.6333496570587158, + "learning_rate": 1e-06, + "loss": 0.0612, + "step": 1066 + }, + { + "epoch": 0.18164794007490637, + "grad_norm": 2.0347399711608887, + "learning_rate": 1e-06, + "loss": 0.0544, + "step": 1067 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 2.103520393371582, + "learning_rate": 1e-06, + "loss": 0.0528, + "step": 1068 + }, + { + "epoch": 0.18198842356145728, + "grad_norm": 2.289806604385376, + "learning_rate": 1e-06, + "loss": 0.0714, + "step": 1069 + }, + { + "epoch": 0.1821586653047327, + "grad_norm": 1.8326396942138672, + "learning_rate": 1e-06, + "loss": 0.0477, + "step": 1070 + }, + { + "epoch": 0.18232890704800817, + "grad_norm": 1.6226252317428589, + "learning_rate": 1e-06, + "loss": 0.0596, + "step": 1071 + }, + { + "epoch": 0.18249914879128362, + "grad_norm": 1.6972850561141968, + "learning_rate": 1e-06, + "loss": 0.0555, + "step": 1072 + }, + { + "epoch": 0.18266939053455908, + "grad_norm": 1.7216845750808716, + "learning_rate": 1e-06, + "loss": 0.0727, + "step": 1073 + }, + { + "epoch": 0.18283963227783454, + "grad_norm": 1.396399974822998, + "learning_rate": 1e-06, + "loss": 0.0451, + "step": 1074 + }, + { + "epoch": 0.18300987402110996, + "grad_norm": 1.5802775621414185, + "learning_rate": 1e-06, + "loss": 0.0523, + "step": 1075 + }, + { + "epoch": 0.18318011576438542, + "grad_norm": 1.5000909566879272, + "learning_rate": 1e-06, + "loss": 0.0503, + "step": 1076 + }, + { + "epoch": 0.18335035750766088, + "grad_norm": 1.5903329849243164, + "learning_rate": 1e-06, + "loss": 0.0453, + "step": 1077 + }, + { + "epoch": 0.18352059925093633, + "grad_norm": 1.3907889127731323, + "learning_rate": 1e-06, + "loss": 0.0482, + "step": 1078 + }, + { + "epoch": 0.1836908409942118, + "grad_norm": 1.4356728792190552, + "learning_rate": 1e-06, + "loss": 0.0358, + "step": 1079 + }, + { + "epoch": 0.18386108273748722, + "grad_norm": 1.425837755203247, + "learning_rate": 1e-06, + "loss": 0.0481, + "step": 1080 + }, + { + "epoch": 0.18403132448076268, + "grad_norm": 1.793095588684082, + "learning_rate": 1e-06, + "loss": 0.0577, + "step": 1081 + }, + { + "epoch": 0.18420156622403813, + "grad_norm": 1.5674705505371094, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 1082 + }, + { + "epoch": 0.1843718079673136, + "grad_norm": 1.527916431427002, + "learning_rate": 1e-06, + "loss": 0.0394, + "step": 1083 + }, + { + "epoch": 0.18454204971058905, + "grad_norm": 1.8252404928207397, + "learning_rate": 1e-06, + "loss": 0.0506, + "step": 1084 + }, + { + "epoch": 0.18471229145386447, + "grad_norm": 1.6396743059158325, + "learning_rate": 1e-06, + "loss": 0.0462, + "step": 1085 + }, + { + "epoch": 0.18488253319713993, + "grad_norm": 1.7540291547775269, + "learning_rate": 1e-06, + "loss": 0.051, + "step": 1086 + }, + { + "epoch": 0.1850527749404154, + "grad_norm": 1.7107782363891602, + "learning_rate": 1e-06, + "loss": 0.0552, + "step": 1087 + }, + { + "epoch": 0.18522301668369084, + "grad_norm": 1.7277930974960327, + "learning_rate": 1e-06, + "loss": 0.0558, + "step": 1088 + }, + { + "epoch": 0.1853932584269663, + "grad_norm": 1.531584620475769, + "learning_rate": 1e-06, + "loss": 0.0436, + "step": 1089 + }, + { + "epoch": 0.18556350017024173, + "grad_norm": 1.708498477935791, + "learning_rate": 1e-06, + "loss": 0.0599, + "step": 1090 + }, + { + "epoch": 0.1857337419135172, + "grad_norm": 1.5215576887130737, + "learning_rate": 1e-06, + "loss": 0.0413, + "step": 1091 + }, + { + "epoch": 0.18590398365679264, + "grad_norm": 1.6072683334350586, + "learning_rate": 1e-06, + "loss": 0.0472, + "step": 1092 + }, + { + "epoch": 0.1860742254000681, + "grad_norm": 2.161273717880249, + "learning_rate": 1e-06, + "loss": 0.053, + "step": 1093 + }, + { + "epoch": 0.18624446714334356, + "grad_norm": 2.1874208450317383, + "learning_rate": 1e-06, + "loss": 0.0676, + "step": 1094 + }, + { + "epoch": 0.186414708886619, + "grad_norm": 1.7869967222213745, + "learning_rate": 1e-06, + "loss": 0.0566, + "step": 1095 + }, + { + "epoch": 0.18658495062989444, + "grad_norm": 1.7522093057632446, + "learning_rate": 1e-06, + "loss": 0.0482, + "step": 1096 + }, + { + "epoch": 0.1867551923731699, + "grad_norm": 1.5887916088104248, + "learning_rate": 1e-06, + "loss": 0.0439, + "step": 1097 + }, + { + "epoch": 0.18692543411644535, + "grad_norm": 1.3974252939224243, + "learning_rate": 1e-06, + "loss": 0.0413, + "step": 1098 + }, + { + "epoch": 0.1870956758597208, + "grad_norm": 1.798572301864624, + "learning_rate": 1e-06, + "loss": 0.0612, + "step": 1099 + }, + { + "epoch": 0.18726591760299627, + "grad_norm": 2.0376672744750977, + "learning_rate": 1e-06, + "loss": 0.0512, + "step": 1100 + }, + { + "epoch": 0.1874361593462717, + "grad_norm": 1.7600435018539429, + "learning_rate": 1e-06, + "loss": 0.0516, + "step": 1101 + }, + { + "epoch": 0.18760640108954715, + "grad_norm": 1.638627529144287, + "learning_rate": 1e-06, + "loss": 0.0438, + "step": 1102 + }, + { + "epoch": 0.1877766428328226, + "grad_norm": 1.4002454280853271, + "learning_rate": 1e-06, + "loss": 0.0406, + "step": 1103 + }, + { + "epoch": 0.18794688457609807, + "grad_norm": 1.3637229204177856, + "learning_rate": 1e-06, + "loss": 0.0363, + "step": 1104 + }, + { + "epoch": 0.18811712631937352, + "grad_norm": 1.7648390531539917, + "learning_rate": 1e-06, + "loss": 0.0491, + "step": 1105 + }, + { + "epoch": 0.18828736806264895, + "grad_norm": 1.8793604373931885, + "learning_rate": 1e-06, + "loss": 0.0658, + "step": 1106 + }, + { + "epoch": 0.1884576098059244, + "grad_norm": 1.87686288356781, + "learning_rate": 1e-06, + "loss": 0.0495, + "step": 1107 + }, + { + "epoch": 0.18862785154919987, + "grad_norm": 1.5583778619766235, + "learning_rate": 1e-06, + "loss": 0.047, + "step": 1108 + }, + { + "epoch": 0.18879809329247532, + "grad_norm": 1.7888959646224976, + "learning_rate": 1e-06, + "loss": 0.0373, + "step": 1109 + }, + { + "epoch": 0.18896833503575078, + "grad_norm": 1.628745436668396, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 1110 + }, + { + "epoch": 0.1891385767790262, + "grad_norm": 2.113985776901245, + "learning_rate": 1e-06, + "loss": 0.0488, + "step": 1111 + }, + { + "epoch": 0.18930881852230166, + "grad_norm": 2.1266591548919678, + "learning_rate": 1e-06, + "loss": 0.058, + "step": 1112 + }, + { + "epoch": 0.18947906026557712, + "grad_norm": 1.9797148704528809, + "learning_rate": 1e-06, + "loss": 0.0585, + "step": 1113 + }, + { + "epoch": 0.18964930200885258, + "grad_norm": 1.6259199380874634, + "learning_rate": 1e-06, + "loss": 0.0519, + "step": 1114 + }, + { + "epoch": 0.18981954375212803, + "grad_norm": 1.7262063026428223, + "learning_rate": 1e-06, + "loss": 0.038, + "step": 1115 + }, + { + "epoch": 0.18998978549540346, + "grad_norm": 1.5812803506851196, + "learning_rate": 1e-06, + "loss": 0.0487, + "step": 1116 + }, + { + "epoch": 0.19016002723867892, + "grad_norm": 2.021449327468872, + "learning_rate": 1e-06, + "loss": 0.057, + "step": 1117 + }, + { + "epoch": 0.19033026898195438, + "grad_norm": 1.7292096614837646, + "learning_rate": 1e-06, + "loss": 0.0379, + "step": 1118 + }, + { + "epoch": 0.19050051072522983, + "grad_norm": 1.4616914987564087, + "learning_rate": 1e-06, + "loss": 0.0401, + "step": 1119 + }, + { + "epoch": 0.1906707524685053, + "grad_norm": 1.4097241163253784, + "learning_rate": 1e-06, + "loss": 0.0463, + "step": 1120 + }, + { + "epoch": 0.19084099421178072, + "grad_norm": 1.8328516483306885, + "learning_rate": 1e-06, + "loss": 0.0454, + "step": 1121 + }, + { + "epoch": 0.19101123595505617, + "grad_norm": 2.055018663406372, + "learning_rate": 1e-06, + "loss": 0.0606, + "step": 1122 + }, + { + "epoch": 0.19118147769833163, + "grad_norm": 1.4977216720581055, + "learning_rate": 1e-06, + "loss": 0.0347, + "step": 1123 + }, + { + "epoch": 0.1913517194416071, + "grad_norm": 1.8649815320968628, + "learning_rate": 1e-06, + "loss": 0.0484, + "step": 1124 + }, + { + "epoch": 0.19152196118488254, + "grad_norm": 1.5965502262115479, + "learning_rate": 1e-06, + "loss": 0.0413, + "step": 1125 + }, + { + "epoch": 0.19169220292815797, + "grad_norm": 1.7801200151443481, + "learning_rate": 1e-06, + "loss": 0.0462, + "step": 1126 + }, + { + "epoch": 0.19186244467143343, + "grad_norm": 5.706264495849609, + "learning_rate": 1e-06, + "loss": 0.1093, + "step": 1127 + }, + { + "epoch": 0.1920326864147089, + "grad_norm": 1.6867637634277344, + "learning_rate": 1e-06, + "loss": 0.0417, + "step": 1128 + }, + { + "epoch": 0.19220292815798434, + "grad_norm": 1.7014727592468262, + "learning_rate": 1e-06, + "loss": 0.0461, + "step": 1129 + }, + { + "epoch": 0.1923731699012598, + "grad_norm": 1.6920650005340576, + "learning_rate": 1e-06, + "loss": 0.0513, + "step": 1130 + }, + { + "epoch": 0.19254341164453523, + "grad_norm": 1.7450604438781738, + "learning_rate": 1e-06, + "loss": 0.0562, + "step": 1131 + }, + { + "epoch": 0.19271365338781068, + "grad_norm": 1.501325011253357, + "learning_rate": 1e-06, + "loss": 0.0447, + "step": 1132 + }, + { + "epoch": 0.19288389513108614, + "grad_norm": 1.8626445531845093, + "learning_rate": 1e-06, + "loss": 0.0755, + "step": 1133 + }, + { + "epoch": 0.1930541368743616, + "grad_norm": 1.3214149475097656, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 1134 + }, + { + "epoch": 0.19322437861763705, + "grad_norm": 1.639206886291504, + "learning_rate": 1e-06, + "loss": 0.0486, + "step": 1135 + }, + { + "epoch": 0.19339462036091248, + "grad_norm": 1.6951420307159424, + "learning_rate": 1e-06, + "loss": 0.0458, + "step": 1136 + }, + { + "epoch": 0.19356486210418794, + "grad_norm": 1.5768959522247314, + "learning_rate": 1e-06, + "loss": 0.0466, + "step": 1137 + }, + { + "epoch": 0.1937351038474634, + "grad_norm": 1.6302675008773804, + "learning_rate": 1e-06, + "loss": 0.0413, + "step": 1138 + }, + { + "epoch": 0.19390534559073885, + "grad_norm": 1.4018832445144653, + "learning_rate": 1e-06, + "loss": 0.0429, + "step": 1139 + }, + { + "epoch": 0.1940755873340143, + "grad_norm": 1.482530951499939, + "learning_rate": 1e-06, + "loss": 0.0342, + "step": 1140 + }, + { + "epoch": 0.19424582907728974, + "grad_norm": 1.4372907876968384, + "learning_rate": 1e-06, + "loss": 0.0384, + "step": 1141 + }, + { + "epoch": 0.1944160708205652, + "grad_norm": 2.328382730484009, + "learning_rate": 1e-06, + "loss": 0.0549, + "step": 1142 + }, + { + "epoch": 0.19458631256384065, + "grad_norm": 1.780719518661499, + "learning_rate": 1e-06, + "loss": 0.0468, + "step": 1143 + }, + { + "epoch": 0.1947565543071161, + "grad_norm": 1.8703736066818237, + "learning_rate": 1e-06, + "loss": 0.044, + "step": 1144 + }, + { + "epoch": 0.19492679605039157, + "grad_norm": 1.92543625831604, + "learning_rate": 1e-06, + "loss": 0.0583, + "step": 1145 + }, + { + "epoch": 0.195097037793667, + "grad_norm": 1.7063385248184204, + "learning_rate": 1e-06, + "loss": 0.0481, + "step": 1146 + }, + { + "epoch": 0.19526727953694245, + "grad_norm": 1.2378170490264893, + "learning_rate": 1e-06, + "loss": 0.0309, + "step": 1147 + }, + { + "epoch": 0.1954375212802179, + "grad_norm": 1.4159069061279297, + "learning_rate": 1e-06, + "loss": 0.0413, + "step": 1148 + }, + { + "epoch": 0.19560776302349336, + "grad_norm": 1.816462516784668, + "learning_rate": 1e-06, + "loss": 0.0421, + "step": 1149 + }, + { + "epoch": 0.19577800476676882, + "grad_norm": 1.6749504804611206, + "learning_rate": 1e-06, + "loss": 0.0514, + "step": 1150 + }, + { + "epoch": 0.19594824651004425, + "grad_norm": 1.813492774963379, + "learning_rate": 1e-06, + "loss": 0.0387, + "step": 1151 + }, + { + "epoch": 0.1961184882533197, + "grad_norm": 1.8329745531082153, + "learning_rate": 1e-06, + "loss": 0.0493, + "step": 1152 + }, + { + "epoch": 0.19628872999659516, + "grad_norm": 2.6087090969085693, + "learning_rate": 1e-06, + "loss": 0.0479, + "step": 1153 + }, + { + "epoch": 0.19645897173987062, + "grad_norm": 2.178959369659424, + "learning_rate": 1e-06, + "loss": 0.0515, + "step": 1154 + }, + { + "epoch": 0.19662921348314608, + "grad_norm": 1.7682230472564697, + "learning_rate": 1e-06, + "loss": 0.0426, + "step": 1155 + }, + { + "epoch": 0.1967994552264215, + "grad_norm": 1.9333738088607788, + "learning_rate": 1e-06, + "loss": 0.0395, + "step": 1156 + }, + { + "epoch": 0.19696969696969696, + "grad_norm": 2.0627245903015137, + "learning_rate": 1e-06, + "loss": 0.0678, + "step": 1157 + }, + { + "epoch": 0.19713993871297242, + "grad_norm": 1.649121642112732, + "learning_rate": 1e-06, + "loss": 0.0548, + "step": 1158 + }, + { + "epoch": 0.19731018045624787, + "grad_norm": 1.3847073316574097, + "learning_rate": 1e-06, + "loss": 0.0336, + "step": 1159 + }, + { + "epoch": 0.19748042219952333, + "grad_norm": 1.4557738304138184, + "learning_rate": 1e-06, + "loss": 0.0334, + "step": 1160 + }, + { + "epoch": 0.1976506639427988, + "grad_norm": 1.7297767400741577, + "learning_rate": 1e-06, + "loss": 0.0404, + "step": 1161 + }, + { + "epoch": 0.19782090568607422, + "grad_norm": 1.6031403541564941, + "learning_rate": 1e-06, + "loss": 0.0457, + "step": 1162 + }, + { + "epoch": 0.19799114742934967, + "grad_norm": 1.4427376985549927, + "learning_rate": 1e-06, + "loss": 0.0394, + "step": 1163 + }, + { + "epoch": 0.19816138917262513, + "grad_norm": 1.751774787902832, + "learning_rate": 1e-06, + "loss": 0.0394, + "step": 1164 + }, + { + "epoch": 0.1983316309159006, + "grad_norm": 1.5437021255493164, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 1165 + }, + { + "epoch": 0.19850187265917604, + "grad_norm": 1.793131947517395, + "learning_rate": 1e-06, + "loss": 0.0521, + "step": 1166 + }, + { + "epoch": 0.19867211440245147, + "grad_norm": 1.4936705827713013, + "learning_rate": 1e-06, + "loss": 0.0365, + "step": 1167 + }, + { + "epoch": 0.19884235614572693, + "grad_norm": 1.961836814880371, + "learning_rate": 1e-06, + "loss": 0.0466, + "step": 1168 + }, + { + "epoch": 0.19901259788900239, + "grad_norm": 1.4718447923660278, + "learning_rate": 1e-06, + "loss": 0.0377, + "step": 1169 + }, + { + "epoch": 0.19918283963227784, + "grad_norm": 1.747491717338562, + "learning_rate": 1e-06, + "loss": 0.0542, + "step": 1170 + }, + { + "epoch": 0.1993530813755533, + "grad_norm": 1.4570775032043457, + "learning_rate": 1e-06, + "loss": 0.0378, + "step": 1171 + }, + { + "epoch": 0.19952332311882873, + "grad_norm": 1.637116551399231, + "learning_rate": 1e-06, + "loss": 0.0397, + "step": 1172 + }, + { + "epoch": 0.19969356486210418, + "grad_norm": 1.496801733970642, + "learning_rate": 1e-06, + "loss": 0.0526, + "step": 1173 + }, + { + "epoch": 0.19986380660537964, + "grad_norm": 1.7981535196304321, + "learning_rate": 1e-06, + "loss": 0.0393, + "step": 1174 + }, + { + "epoch": 0.2000340483486551, + "grad_norm": 1.7196568250656128, + "learning_rate": 1e-06, + "loss": 0.0409, + "step": 1175 + }, + { + "epoch": 0.20020429009193055, + "grad_norm": 2.1747894287109375, + "learning_rate": 1e-06, + "loss": 0.0571, + "step": 1176 + }, + { + "epoch": 0.20037453183520598, + "grad_norm": 1.8108248710632324, + "learning_rate": 1e-06, + "loss": 0.0495, + "step": 1177 + }, + { + "epoch": 0.20054477357848144, + "grad_norm": 1.820467233657837, + "learning_rate": 1e-06, + "loss": 0.0341, + "step": 1178 + }, + { + "epoch": 0.2007150153217569, + "grad_norm": 1.7773289680480957, + "learning_rate": 1e-06, + "loss": 0.0464, + "step": 1179 + }, + { + "epoch": 0.20088525706503235, + "grad_norm": 1.433434247970581, + "learning_rate": 1e-06, + "loss": 0.0351, + "step": 1180 + }, + { + "epoch": 0.2010554988083078, + "grad_norm": 1.836634874343872, + "learning_rate": 1e-06, + "loss": 0.0588, + "step": 1181 + }, + { + "epoch": 0.20122574055158324, + "grad_norm": 1.8555299043655396, + "learning_rate": 1e-06, + "loss": 0.0517, + "step": 1182 + }, + { + "epoch": 0.2013959822948587, + "grad_norm": 1.853201985359192, + "learning_rate": 1e-06, + "loss": 0.0396, + "step": 1183 + }, + { + "epoch": 0.20156622403813415, + "grad_norm": 1.4348528385162354, + "learning_rate": 1e-06, + "loss": 0.0393, + "step": 1184 + }, + { + "epoch": 0.2017364657814096, + "grad_norm": 1.7525590658187866, + "learning_rate": 1e-06, + "loss": 0.043, + "step": 1185 + }, + { + "epoch": 0.20190670752468506, + "grad_norm": 1.9075602293014526, + "learning_rate": 1e-06, + "loss": 0.0399, + "step": 1186 + }, + { + "epoch": 0.2020769492679605, + "grad_norm": 1.6843738555908203, + "learning_rate": 1e-06, + "loss": 0.0434, + "step": 1187 + }, + { + "epoch": 0.20224719101123595, + "grad_norm": 1.3292150497436523, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 1188 + }, + { + "epoch": 0.2024174327545114, + "grad_norm": 1.8276677131652832, + "learning_rate": 1e-06, + "loss": 0.0394, + "step": 1189 + }, + { + "epoch": 0.20258767449778686, + "grad_norm": 1.8090132474899292, + "learning_rate": 1e-06, + "loss": 0.0448, + "step": 1190 + }, + { + "epoch": 0.20275791624106232, + "grad_norm": 1.500030755996704, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 1191 + }, + { + "epoch": 0.20292815798433775, + "grad_norm": 1.9649564027786255, + "learning_rate": 1e-06, + "loss": 0.0413, + "step": 1192 + }, + { + "epoch": 0.2030983997276132, + "grad_norm": 1.6460673809051514, + "learning_rate": 1e-06, + "loss": 0.0322, + "step": 1193 + }, + { + "epoch": 0.20326864147088866, + "grad_norm": 2.098881959915161, + "learning_rate": 1e-06, + "loss": 0.0381, + "step": 1194 + }, + { + "epoch": 0.20343888321416412, + "grad_norm": 1.7853596210479736, + "learning_rate": 1e-06, + "loss": 0.0428, + "step": 1195 + }, + { + "epoch": 0.20360912495743957, + "grad_norm": 2.36584210395813, + "learning_rate": 1e-06, + "loss": 0.0648, + "step": 1196 + }, + { + "epoch": 0.203779366700715, + "grad_norm": 2.340277671813965, + "learning_rate": 1e-06, + "loss": 0.0431, + "step": 1197 + }, + { + "epoch": 0.20394960844399046, + "grad_norm": 1.4898146390914917, + "learning_rate": 1e-06, + "loss": 0.0441, + "step": 1198 + }, + { + "epoch": 0.20411985018726592, + "grad_norm": 1.8527204990386963, + "learning_rate": 1e-06, + "loss": 0.0483, + "step": 1199 + }, + { + "epoch": 0.20429009193054137, + "grad_norm": 1.7014769315719604, + "learning_rate": 1e-06, + "loss": 0.0329, + "step": 1200 + }, + { + "epoch": 0.20446033367381683, + "grad_norm": 1.4059642553329468, + "learning_rate": 1e-06, + "loss": 0.0355, + "step": 1201 + }, + { + "epoch": 0.20463057541709226, + "grad_norm": 2.0433590412139893, + "learning_rate": 1e-06, + "loss": 0.0547, + "step": 1202 + }, + { + "epoch": 0.20480081716036772, + "grad_norm": 1.3744386434555054, + "learning_rate": 1e-06, + "loss": 0.0304, + "step": 1203 + }, + { + "epoch": 0.20497105890364317, + "grad_norm": 1.6091663837432861, + "learning_rate": 1e-06, + "loss": 0.0469, + "step": 1204 + }, + { + "epoch": 0.20514130064691863, + "grad_norm": 1.5374170541763306, + "learning_rate": 1e-06, + "loss": 0.0378, + "step": 1205 + }, + { + "epoch": 0.20531154239019409, + "grad_norm": 1.6638165712356567, + "learning_rate": 1e-06, + "loss": 0.04, + "step": 1206 + }, + { + "epoch": 0.2054817841334695, + "grad_norm": 1.4747909307479858, + "learning_rate": 1e-06, + "loss": 0.05, + "step": 1207 + }, + { + "epoch": 0.20565202587674497, + "grad_norm": 1.9081274271011353, + "learning_rate": 1e-06, + "loss": 0.0428, + "step": 1208 + }, + { + "epoch": 0.20582226762002043, + "grad_norm": 1.4791818857192993, + "learning_rate": 1e-06, + "loss": 0.0359, + "step": 1209 + }, + { + "epoch": 0.20599250936329588, + "grad_norm": 1.411188006401062, + "learning_rate": 1e-06, + "loss": 0.0325, + "step": 1210 + }, + { + "epoch": 0.20616275110657134, + "grad_norm": 2.2119061946868896, + "learning_rate": 1e-06, + "loss": 0.0473, + "step": 1211 + }, + { + "epoch": 0.20633299284984677, + "grad_norm": 1.4180293083190918, + "learning_rate": 1e-06, + "loss": 0.0399, + "step": 1212 + }, + { + "epoch": 0.20650323459312223, + "grad_norm": 1.3908727169036865, + "learning_rate": 1e-06, + "loss": 0.0362, + "step": 1213 + }, + { + "epoch": 0.20667347633639768, + "grad_norm": 2.12986421585083, + "learning_rate": 1e-06, + "loss": 0.0596, + "step": 1214 + }, + { + "epoch": 0.20684371807967314, + "grad_norm": 1.529612421989441, + "learning_rate": 1e-06, + "loss": 0.0486, + "step": 1215 + }, + { + "epoch": 0.2070139598229486, + "grad_norm": 1.533721685409546, + "learning_rate": 1e-06, + "loss": 0.0343, + "step": 1216 + }, + { + "epoch": 0.20718420156622402, + "grad_norm": 1.1491042375564575, + "learning_rate": 1e-06, + "loss": 0.0301, + "step": 1217 + }, + { + "epoch": 0.20735444330949948, + "grad_norm": 1.4477553367614746, + "learning_rate": 1e-06, + "loss": 0.0401, + "step": 1218 + }, + { + "epoch": 0.20752468505277494, + "grad_norm": 1.9513334035873413, + "learning_rate": 1e-06, + "loss": 0.0453, + "step": 1219 + }, + { + "epoch": 0.2076949267960504, + "grad_norm": 1.6228469610214233, + "learning_rate": 1e-06, + "loss": 0.0335, + "step": 1220 + }, + { + "epoch": 0.20786516853932585, + "grad_norm": 1.736630916595459, + "learning_rate": 1e-06, + "loss": 0.0431, + "step": 1221 + }, + { + "epoch": 0.2080354102826013, + "grad_norm": 1.7524003982543945, + "learning_rate": 1e-06, + "loss": 0.0354, + "step": 1222 + }, + { + "epoch": 0.20820565202587674, + "grad_norm": 1.9606053829193115, + "learning_rate": 1e-06, + "loss": 0.0413, + "step": 1223 + }, + { + "epoch": 0.2083758937691522, + "grad_norm": 1.4940640926361084, + "learning_rate": 1e-06, + "loss": 0.0351, + "step": 1224 + }, + { + "epoch": 0.20854613551242765, + "grad_norm": 1.542036533355713, + "learning_rate": 1e-06, + "loss": 0.0445, + "step": 1225 + }, + { + "epoch": 0.2087163772557031, + "grad_norm": 1.5856529474258423, + "learning_rate": 1e-06, + "loss": 0.0389, + "step": 1226 + }, + { + "epoch": 0.20888661899897856, + "grad_norm": 1.537645697593689, + "learning_rate": 1e-06, + "loss": 0.0402, + "step": 1227 + }, + { + "epoch": 0.209056860742254, + "grad_norm": 1.4162458181381226, + "learning_rate": 1e-06, + "loss": 0.0421, + "step": 1228 + }, + { + "epoch": 0.20922710248552945, + "grad_norm": 2.1916866302490234, + "learning_rate": 1e-06, + "loss": 0.038, + "step": 1229 + }, + { + "epoch": 0.2093973442288049, + "grad_norm": 1.5257288217544556, + "learning_rate": 1e-06, + "loss": 0.0417, + "step": 1230 + }, + { + "epoch": 0.20956758597208036, + "grad_norm": 1.9720427989959717, + "learning_rate": 1e-06, + "loss": 0.0439, + "step": 1231 + }, + { + "epoch": 0.20973782771535582, + "grad_norm": 1.6400502920150757, + "learning_rate": 1e-06, + "loss": 0.0373, + "step": 1232 + }, + { + "epoch": 0.20990806945863125, + "grad_norm": 1.5405869483947754, + "learning_rate": 1e-06, + "loss": 0.0459, + "step": 1233 + }, + { + "epoch": 0.2100783112019067, + "grad_norm": 1.7419536113739014, + "learning_rate": 1e-06, + "loss": 0.0407, + "step": 1234 + }, + { + "epoch": 0.21024855294518216, + "grad_norm": 1.5766996145248413, + "learning_rate": 1e-06, + "loss": 0.0347, + "step": 1235 + }, + { + "epoch": 0.21041879468845762, + "grad_norm": 2.0112199783325195, + "learning_rate": 1e-06, + "loss": 0.058, + "step": 1236 + }, + { + "epoch": 0.21058903643173307, + "grad_norm": 2.1589255332946777, + "learning_rate": 1e-06, + "loss": 0.0524, + "step": 1237 + }, + { + "epoch": 0.2107592781750085, + "grad_norm": 1.5398106575012207, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 1238 + }, + { + "epoch": 0.21092951991828396, + "grad_norm": 5.148733139038086, + "learning_rate": 1e-06, + "loss": 0.0627, + "step": 1239 + }, + { + "epoch": 0.21109976166155942, + "grad_norm": 1.5265625715255737, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 1240 + }, + { + "epoch": 0.21127000340483487, + "grad_norm": 2.0388882160186768, + "learning_rate": 1e-06, + "loss": 0.0422, + "step": 1241 + }, + { + "epoch": 0.21144024514811033, + "grad_norm": 1.5845919847488403, + "learning_rate": 1e-06, + "loss": 0.0388, + "step": 1242 + }, + { + "epoch": 0.21161048689138576, + "grad_norm": 1.984523057937622, + "learning_rate": 1e-06, + "loss": 0.0507, + "step": 1243 + }, + { + "epoch": 0.21178072863466121, + "grad_norm": 1.4896509647369385, + "learning_rate": 1e-06, + "loss": 0.0357, + "step": 1244 + }, + { + "epoch": 0.21195097037793667, + "grad_norm": 1.240064024925232, + "learning_rate": 1e-06, + "loss": 0.0345, + "step": 1245 + }, + { + "epoch": 0.21212121212121213, + "grad_norm": 1.753086805343628, + "learning_rate": 1e-06, + "loss": 0.0492, + "step": 1246 + }, + { + "epoch": 0.21229145386448758, + "grad_norm": 1.9514403343200684, + "learning_rate": 1e-06, + "loss": 0.038, + "step": 1247 + }, + { + "epoch": 0.212461695607763, + "grad_norm": 1.8083099126815796, + "learning_rate": 1e-06, + "loss": 0.0425, + "step": 1248 + }, + { + "epoch": 0.21263193735103847, + "grad_norm": 1.6861008405685425, + "learning_rate": 1e-06, + "loss": 0.043, + "step": 1249 + }, + { + "epoch": 0.21280217909431393, + "grad_norm": 1.4214813709259033, + "learning_rate": 1e-06, + "loss": 0.037, + "step": 1250 + }, + { + "epoch": 0.21297242083758938, + "grad_norm": 1.8322423696517944, + "learning_rate": 1e-06, + "loss": 0.0427, + "step": 1251 + }, + { + "epoch": 0.21314266258086484, + "grad_norm": 4.470928192138672, + "learning_rate": 1e-06, + "loss": 0.0992, + "step": 1252 + }, + { + "epoch": 0.21331290432414027, + "grad_norm": 1.9335957765579224, + "learning_rate": 1e-06, + "loss": 0.0402, + "step": 1253 + }, + { + "epoch": 0.21348314606741572, + "grad_norm": 1.5598244667053223, + "learning_rate": 1e-06, + "loss": 0.0402, + "step": 1254 + }, + { + "epoch": 0.21365338781069118, + "grad_norm": 1.5040446519851685, + "learning_rate": 1e-06, + "loss": 0.0361, + "step": 1255 + }, + { + "epoch": 0.21382362955396664, + "grad_norm": 2.0225491523742676, + "learning_rate": 1e-06, + "loss": 0.0518, + "step": 1256 + }, + { + "epoch": 0.2139938712972421, + "grad_norm": 1.7718695402145386, + "learning_rate": 1e-06, + "loss": 0.0487, + "step": 1257 + }, + { + "epoch": 0.21416411304051752, + "grad_norm": 2.1986560821533203, + "learning_rate": 1e-06, + "loss": 0.0538, + "step": 1258 + }, + { + "epoch": 0.21433435478379298, + "grad_norm": 1.7088544368743896, + "learning_rate": 1e-06, + "loss": 0.0452, + "step": 1259 + }, + { + "epoch": 0.21450459652706844, + "grad_norm": 1.5693634748458862, + "learning_rate": 1e-06, + "loss": 0.036, + "step": 1260 + }, + { + "epoch": 0.2146748382703439, + "grad_norm": 1.8983958959579468, + "learning_rate": 1e-06, + "loss": 0.0543, + "step": 1261 + }, + { + "epoch": 0.21484508001361935, + "grad_norm": 1.3244619369506836, + "learning_rate": 1e-06, + "loss": 0.032, + "step": 1262 + }, + { + "epoch": 0.21501532175689478, + "grad_norm": 1.7973686456680298, + "learning_rate": 1e-06, + "loss": 0.0458, + "step": 1263 + }, + { + "epoch": 0.21518556350017023, + "grad_norm": 1.7030102014541626, + "learning_rate": 1e-06, + "loss": 0.041, + "step": 1264 + }, + { + "epoch": 0.2153558052434457, + "grad_norm": 1.5380405187606812, + "learning_rate": 1e-06, + "loss": 0.0382, + "step": 1265 + }, + { + "epoch": 0.21552604698672115, + "grad_norm": 2.168999433517456, + "learning_rate": 1e-06, + "loss": 0.0585, + "step": 1266 + }, + { + "epoch": 0.2156962887299966, + "grad_norm": 2.1720099449157715, + "learning_rate": 1e-06, + "loss": 0.0363, + "step": 1267 + }, + { + "epoch": 0.21586653047327203, + "grad_norm": 2.3076796531677246, + "learning_rate": 1e-06, + "loss": 0.0679, + "step": 1268 + }, + { + "epoch": 0.2160367722165475, + "grad_norm": 1.696959376335144, + "learning_rate": 1e-06, + "loss": 0.0331, + "step": 1269 + }, + { + "epoch": 0.21620701395982295, + "grad_norm": 1.8012551069259644, + "learning_rate": 1e-06, + "loss": 0.0399, + "step": 1270 + }, + { + "epoch": 0.2163772557030984, + "grad_norm": 1.706807017326355, + "learning_rate": 1e-06, + "loss": 0.044, + "step": 1271 + }, + { + "epoch": 0.21654749744637386, + "grad_norm": 1.3523311614990234, + "learning_rate": 1e-06, + "loss": 0.04, + "step": 1272 + }, + { + "epoch": 0.2167177391896493, + "grad_norm": 1.5000497102737427, + "learning_rate": 1e-06, + "loss": 0.0383, + "step": 1273 + }, + { + "epoch": 0.21688798093292475, + "grad_norm": 1.4352927207946777, + "learning_rate": 1e-06, + "loss": 0.0327, + "step": 1274 + }, + { + "epoch": 0.2170582226762002, + "grad_norm": 1.4274100065231323, + "learning_rate": 1e-06, + "loss": 0.0302, + "step": 1275 + }, + { + "epoch": 0.21722846441947566, + "grad_norm": 1.4734169244766235, + "learning_rate": 1e-06, + "loss": 0.0291, + "step": 1276 + }, + { + "epoch": 0.21739870616275112, + "grad_norm": 1.532433271408081, + "learning_rate": 1e-06, + "loss": 0.0424, + "step": 1277 + }, + { + "epoch": 0.21756894790602654, + "grad_norm": 1.4756892919540405, + "learning_rate": 1e-06, + "loss": 0.0291, + "step": 1278 + }, + { + "epoch": 0.217739189649302, + "grad_norm": 1.4791312217712402, + "learning_rate": 1e-06, + "loss": 0.0428, + "step": 1279 + }, + { + "epoch": 0.21790943139257746, + "grad_norm": 1.201405644416809, + "learning_rate": 1e-06, + "loss": 0.0272, + "step": 1280 + }, + { + "epoch": 0.21807967313585291, + "grad_norm": 3.065424680709839, + "learning_rate": 1e-06, + "loss": 0.0406, + "step": 1281 + }, + { + "epoch": 0.21824991487912837, + "grad_norm": 1.7154473066329956, + "learning_rate": 1e-06, + "loss": 0.0307, + "step": 1282 + }, + { + "epoch": 0.21842015662240383, + "grad_norm": 1.5745826959609985, + "learning_rate": 1e-06, + "loss": 0.0295, + "step": 1283 + }, + { + "epoch": 0.21859039836567926, + "grad_norm": 1.9603149890899658, + "learning_rate": 1e-06, + "loss": 0.0469, + "step": 1284 + }, + { + "epoch": 0.2187606401089547, + "grad_norm": 1.6602104902267456, + "learning_rate": 1e-06, + "loss": 0.0433, + "step": 1285 + }, + { + "epoch": 0.21893088185223017, + "grad_norm": 1.7477741241455078, + "learning_rate": 1e-06, + "loss": 0.0291, + "step": 1286 + }, + { + "epoch": 0.21910112359550563, + "grad_norm": 1.8926868438720703, + "learning_rate": 1e-06, + "loss": 0.0392, + "step": 1287 + }, + { + "epoch": 0.21927136533878108, + "grad_norm": 1.508943796157837, + "learning_rate": 1e-06, + "loss": 0.0286, + "step": 1288 + }, + { + "epoch": 0.2194416070820565, + "grad_norm": 1.6563001871109009, + "learning_rate": 1e-06, + "loss": 0.0359, + "step": 1289 + }, + { + "epoch": 0.21961184882533197, + "grad_norm": 1.6094985008239746, + "learning_rate": 1e-06, + "loss": 0.0357, + "step": 1290 + }, + { + "epoch": 0.21978209056860742, + "grad_norm": 1.9460690021514893, + "learning_rate": 1e-06, + "loss": 0.0515, + "step": 1291 + }, + { + "epoch": 0.21995233231188288, + "grad_norm": 1.3542574644088745, + "learning_rate": 1e-06, + "loss": 0.0389, + "step": 1292 + }, + { + "epoch": 0.22012257405515834, + "grad_norm": 1.3899677991867065, + "learning_rate": 1e-06, + "loss": 0.0318, + "step": 1293 + }, + { + "epoch": 0.22029281579843377, + "grad_norm": 1.9105228185653687, + "learning_rate": 1e-06, + "loss": 0.049, + "step": 1294 + }, + { + "epoch": 0.22046305754170922, + "grad_norm": 1.5152599811553955, + "learning_rate": 1e-06, + "loss": 0.0416, + "step": 1295 + }, + { + "epoch": 0.22063329928498468, + "grad_norm": 1.2295457124710083, + "learning_rate": 1e-06, + "loss": 0.031, + "step": 1296 + }, + { + "epoch": 0.22080354102826014, + "grad_norm": 1.8313056230545044, + "learning_rate": 1e-06, + "loss": 0.0451, + "step": 1297 + }, + { + "epoch": 0.2209737827715356, + "grad_norm": 1.5762004852294922, + "learning_rate": 1e-06, + "loss": 0.0437, + "step": 1298 + }, + { + "epoch": 0.22114402451481102, + "grad_norm": 1.603684663772583, + "learning_rate": 1e-06, + "loss": 0.0372, + "step": 1299 + }, + { + "epoch": 0.22131426625808648, + "grad_norm": 1.809328317642212, + "learning_rate": 1e-06, + "loss": 0.0328, + "step": 1300 + }, + { + "epoch": 0.22148450800136193, + "grad_norm": 1.6629222631454468, + "learning_rate": 1e-06, + "loss": 0.0421, + "step": 1301 + }, + { + "epoch": 0.2216547497446374, + "grad_norm": 1.5856209993362427, + "learning_rate": 1e-06, + "loss": 0.0377, + "step": 1302 + }, + { + "epoch": 0.22182499148791285, + "grad_norm": 1.8281983137130737, + "learning_rate": 1e-06, + "loss": 0.0339, + "step": 1303 + }, + { + "epoch": 0.22199523323118828, + "grad_norm": 1.8064937591552734, + "learning_rate": 1e-06, + "loss": 0.0359, + "step": 1304 + }, + { + "epoch": 0.22216547497446373, + "grad_norm": 1.6382505893707275, + "learning_rate": 1e-06, + "loss": 0.0376, + "step": 1305 + }, + { + "epoch": 0.2223357167177392, + "grad_norm": 1.8759084939956665, + "learning_rate": 1e-06, + "loss": 0.0397, + "step": 1306 + }, + { + "epoch": 0.22250595846101465, + "grad_norm": 1.587537407875061, + "learning_rate": 1e-06, + "loss": 0.0361, + "step": 1307 + }, + { + "epoch": 0.2226762002042901, + "grad_norm": 1.6119834184646606, + "learning_rate": 1e-06, + "loss": 0.0368, + "step": 1308 + }, + { + "epoch": 0.22284644194756553, + "grad_norm": 1.4815311431884766, + "learning_rate": 1e-06, + "loss": 0.0318, + "step": 1309 + }, + { + "epoch": 0.223016683690841, + "grad_norm": 1.4200611114501953, + "learning_rate": 1e-06, + "loss": 0.0282, + "step": 1310 + }, + { + "epoch": 0.22318692543411645, + "grad_norm": 1.3545717000961304, + "learning_rate": 1e-06, + "loss": 0.0374, + "step": 1311 + }, + { + "epoch": 0.2233571671773919, + "grad_norm": 1.5961992740631104, + "learning_rate": 1e-06, + "loss": 0.0402, + "step": 1312 + }, + { + "epoch": 0.22352740892066736, + "grad_norm": 1.420921802520752, + "learning_rate": 1e-06, + "loss": 0.0443, + "step": 1313 + }, + { + "epoch": 0.2236976506639428, + "grad_norm": 1.7957236766815186, + "learning_rate": 1e-06, + "loss": 0.0354, + "step": 1314 + }, + { + "epoch": 0.22386789240721824, + "grad_norm": 1.4843519926071167, + "learning_rate": 1e-06, + "loss": 0.0337, + "step": 1315 + }, + { + "epoch": 0.2240381341504937, + "grad_norm": 1.5852254629135132, + "learning_rate": 1e-06, + "loss": 0.0366, + "step": 1316 + }, + { + "epoch": 0.22420837589376916, + "grad_norm": 1.654274344444275, + "learning_rate": 1e-06, + "loss": 0.0407, + "step": 1317 + }, + { + "epoch": 0.22437861763704461, + "grad_norm": 1.884421467781067, + "learning_rate": 1e-06, + "loss": 0.0379, + "step": 1318 + }, + { + "epoch": 0.22454885938032004, + "grad_norm": 1.8321963548660278, + "learning_rate": 1e-06, + "loss": 0.0404, + "step": 1319 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 1.735929012298584, + "learning_rate": 1e-06, + "loss": 0.0477, + "step": 1320 + }, + { + "epoch": 0.22488934286687096, + "grad_norm": 1.5793464183807373, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 1321 + }, + { + "epoch": 0.2250595846101464, + "grad_norm": 1.6962897777557373, + "learning_rate": 1e-06, + "loss": 0.0387, + "step": 1322 + }, + { + "epoch": 0.22522982635342187, + "grad_norm": 1.4622056484222412, + "learning_rate": 1e-06, + "loss": 0.0315, + "step": 1323 + }, + { + "epoch": 0.2254000680966973, + "grad_norm": 1.4477473497390747, + "learning_rate": 1e-06, + "loss": 0.0419, + "step": 1324 + }, + { + "epoch": 0.22557030983997275, + "grad_norm": 1.508447289466858, + "learning_rate": 1e-06, + "loss": 0.0317, + "step": 1325 + }, + { + "epoch": 0.2257405515832482, + "grad_norm": 1.3146283626556396, + "learning_rate": 1e-06, + "loss": 0.0271, + "step": 1326 + }, + { + "epoch": 0.22591079332652367, + "grad_norm": 1.3582558631896973, + "learning_rate": 1e-06, + "loss": 0.0304, + "step": 1327 + }, + { + "epoch": 0.22608103506979912, + "grad_norm": 2.6392621994018555, + "learning_rate": 1e-06, + "loss": 0.0521, + "step": 1328 + }, + { + "epoch": 0.22625127681307455, + "grad_norm": 1.992223858833313, + "learning_rate": 1e-06, + "loss": 0.0608, + "step": 1329 + }, + { + "epoch": 0.22642151855635, + "grad_norm": 1.6414374113082886, + "learning_rate": 1e-06, + "loss": 0.0394, + "step": 1330 + }, + { + "epoch": 0.22659176029962547, + "grad_norm": 1.3846744298934937, + "learning_rate": 1e-06, + "loss": 0.0291, + "step": 1331 + }, + { + "epoch": 0.22676200204290092, + "grad_norm": 1.5988634824752808, + "learning_rate": 1e-06, + "loss": 0.0307, + "step": 1332 + }, + { + "epoch": 0.22693224378617638, + "grad_norm": 1.4296313524246216, + "learning_rate": 1e-06, + "loss": 0.0322, + "step": 1333 + }, + { + "epoch": 0.2271024855294518, + "grad_norm": 1.8813896179199219, + "learning_rate": 1e-06, + "loss": 0.0487, + "step": 1334 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 1.5750256776809692, + "learning_rate": 1e-06, + "loss": 0.0335, + "step": 1335 + }, + { + "epoch": 0.22744296901600272, + "grad_norm": 1.7783371210098267, + "learning_rate": 1e-06, + "loss": 0.0428, + "step": 1336 + }, + { + "epoch": 0.22761321075927818, + "grad_norm": 1.2634700536727905, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 1337 + }, + { + "epoch": 0.22778345250255363, + "grad_norm": 1.2207298278808594, + "learning_rate": 1e-06, + "loss": 0.0283, + "step": 1338 + }, + { + "epoch": 0.22795369424582906, + "grad_norm": 1.94607412815094, + "learning_rate": 1e-06, + "loss": 0.0345, + "step": 1339 + }, + { + "epoch": 0.22812393598910452, + "grad_norm": 3.4443957805633545, + "learning_rate": 1e-06, + "loss": 0.0546, + "step": 1340 + }, + { + "epoch": 0.22829417773237998, + "grad_norm": 2.1095688343048096, + "learning_rate": 1e-06, + "loss": 0.0488, + "step": 1341 + }, + { + "epoch": 0.22846441947565543, + "grad_norm": 5.005306720733643, + "learning_rate": 1e-06, + "loss": 0.079, + "step": 1342 + }, + { + "epoch": 0.2286346612189309, + "grad_norm": 1.6204818487167358, + "learning_rate": 1e-06, + "loss": 0.034, + "step": 1343 + }, + { + "epoch": 0.22880490296220635, + "grad_norm": 1.8814785480499268, + "learning_rate": 1e-06, + "loss": 0.0365, + "step": 1344 + }, + { + "epoch": 0.22897514470548178, + "grad_norm": 1.5905134677886963, + "learning_rate": 1e-06, + "loss": 0.0328, + "step": 1345 + }, + { + "epoch": 0.22914538644875723, + "grad_norm": 1.4241366386413574, + "learning_rate": 1e-06, + "loss": 0.0307, + "step": 1346 + }, + { + "epoch": 0.2293156281920327, + "grad_norm": 1.5406595468521118, + "learning_rate": 1e-06, + "loss": 0.032, + "step": 1347 + }, + { + "epoch": 0.22948586993530815, + "grad_norm": 1.3714312314987183, + "learning_rate": 1e-06, + "loss": 0.0271, + "step": 1348 + }, + { + "epoch": 0.2296561116785836, + "grad_norm": 1.919778823852539, + "learning_rate": 1e-06, + "loss": 0.0372, + "step": 1349 + }, + { + "epoch": 0.22982635342185903, + "grad_norm": 1.4447976350784302, + "learning_rate": 1e-06, + "loss": 0.0289, + "step": 1350 + }, + { + "epoch": 0.2299965951651345, + "grad_norm": 2.128904104232788, + "learning_rate": 1e-06, + "loss": 0.0554, + "step": 1351 + }, + { + "epoch": 0.23016683690840994, + "grad_norm": 1.2440881729125977, + "learning_rate": 1e-06, + "loss": 0.0365, + "step": 1352 + }, + { + "epoch": 0.2303370786516854, + "grad_norm": 2.688145637512207, + "learning_rate": 1e-06, + "loss": 0.0558, + "step": 1353 + }, + { + "epoch": 0.23050732039496086, + "grad_norm": 1.7107890844345093, + "learning_rate": 1e-06, + "loss": 0.0364, + "step": 1354 + }, + { + "epoch": 0.2306775621382363, + "grad_norm": 1.3972139358520508, + "learning_rate": 1e-06, + "loss": 0.0306, + "step": 1355 + }, + { + "epoch": 0.23084780388151174, + "grad_norm": 1.6243966817855835, + "learning_rate": 1e-06, + "loss": 0.0385, + "step": 1356 + }, + { + "epoch": 0.2310180456247872, + "grad_norm": 1.5047791004180908, + "learning_rate": 1e-06, + "loss": 0.0344, + "step": 1357 + }, + { + "epoch": 0.23118828736806266, + "grad_norm": 1.5549938678741455, + "learning_rate": 1e-06, + "loss": 0.0321, + "step": 1358 + }, + { + "epoch": 0.2313585291113381, + "grad_norm": 1.5315052270889282, + "learning_rate": 1e-06, + "loss": 0.0272, + "step": 1359 + }, + { + "epoch": 0.23152877085461354, + "grad_norm": 1.6653809547424316, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 1360 + }, + { + "epoch": 0.231699012597889, + "grad_norm": 1.8215115070343018, + "learning_rate": 1e-06, + "loss": 0.034, + "step": 1361 + }, + { + "epoch": 0.23186925434116445, + "grad_norm": 1.3523900508880615, + "learning_rate": 1e-06, + "loss": 0.0282, + "step": 1362 + }, + { + "epoch": 0.2320394960844399, + "grad_norm": 1.6606805324554443, + "learning_rate": 1e-06, + "loss": 0.0462, + "step": 1363 + }, + { + "epoch": 0.23220973782771537, + "grad_norm": 1.6801470518112183, + "learning_rate": 1e-06, + "loss": 0.0428, + "step": 1364 + }, + { + "epoch": 0.2323799795709908, + "grad_norm": 1.6056513786315918, + "learning_rate": 1e-06, + "loss": 0.037, + "step": 1365 + }, + { + "epoch": 0.23255022131426625, + "grad_norm": 1.549023151397705, + "learning_rate": 1e-06, + "loss": 0.0362, + "step": 1366 + }, + { + "epoch": 0.2327204630575417, + "grad_norm": 1.8244285583496094, + "learning_rate": 1e-06, + "loss": 0.0469, + "step": 1367 + }, + { + "epoch": 0.23289070480081717, + "grad_norm": 1.7030590772628784, + "learning_rate": 1e-06, + "loss": 0.0396, + "step": 1368 + }, + { + "epoch": 0.23306094654409262, + "grad_norm": 1.9018099308013916, + "learning_rate": 1e-06, + "loss": 0.0462, + "step": 1369 + }, + { + "epoch": 0.23323118828736805, + "grad_norm": 1.8366174697875977, + "learning_rate": 1e-06, + "loss": 0.0431, + "step": 1370 + }, + { + "epoch": 0.2334014300306435, + "grad_norm": 1.7807953357696533, + "learning_rate": 1e-06, + "loss": 0.0397, + "step": 1371 + }, + { + "epoch": 0.23357167177391897, + "grad_norm": 3.430758476257324, + "learning_rate": 1e-06, + "loss": 0.0504, + "step": 1372 + }, + { + "epoch": 0.23374191351719442, + "grad_norm": 1.6039738655090332, + "learning_rate": 1e-06, + "loss": 0.0326, + "step": 1373 + }, + { + "epoch": 0.23391215526046988, + "grad_norm": 2.001668691635132, + "learning_rate": 1e-06, + "loss": 0.0322, + "step": 1374 + }, + { + "epoch": 0.2340823970037453, + "grad_norm": 1.9030237197875977, + "learning_rate": 1e-06, + "loss": 0.0322, + "step": 1375 + }, + { + "epoch": 0.23425263874702076, + "grad_norm": 1.4003009796142578, + "learning_rate": 1e-06, + "loss": 0.0317, + "step": 1376 + }, + { + "epoch": 0.23442288049029622, + "grad_norm": 1.6500396728515625, + "learning_rate": 1e-06, + "loss": 0.0332, + "step": 1377 + }, + { + "epoch": 0.23459312223357168, + "grad_norm": 4.6141438484191895, + "learning_rate": 1e-06, + "loss": 0.0547, + "step": 1378 + }, + { + "epoch": 0.23476336397684713, + "grad_norm": 1.4108147621154785, + "learning_rate": 1e-06, + "loss": 0.0233, + "step": 1379 + }, + { + "epoch": 0.23493360572012256, + "grad_norm": 1.68300461769104, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 1380 + }, + { + "epoch": 0.23510384746339802, + "grad_norm": 1.4702105522155762, + "learning_rate": 1e-06, + "loss": 0.0311, + "step": 1381 + }, + { + "epoch": 0.23527408920667348, + "grad_norm": 1.669421911239624, + "learning_rate": 1e-06, + "loss": 0.0459, + "step": 1382 + }, + { + "epoch": 0.23544433094994893, + "grad_norm": 1.987108588218689, + "learning_rate": 1e-06, + "loss": 0.0317, + "step": 1383 + }, + { + "epoch": 0.2356145726932244, + "grad_norm": 1.5817410945892334, + "learning_rate": 1e-06, + "loss": 0.0297, + "step": 1384 + }, + { + "epoch": 0.23578481443649982, + "grad_norm": 1.524600863456726, + "learning_rate": 1e-06, + "loss": 0.027, + "step": 1385 + }, + { + "epoch": 0.23595505617977527, + "grad_norm": 1.9512219429016113, + "learning_rate": 1e-06, + "loss": 0.0426, + "step": 1386 + }, + { + "epoch": 0.23612529792305073, + "grad_norm": 1.6807881593704224, + "learning_rate": 1e-06, + "loss": 0.0332, + "step": 1387 + }, + { + "epoch": 0.2362955396663262, + "grad_norm": 1.5827664136886597, + "learning_rate": 1e-06, + "loss": 0.0393, + "step": 1388 + }, + { + "epoch": 0.23646578140960164, + "grad_norm": 1.4349877834320068, + "learning_rate": 1e-06, + "loss": 0.0316, + "step": 1389 + }, + { + "epoch": 0.23663602315287707, + "grad_norm": 1.8205857276916504, + "learning_rate": 1e-06, + "loss": 0.0422, + "step": 1390 + }, + { + "epoch": 0.23680626489615253, + "grad_norm": 2.0506768226623535, + "learning_rate": 1e-06, + "loss": 0.0371, + "step": 1391 + }, + { + "epoch": 0.236976506639428, + "grad_norm": 3.028653860092163, + "learning_rate": 1e-06, + "loss": 0.043, + "step": 1392 + }, + { + "epoch": 0.23714674838270344, + "grad_norm": 1.7093989849090576, + "learning_rate": 1e-06, + "loss": 0.0338, + "step": 1393 + }, + { + "epoch": 0.2373169901259789, + "grad_norm": 1.727501630783081, + "learning_rate": 1e-06, + "loss": 0.0279, + "step": 1394 + }, + { + "epoch": 0.23748723186925433, + "grad_norm": 1.9987183809280396, + "learning_rate": 1e-06, + "loss": 0.0358, + "step": 1395 + }, + { + "epoch": 0.23765747361252978, + "grad_norm": 1.4499446153640747, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 1396 + }, + { + "epoch": 0.23782771535580524, + "grad_norm": 2.0619688034057617, + "learning_rate": 1e-06, + "loss": 0.0451, + "step": 1397 + }, + { + "epoch": 0.2379979570990807, + "grad_norm": 1.4072680473327637, + "learning_rate": 1e-06, + "loss": 0.0331, + "step": 1398 + }, + { + "epoch": 0.23816819884235615, + "grad_norm": 1.9472742080688477, + "learning_rate": 1e-06, + "loss": 0.0357, + "step": 1399 + }, + { + "epoch": 0.23833844058563158, + "grad_norm": 1.682454228401184, + "learning_rate": 1e-06, + "loss": 0.0349, + "step": 1400 + }, + { + "epoch": 0.23850868232890704, + "grad_norm": 1.4352442026138306, + "learning_rate": 1e-06, + "loss": 0.0222, + "step": 1401 + }, + { + "epoch": 0.2386789240721825, + "grad_norm": 1.4091311693191528, + "learning_rate": 1e-06, + "loss": 0.0328, + "step": 1402 + }, + { + "epoch": 0.23884916581545795, + "grad_norm": 1.3856940269470215, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 1403 + }, + { + "epoch": 0.2390194075587334, + "grad_norm": 3.6680867671966553, + "learning_rate": 1e-06, + "loss": 0.049, + "step": 1404 + }, + { + "epoch": 0.23918964930200884, + "grad_norm": 1.7768093347549438, + "learning_rate": 1e-06, + "loss": 0.0396, + "step": 1405 + }, + { + "epoch": 0.2393598910452843, + "grad_norm": 1.83253014087677, + "learning_rate": 1e-06, + "loss": 0.0303, + "step": 1406 + }, + { + "epoch": 0.23953013278855975, + "grad_norm": 3.3626549243927, + "learning_rate": 1e-06, + "loss": 0.0739, + "step": 1407 + }, + { + "epoch": 0.2397003745318352, + "grad_norm": 1.9892669916152954, + "learning_rate": 1e-06, + "loss": 0.0423, + "step": 1408 + }, + { + "epoch": 0.23987061627511067, + "grad_norm": 1.340533971786499, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 1409 + }, + { + "epoch": 0.24004085801838612, + "grad_norm": 1.6560157537460327, + "learning_rate": 1e-06, + "loss": 0.0401, + "step": 1410 + }, + { + "epoch": 0.24021109976166155, + "grad_norm": 1.6459312438964844, + "learning_rate": 1e-06, + "loss": 0.0447, + "step": 1411 + }, + { + "epoch": 0.240381341504937, + "grad_norm": 2.4004392623901367, + "learning_rate": 1e-06, + "loss": 0.0525, + "step": 1412 + }, + { + "epoch": 0.24055158324821246, + "grad_norm": 1.7633525133132935, + "learning_rate": 1e-06, + "loss": 0.0334, + "step": 1413 + }, + { + "epoch": 0.24072182499148792, + "grad_norm": 1.6249415874481201, + "learning_rate": 1e-06, + "loss": 0.039, + "step": 1414 + }, + { + "epoch": 0.24089206673476338, + "grad_norm": 1.817467212677002, + "learning_rate": 1e-06, + "loss": 0.0329, + "step": 1415 + }, + { + "epoch": 0.2410623084780388, + "grad_norm": 1.592246174812317, + "learning_rate": 1e-06, + "loss": 0.0493, + "step": 1416 + }, + { + "epoch": 0.24123255022131426, + "grad_norm": 2.0346999168395996, + "learning_rate": 1e-06, + "loss": 0.0353, + "step": 1417 + }, + { + "epoch": 0.24140279196458972, + "grad_norm": 1.3118454217910767, + "learning_rate": 1e-06, + "loss": 0.0321, + "step": 1418 + }, + { + "epoch": 0.24157303370786518, + "grad_norm": 1.5990475416183472, + "learning_rate": 1e-06, + "loss": 0.0401, + "step": 1419 + }, + { + "epoch": 0.24174327545114063, + "grad_norm": 1.3188730478286743, + "learning_rate": 1e-06, + "loss": 0.0188, + "step": 1420 + }, + { + "epoch": 0.24191351719441606, + "grad_norm": 1.5186814069747925, + "learning_rate": 1e-06, + "loss": 0.0286, + "step": 1421 + }, + { + "epoch": 0.24208375893769152, + "grad_norm": 3.969571352005005, + "learning_rate": 1e-06, + "loss": 0.0573, + "step": 1422 + }, + { + "epoch": 0.24225400068096697, + "grad_norm": 1.6175631284713745, + "learning_rate": 1e-06, + "loss": 0.0308, + "step": 1423 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 1.3874751329421997, + "learning_rate": 1e-06, + "loss": 0.0298, + "step": 1424 + }, + { + "epoch": 0.2425944841675179, + "grad_norm": 2.3302395343780518, + "learning_rate": 1e-06, + "loss": 0.0483, + "step": 1425 + }, + { + "epoch": 0.24276472591079332, + "grad_norm": 2.0167653560638428, + "learning_rate": 1e-06, + "loss": 0.0356, + "step": 1426 + }, + { + "epoch": 0.24293496765406877, + "grad_norm": 1.5584750175476074, + "learning_rate": 1e-06, + "loss": 0.0331, + "step": 1427 + }, + { + "epoch": 0.24310520939734423, + "grad_norm": 1.7811205387115479, + "learning_rate": 1e-06, + "loss": 0.0296, + "step": 1428 + }, + { + "epoch": 0.2432754511406197, + "grad_norm": 1.6993045806884766, + "learning_rate": 1e-06, + "loss": 0.0341, + "step": 1429 + }, + { + "epoch": 0.24344569288389514, + "grad_norm": 1.6028584241867065, + "learning_rate": 1e-06, + "loss": 0.03, + "step": 1430 + }, + { + "epoch": 0.24361593462717057, + "grad_norm": 2.2071762084960938, + "learning_rate": 1e-06, + "loss": 0.0418, + "step": 1431 + }, + { + "epoch": 0.24378617637044603, + "grad_norm": 1.5499199628829956, + "learning_rate": 1e-06, + "loss": 0.037, + "step": 1432 + }, + { + "epoch": 0.24395641811372148, + "grad_norm": 1.59584641456604, + "learning_rate": 1e-06, + "loss": 0.0312, + "step": 1433 + }, + { + "epoch": 0.24412665985699694, + "grad_norm": 1.5070613622665405, + "learning_rate": 1e-06, + "loss": 0.0274, + "step": 1434 + }, + { + "epoch": 0.2442969016002724, + "grad_norm": 1.6811244487762451, + "learning_rate": 1e-06, + "loss": 0.0277, + "step": 1435 + }, + { + "epoch": 0.24446714334354783, + "grad_norm": 1.631415843963623, + "learning_rate": 1e-06, + "loss": 0.0363, + "step": 1436 + }, + { + "epoch": 0.24463738508682328, + "grad_norm": 1.6105327606201172, + "learning_rate": 1e-06, + "loss": 0.0279, + "step": 1437 + }, + { + "epoch": 0.24480762683009874, + "grad_norm": 1.791298508644104, + "learning_rate": 1e-06, + "loss": 0.0494, + "step": 1438 + }, + { + "epoch": 0.2449778685733742, + "grad_norm": 1.1787437200546265, + "learning_rate": 1e-06, + "loss": 0.0278, + "step": 1439 + }, + { + "epoch": 0.24514811031664965, + "grad_norm": 1.3481965065002441, + "learning_rate": 1e-06, + "loss": 0.031, + "step": 1440 + }, + { + "epoch": 0.24531835205992508, + "grad_norm": 1.495886206626892, + "learning_rate": 1e-06, + "loss": 0.0419, + "step": 1441 + }, + { + "epoch": 0.24548859380320054, + "grad_norm": 1.6901342868804932, + "learning_rate": 1e-06, + "loss": 0.0329, + "step": 1442 + }, + { + "epoch": 0.245658835546476, + "grad_norm": 1.443678379058838, + "learning_rate": 1e-06, + "loss": 0.0412, + "step": 1443 + }, + { + "epoch": 0.24582907728975145, + "grad_norm": 1.8046194314956665, + "learning_rate": 1e-06, + "loss": 0.0384, + "step": 1444 + }, + { + "epoch": 0.2459993190330269, + "grad_norm": 1.5077475309371948, + "learning_rate": 1e-06, + "loss": 0.0321, + "step": 1445 + }, + { + "epoch": 0.24616956077630234, + "grad_norm": 1.7492755651474, + "learning_rate": 1e-06, + "loss": 0.0361, + "step": 1446 + }, + { + "epoch": 0.2463398025195778, + "grad_norm": 1.5502599477767944, + "learning_rate": 1e-06, + "loss": 0.0273, + "step": 1447 + }, + { + "epoch": 0.24651004426285325, + "grad_norm": 1.9191864728927612, + "learning_rate": 1e-06, + "loss": 0.0369, + "step": 1448 + }, + { + "epoch": 0.2466802860061287, + "grad_norm": 1.9377306699752808, + "learning_rate": 1e-06, + "loss": 0.0401, + "step": 1449 + }, + { + "epoch": 0.24685052774940416, + "grad_norm": 2.082916259765625, + "learning_rate": 1e-06, + "loss": 0.0402, + "step": 1450 + }, + { + "epoch": 0.2470207694926796, + "grad_norm": 1.3470356464385986, + "learning_rate": 1e-06, + "loss": 0.0214, + "step": 1451 + }, + { + "epoch": 0.24719101123595505, + "grad_norm": 1.5550161600112915, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 1452 + }, + { + "epoch": 0.2473612529792305, + "grad_norm": 1.9106121063232422, + "learning_rate": 1e-06, + "loss": 0.0399, + "step": 1453 + }, + { + "epoch": 0.24753149472250596, + "grad_norm": 1.2977889776229858, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 1454 + }, + { + "epoch": 0.24770173646578142, + "grad_norm": 1.8015820980072021, + "learning_rate": 1e-06, + "loss": 0.0337, + "step": 1455 + }, + { + "epoch": 0.24787197820905685, + "grad_norm": 1.515289545059204, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 1456 + }, + { + "epoch": 0.2480422199523323, + "grad_norm": 2.0699691772460938, + "learning_rate": 1e-06, + "loss": 0.0408, + "step": 1457 + }, + { + "epoch": 0.24821246169560776, + "grad_norm": 1.9157500267028809, + "learning_rate": 1e-06, + "loss": 0.0431, + "step": 1458 + }, + { + "epoch": 0.24838270343888322, + "grad_norm": 1.7912602424621582, + "learning_rate": 1e-06, + "loss": 0.0362, + "step": 1459 + }, + { + "epoch": 0.24855294518215867, + "grad_norm": 1.5445823669433594, + "learning_rate": 1e-06, + "loss": 0.0337, + "step": 1460 + }, + { + "epoch": 0.2487231869254341, + "grad_norm": 1.5842821598052979, + "learning_rate": 1e-06, + "loss": 0.0283, + "step": 1461 + }, + { + "epoch": 0.24889342866870956, + "grad_norm": 2.117060899734497, + "learning_rate": 1e-06, + "loss": 0.033, + "step": 1462 + }, + { + "epoch": 0.24906367041198502, + "grad_norm": 1.1993916034698486, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 1463 + }, + { + "epoch": 0.24923391215526047, + "grad_norm": 1.9758274555206299, + "learning_rate": 1e-06, + "loss": 0.0448, + "step": 1464 + }, + { + "epoch": 0.24940415389853593, + "grad_norm": 1.6750165224075317, + "learning_rate": 1e-06, + "loss": 0.0371, + "step": 1465 + }, + { + "epoch": 0.24957439564181136, + "grad_norm": 1.9693377017974854, + "learning_rate": 1e-06, + "loss": 0.0462, + "step": 1466 + }, + { + "epoch": 0.24974463738508682, + "grad_norm": 1.2334027290344238, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 1467 + }, + { + "epoch": 0.24991487912836227, + "grad_norm": 1.2368199825286865, + "learning_rate": 1e-06, + "loss": 0.0276, + "step": 1468 + }, + { + "epoch": 0.2500851208716377, + "grad_norm": 1.7857489585876465, + "learning_rate": 1e-06, + "loss": 0.0408, + "step": 1469 + }, + { + "epoch": 0.25025536261491316, + "grad_norm": 1.4412367343902588, + "learning_rate": 1e-06, + "loss": 0.0289, + "step": 1470 + }, + { + "epoch": 0.2504256043581886, + "grad_norm": 1.598808765411377, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 1471 + }, + { + "epoch": 0.25059584610146407, + "grad_norm": 1.440071940422058, + "learning_rate": 1e-06, + "loss": 0.0343, + "step": 1472 + }, + { + "epoch": 0.2507660878447395, + "grad_norm": 1.5133793354034424, + "learning_rate": 1e-06, + "loss": 0.0295, + "step": 1473 + }, + { + "epoch": 0.250936329588015, + "grad_norm": 1.5876449346542358, + "learning_rate": 1e-06, + "loss": 0.0282, + "step": 1474 + }, + { + "epoch": 0.25110657133129044, + "grad_norm": 1.636086106300354, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 1475 + }, + { + "epoch": 0.2512768130745659, + "grad_norm": 1.7871562242507935, + "learning_rate": 1e-06, + "loss": 0.0362, + "step": 1476 + }, + { + "epoch": 0.25144705481784135, + "grad_norm": 2.1147830486297607, + "learning_rate": 1e-06, + "loss": 0.043, + "step": 1477 + }, + { + "epoch": 0.2516172965611168, + "grad_norm": 2.0674445629119873, + "learning_rate": 1e-06, + "loss": 0.0411, + "step": 1478 + }, + { + "epoch": 0.2517875383043922, + "grad_norm": 1.684304118156433, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1479 + }, + { + "epoch": 0.25195778004766767, + "grad_norm": 1.8230916261672974, + "learning_rate": 1e-06, + "loss": 0.0328, + "step": 1480 + }, + { + "epoch": 0.2521280217909431, + "grad_norm": 1.3470852375030518, + "learning_rate": 1e-06, + "loss": 0.0276, + "step": 1481 + }, + { + "epoch": 0.2522982635342186, + "grad_norm": 1.640721321105957, + "learning_rate": 1e-06, + "loss": 0.037, + "step": 1482 + }, + { + "epoch": 0.25246850527749404, + "grad_norm": 1.4035189151763916, + "learning_rate": 1e-06, + "loss": 0.0286, + "step": 1483 + }, + { + "epoch": 0.2526387470207695, + "grad_norm": 1.6055939197540283, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 1484 + }, + { + "epoch": 0.25280898876404495, + "grad_norm": 1.5646830797195435, + "learning_rate": 1e-06, + "loss": 0.0357, + "step": 1485 + }, + { + "epoch": 0.2529792305073204, + "grad_norm": 1.7002915143966675, + "learning_rate": 1e-06, + "loss": 0.0351, + "step": 1486 + }, + { + "epoch": 0.25314947225059586, + "grad_norm": 1.1237139701843262, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 1487 + }, + { + "epoch": 0.2533197139938713, + "grad_norm": 1.8360260725021362, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 1488 + }, + { + "epoch": 0.2534899557371467, + "grad_norm": 1.483035683631897, + "learning_rate": 1e-06, + "loss": 0.0387, + "step": 1489 + }, + { + "epoch": 0.2536601974804222, + "grad_norm": 1.544597864151001, + "learning_rate": 1e-06, + "loss": 0.0308, + "step": 1490 + }, + { + "epoch": 0.25383043922369763, + "grad_norm": 1.5810545682907104, + "learning_rate": 1e-06, + "loss": 0.0358, + "step": 1491 + }, + { + "epoch": 0.2540006809669731, + "grad_norm": 1.6289693117141724, + "learning_rate": 1e-06, + "loss": 0.032, + "step": 1492 + }, + { + "epoch": 0.25417092271024855, + "grad_norm": 1.4735667705535889, + "learning_rate": 1e-06, + "loss": 0.035, + "step": 1493 + }, + { + "epoch": 0.254341164453524, + "grad_norm": 1.8274134397506714, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 1494 + }, + { + "epoch": 0.25451140619679946, + "grad_norm": 1.5539690256118774, + "learning_rate": 1e-06, + "loss": 0.0302, + "step": 1495 + }, + { + "epoch": 0.2546816479400749, + "grad_norm": 1.251301884651184, + "learning_rate": 1e-06, + "loss": 0.0325, + "step": 1496 + }, + { + "epoch": 0.2548518896833504, + "grad_norm": 2.4516842365264893, + "learning_rate": 1e-06, + "loss": 0.033, + "step": 1497 + }, + { + "epoch": 0.25502213142662583, + "grad_norm": 1.677709698677063, + "learning_rate": 1e-06, + "loss": 0.0331, + "step": 1498 + }, + { + "epoch": 0.25519237316990123, + "grad_norm": 1.771200180053711, + "learning_rate": 1e-06, + "loss": 0.0336, + "step": 1499 + }, + { + "epoch": 0.2553626149131767, + "grad_norm": 1.574262022972107, + "learning_rate": 1e-06, + "loss": 0.0306, + "step": 1500 + }, + { + "epoch": 0.25553285665645215, + "grad_norm": 1.3920859098434448, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1501 + }, + { + "epoch": 0.2557030983997276, + "grad_norm": 2.0418004989624023, + "learning_rate": 1e-06, + "loss": 0.036, + "step": 1502 + }, + { + "epoch": 0.25587334014300306, + "grad_norm": 1.639630913734436, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 1503 + }, + { + "epoch": 0.2560435818862785, + "grad_norm": 3.079418659210205, + "learning_rate": 1e-06, + "loss": 0.0548, + "step": 1504 + }, + { + "epoch": 0.25621382362955397, + "grad_norm": 1.5250223875045776, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 1505 + }, + { + "epoch": 0.25638406537282943, + "grad_norm": 1.6824452877044678, + "learning_rate": 1e-06, + "loss": 0.034, + "step": 1506 + }, + { + "epoch": 0.2565543071161049, + "grad_norm": 1.9331268072128296, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1507 + }, + { + "epoch": 0.25672454885938034, + "grad_norm": 1.964712142944336, + "learning_rate": 1e-06, + "loss": 0.0385, + "step": 1508 + }, + { + "epoch": 0.2568947906026558, + "grad_norm": 1.5585107803344727, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 1509 + }, + { + "epoch": 0.2570650323459312, + "grad_norm": 1.9522138833999634, + "learning_rate": 1e-06, + "loss": 0.0481, + "step": 1510 + }, + { + "epoch": 0.25723527408920666, + "grad_norm": 2.522395372390747, + "learning_rate": 1e-06, + "loss": 0.0503, + "step": 1511 + }, + { + "epoch": 0.2574055158324821, + "grad_norm": 1.4549627304077148, + "learning_rate": 1e-06, + "loss": 0.0349, + "step": 1512 + }, + { + "epoch": 0.25757575757575757, + "grad_norm": 1.0665477514266968, + "learning_rate": 1e-06, + "loss": 0.0216, + "step": 1513 + }, + { + "epoch": 0.257745999319033, + "grad_norm": 1.1988400220870972, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1514 + }, + { + "epoch": 0.2579162410623085, + "grad_norm": 1.5212563276290894, + "learning_rate": 1e-06, + "loss": 0.0319, + "step": 1515 + }, + { + "epoch": 0.25808648280558394, + "grad_norm": 1.6229534149169922, + "learning_rate": 1e-06, + "loss": 0.0387, + "step": 1516 + }, + { + "epoch": 0.2582567245488594, + "grad_norm": 1.9084266424179077, + "learning_rate": 1e-06, + "loss": 0.0354, + "step": 1517 + }, + { + "epoch": 0.25842696629213485, + "grad_norm": 1.2355334758758545, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 1518 + }, + { + "epoch": 0.2585972080354103, + "grad_norm": 1.6511914730072021, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 1519 + }, + { + "epoch": 0.2587674497786857, + "grad_norm": 1.7273814678192139, + "learning_rate": 1e-06, + "loss": 0.0369, + "step": 1520 + }, + { + "epoch": 0.25893769152196117, + "grad_norm": 1.715128779411316, + "learning_rate": 1e-06, + "loss": 0.0337, + "step": 1521 + }, + { + "epoch": 0.2591079332652366, + "grad_norm": 1.704223871231079, + "learning_rate": 1e-06, + "loss": 0.029, + "step": 1522 + }, + { + "epoch": 0.2592781750085121, + "grad_norm": 1.452068567276001, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 1523 + }, + { + "epoch": 0.25944841675178754, + "grad_norm": 2.0453882217407227, + "learning_rate": 1e-06, + "loss": 0.0342, + "step": 1524 + }, + { + "epoch": 0.259618658495063, + "grad_norm": 2.1579129695892334, + "learning_rate": 1e-06, + "loss": 0.0409, + "step": 1525 + }, + { + "epoch": 0.25978890023833845, + "grad_norm": 1.6939774751663208, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 1526 + }, + { + "epoch": 0.2599591419816139, + "grad_norm": 1.9167999029159546, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 1527 + }, + { + "epoch": 0.26012938372488936, + "grad_norm": 1.4578943252563477, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 1528 + }, + { + "epoch": 0.2602996254681648, + "grad_norm": 1.426521897315979, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 1529 + }, + { + "epoch": 0.2604698672114402, + "grad_norm": 1.7971490621566772, + "learning_rate": 1e-06, + "loss": 0.0337, + "step": 1530 + }, + { + "epoch": 0.2606401089547157, + "grad_norm": 1.6107807159423828, + "learning_rate": 1e-06, + "loss": 0.0434, + "step": 1531 + }, + { + "epoch": 0.26081035069799113, + "grad_norm": 1.8623952865600586, + "learning_rate": 1e-06, + "loss": 0.0319, + "step": 1532 + }, + { + "epoch": 0.2609805924412666, + "grad_norm": 1.3411558866500854, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 1533 + }, + { + "epoch": 0.26115083418454205, + "grad_norm": 1.5986648797988892, + "learning_rate": 1e-06, + "loss": 0.023, + "step": 1534 + }, + { + "epoch": 0.2613210759278175, + "grad_norm": 1.803185224533081, + "learning_rate": 1e-06, + "loss": 0.034, + "step": 1535 + }, + { + "epoch": 0.26149131767109296, + "grad_norm": 1.4397742748260498, + "learning_rate": 1e-06, + "loss": 0.0408, + "step": 1536 + }, + { + "epoch": 0.2616615594143684, + "grad_norm": 1.5375397205352783, + "learning_rate": 1e-06, + "loss": 0.0277, + "step": 1537 + }, + { + "epoch": 0.2618318011576439, + "grad_norm": 1.6096251010894775, + "learning_rate": 1e-06, + "loss": 0.0284, + "step": 1538 + }, + { + "epoch": 0.26200204290091933, + "grad_norm": 1.9255061149597168, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 1539 + }, + { + "epoch": 0.26217228464419473, + "grad_norm": 1.6099532842636108, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 1540 + }, + { + "epoch": 0.2623425263874702, + "grad_norm": 2.104907989501953, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 1541 + }, + { + "epoch": 0.26251276813074564, + "grad_norm": 1.6234943866729736, + "learning_rate": 1e-06, + "loss": 0.0365, + "step": 1542 + }, + { + "epoch": 0.2626830098740211, + "grad_norm": 1.7157036066055298, + "learning_rate": 1e-06, + "loss": 0.0321, + "step": 1543 + }, + { + "epoch": 0.26285325161729656, + "grad_norm": 1.219995141029358, + "learning_rate": 1e-06, + "loss": 0.0201, + "step": 1544 + }, + { + "epoch": 0.263023493360572, + "grad_norm": 1.5472625494003296, + "learning_rate": 1e-06, + "loss": 0.0327, + "step": 1545 + }, + { + "epoch": 0.26319373510384747, + "grad_norm": 1.6542850732803345, + "learning_rate": 1e-06, + "loss": 0.0285, + "step": 1546 + }, + { + "epoch": 0.2633639768471229, + "grad_norm": 1.4218086004257202, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 1547 + }, + { + "epoch": 0.2635342185903984, + "grad_norm": 1.3034826517105103, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 1548 + }, + { + "epoch": 0.26370446033367384, + "grad_norm": 2.216113805770874, + "learning_rate": 1e-06, + "loss": 0.0568, + "step": 1549 + }, + { + "epoch": 0.26387470207694924, + "grad_norm": 1.4958362579345703, + "learning_rate": 1e-06, + "loss": 0.0278, + "step": 1550 + }, + { + "epoch": 0.2640449438202247, + "grad_norm": 1.4572584629058838, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 1551 + }, + { + "epoch": 0.26421518556350015, + "grad_norm": 1.7182279825210571, + "learning_rate": 1e-06, + "loss": 0.0406, + "step": 1552 + }, + { + "epoch": 0.2643854273067756, + "grad_norm": 2.1565287113189697, + "learning_rate": 1e-06, + "loss": 0.0321, + "step": 1553 + }, + { + "epoch": 0.26455566905005107, + "grad_norm": 1.421190619468689, + "learning_rate": 1e-06, + "loss": 0.031, + "step": 1554 + }, + { + "epoch": 0.2647259107933265, + "grad_norm": 1.4493217468261719, + "learning_rate": 1e-06, + "loss": 0.0334, + "step": 1555 + }, + { + "epoch": 0.264896152536602, + "grad_norm": 1.5228818655014038, + "learning_rate": 1e-06, + "loss": 0.0305, + "step": 1556 + }, + { + "epoch": 0.26506639427987744, + "grad_norm": 1.5019917488098145, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 1557 + }, + { + "epoch": 0.2652366360231529, + "grad_norm": 1.8647435903549194, + "learning_rate": 1e-06, + "loss": 0.0295, + "step": 1558 + }, + { + "epoch": 0.26540687776642835, + "grad_norm": 1.5852400064468384, + "learning_rate": 1e-06, + "loss": 0.0235, + "step": 1559 + }, + { + "epoch": 0.26557711950970375, + "grad_norm": 1.5699732303619385, + "learning_rate": 1e-06, + "loss": 0.0369, + "step": 1560 + }, + { + "epoch": 0.2657473612529792, + "grad_norm": 1.5314867496490479, + "learning_rate": 1e-06, + "loss": 0.0231, + "step": 1561 + }, + { + "epoch": 0.26591760299625467, + "grad_norm": 1.3057242631912231, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 1562 + }, + { + "epoch": 0.2660878447395301, + "grad_norm": 1.5217634439468384, + "learning_rate": 1e-06, + "loss": 0.0383, + "step": 1563 + }, + { + "epoch": 0.2662580864828056, + "grad_norm": 1.9693313837051392, + "learning_rate": 1e-06, + "loss": 0.0274, + "step": 1564 + }, + { + "epoch": 0.26642832822608103, + "grad_norm": 2.246293544769287, + "learning_rate": 1e-06, + "loss": 0.0421, + "step": 1565 + }, + { + "epoch": 0.2665985699693565, + "grad_norm": 1.439656376838684, + "learning_rate": 1e-06, + "loss": 0.0319, + "step": 1566 + }, + { + "epoch": 0.26676881171263195, + "grad_norm": 1.522289752960205, + "learning_rate": 1e-06, + "loss": 0.0309, + "step": 1567 + }, + { + "epoch": 0.2669390534559074, + "grad_norm": 1.6061466932296753, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 1568 + }, + { + "epoch": 0.26710929519918286, + "grad_norm": 1.4138331413269043, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 1569 + }, + { + "epoch": 0.2672795369424583, + "grad_norm": 1.924535870552063, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 1570 + }, + { + "epoch": 0.2674497786857337, + "grad_norm": 1.5255926847457886, + "learning_rate": 1e-06, + "loss": 0.0285, + "step": 1571 + }, + { + "epoch": 0.2676200204290092, + "grad_norm": 1.752744197845459, + "learning_rate": 1e-06, + "loss": 0.0223, + "step": 1572 + }, + { + "epoch": 0.26779026217228463, + "grad_norm": 1.2975589036941528, + "learning_rate": 1e-06, + "loss": 0.029, + "step": 1573 + }, + { + "epoch": 0.2679605039155601, + "grad_norm": 1.4539504051208496, + "learning_rate": 1e-06, + "loss": 0.0303, + "step": 1574 + }, + { + "epoch": 0.26813074565883555, + "grad_norm": 1.4913675785064697, + "learning_rate": 1e-06, + "loss": 0.0337, + "step": 1575 + }, + { + "epoch": 0.268300987402111, + "grad_norm": 1.5786329507827759, + "learning_rate": 1e-06, + "loss": 0.0317, + "step": 1576 + }, + { + "epoch": 0.26847122914538646, + "grad_norm": 1.719687581062317, + "learning_rate": 1e-06, + "loss": 0.0419, + "step": 1577 + }, + { + "epoch": 0.2686414708886619, + "grad_norm": 1.5540446043014526, + "learning_rate": 1e-06, + "loss": 0.0295, + "step": 1578 + }, + { + "epoch": 0.26881171263193737, + "grad_norm": 1.3455661535263062, + "learning_rate": 1e-06, + "loss": 0.0216, + "step": 1579 + }, + { + "epoch": 0.26898195437521283, + "grad_norm": 1.514863133430481, + "learning_rate": 1e-06, + "loss": 0.0309, + "step": 1580 + }, + { + "epoch": 0.26915219611848823, + "grad_norm": 1.63846755027771, + "learning_rate": 1e-06, + "loss": 0.0317, + "step": 1581 + }, + { + "epoch": 0.2693224378617637, + "grad_norm": 1.4479107856750488, + "learning_rate": 1e-06, + "loss": 0.03, + "step": 1582 + }, + { + "epoch": 0.26949267960503914, + "grad_norm": 1.504742980003357, + "learning_rate": 1e-06, + "loss": 0.0204, + "step": 1583 + }, + { + "epoch": 0.2696629213483146, + "grad_norm": 1.895027995109558, + "learning_rate": 1e-06, + "loss": 0.0383, + "step": 1584 + }, + { + "epoch": 0.26983316309159006, + "grad_norm": 1.5659292936325073, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 1585 + }, + { + "epoch": 0.2700034048348655, + "grad_norm": 1.855080246925354, + "learning_rate": 1e-06, + "loss": 0.0341, + "step": 1586 + }, + { + "epoch": 0.27017364657814097, + "grad_norm": 2.0592076778411865, + "learning_rate": 1e-06, + "loss": 0.0367, + "step": 1587 + }, + { + "epoch": 0.2703438883214164, + "grad_norm": 1.626511812210083, + "learning_rate": 1e-06, + "loss": 0.0286, + "step": 1588 + }, + { + "epoch": 0.2705141300646919, + "grad_norm": 1.715602159500122, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 1589 + }, + { + "epoch": 0.27068437180796734, + "grad_norm": 1.7395516633987427, + "learning_rate": 1e-06, + "loss": 0.0306, + "step": 1590 + }, + { + "epoch": 0.27085461355124274, + "grad_norm": 2.016375780105591, + "learning_rate": 1e-06, + "loss": 0.0316, + "step": 1591 + }, + { + "epoch": 0.2710248552945182, + "grad_norm": 1.7519330978393555, + "learning_rate": 1e-06, + "loss": 0.0323, + "step": 1592 + }, + { + "epoch": 0.27119509703779365, + "grad_norm": 1.3459405899047852, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 1593 + }, + { + "epoch": 0.2713653387810691, + "grad_norm": 1.7252540588378906, + "learning_rate": 1e-06, + "loss": 0.0313, + "step": 1594 + }, + { + "epoch": 0.27153558052434457, + "grad_norm": 1.3951072692871094, + "learning_rate": 1e-06, + "loss": 0.0341, + "step": 1595 + }, + { + "epoch": 0.27170582226762, + "grad_norm": 1.615648627281189, + "learning_rate": 1e-06, + "loss": 0.0302, + "step": 1596 + }, + { + "epoch": 0.2718760640108955, + "grad_norm": 1.938193678855896, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 1597 + }, + { + "epoch": 0.27204630575417094, + "grad_norm": 1.4162940979003906, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 1598 + }, + { + "epoch": 0.2722165474974464, + "grad_norm": 1.7857416868209839, + "learning_rate": 1e-06, + "loss": 0.0287, + "step": 1599 + }, + { + "epoch": 0.27238678924072185, + "grad_norm": 2.2122995853424072, + "learning_rate": 1e-06, + "loss": 0.0339, + "step": 1600 + }, + { + "epoch": 0.27255703098399725, + "grad_norm": 1.2226827144622803, + "learning_rate": 1e-06, + "loss": 0.0224, + "step": 1601 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 1.366214394569397, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 1602 + }, + { + "epoch": 0.27289751447054816, + "grad_norm": 1.472756266593933, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 1603 + }, + { + "epoch": 0.2730677562138236, + "grad_norm": 1.4528722763061523, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 1604 + }, + { + "epoch": 0.2732379979570991, + "grad_norm": 1.2808316946029663, + "learning_rate": 1e-06, + "loss": 0.0231, + "step": 1605 + }, + { + "epoch": 0.27340823970037453, + "grad_norm": 1.5016905069351196, + "learning_rate": 1e-06, + "loss": 0.0338, + "step": 1606 + }, + { + "epoch": 0.27357848144365, + "grad_norm": 1.743894100189209, + "learning_rate": 1e-06, + "loss": 0.029, + "step": 1607 + }, + { + "epoch": 0.27374872318692545, + "grad_norm": 1.6557116508483887, + "learning_rate": 1e-06, + "loss": 0.0288, + "step": 1608 + }, + { + "epoch": 0.2739189649302009, + "grad_norm": 1.2677494287490845, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 1609 + }, + { + "epoch": 0.27408920667347636, + "grad_norm": 1.6573220491409302, + "learning_rate": 1e-06, + "loss": 0.0307, + "step": 1610 + }, + { + "epoch": 0.27425944841675176, + "grad_norm": 1.8028841018676758, + "learning_rate": 1e-06, + "loss": 0.0289, + "step": 1611 + }, + { + "epoch": 0.2744296901600272, + "grad_norm": 1.5891400575637817, + "learning_rate": 1e-06, + "loss": 0.0344, + "step": 1612 + }, + { + "epoch": 0.2745999319033027, + "grad_norm": 1.492483377456665, + "learning_rate": 1e-06, + "loss": 0.0214, + "step": 1613 + }, + { + "epoch": 0.27477017364657813, + "grad_norm": 1.5204499959945679, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 1614 + }, + { + "epoch": 0.2749404153898536, + "grad_norm": 2.0463969707489014, + "learning_rate": 1e-06, + "loss": 0.0287, + "step": 1615 + }, + { + "epoch": 0.27511065713312904, + "grad_norm": 1.5479378700256348, + "learning_rate": 1e-06, + "loss": 0.0291, + "step": 1616 + }, + { + "epoch": 0.2752808988764045, + "grad_norm": 2.649272918701172, + "learning_rate": 1e-06, + "loss": 0.0455, + "step": 1617 + }, + { + "epoch": 0.27545114061967996, + "grad_norm": 1.8164010047912598, + "learning_rate": 1e-06, + "loss": 0.0291, + "step": 1618 + }, + { + "epoch": 0.2756213823629554, + "grad_norm": 1.2867664098739624, + "learning_rate": 1e-06, + "loss": 0.0197, + "step": 1619 + }, + { + "epoch": 0.27579162410623087, + "grad_norm": 1.3096405267715454, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 1620 + }, + { + "epoch": 0.27596186584950627, + "grad_norm": 1.3673827648162842, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 1621 + }, + { + "epoch": 0.27613210759278173, + "grad_norm": 1.6427737474441528, + "learning_rate": 1e-06, + "loss": 0.0302, + "step": 1622 + }, + { + "epoch": 0.2763023493360572, + "grad_norm": 1.4813086986541748, + "learning_rate": 1e-06, + "loss": 0.0307, + "step": 1623 + }, + { + "epoch": 0.27647259107933264, + "grad_norm": 1.318463921546936, + "learning_rate": 1e-06, + "loss": 0.0221, + "step": 1624 + }, + { + "epoch": 0.2766428328226081, + "grad_norm": 1.6629798412322998, + "learning_rate": 1e-06, + "loss": 0.0289, + "step": 1625 + }, + { + "epoch": 0.27681307456588355, + "grad_norm": 1.4706048965454102, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 1626 + }, + { + "epoch": 0.276983316309159, + "grad_norm": 1.6562780141830444, + "learning_rate": 1e-06, + "loss": 0.027, + "step": 1627 + }, + { + "epoch": 0.27715355805243447, + "grad_norm": 1.7971525192260742, + "learning_rate": 1e-06, + "loss": 0.0322, + "step": 1628 + }, + { + "epoch": 0.2773237997957099, + "grad_norm": 1.9127774238586426, + "learning_rate": 1e-06, + "loss": 0.0382, + "step": 1629 + }, + { + "epoch": 0.2774940415389854, + "grad_norm": 1.661810278892517, + "learning_rate": 1e-06, + "loss": 0.0334, + "step": 1630 + }, + { + "epoch": 0.27766428328226084, + "grad_norm": 2.6759703159332275, + "learning_rate": 1e-06, + "loss": 0.0297, + "step": 1631 + }, + { + "epoch": 0.27783452502553624, + "grad_norm": 1.6530686616897583, + "learning_rate": 1e-06, + "loss": 0.0353, + "step": 1632 + }, + { + "epoch": 0.2780047667688117, + "grad_norm": 1.8694590330123901, + "learning_rate": 1e-06, + "loss": 0.0461, + "step": 1633 + }, + { + "epoch": 0.27817500851208715, + "grad_norm": 1.3104816675186157, + "learning_rate": 1e-06, + "loss": 0.0233, + "step": 1634 + }, + { + "epoch": 0.2783452502553626, + "grad_norm": 1.3213906288146973, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 1635 + }, + { + "epoch": 0.27851549199863807, + "grad_norm": 1.9300750494003296, + "learning_rate": 1e-06, + "loss": 0.0359, + "step": 1636 + }, + { + "epoch": 0.2786857337419135, + "grad_norm": 1.7006559371948242, + "learning_rate": 1e-06, + "loss": 0.0276, + "step": 1637 + }, + { + "epoch": 0.278855975485189, + "grad_norm": 1.3280925750732422, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 1638 + }, + { + "epoch": 0.27902621722846443, + "grad_norm": 1.5057625770568848, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 1639 + }, + { + "epoch": 0.2791964589717399, + "grad_norm": 1.3928744792938232, + "learning_rate": 1e-06, + "loss": 0.0194, + "step": 1640 + }, + { + "epoch": 0.27936670071501535, + "grad_norm": 1.3840206861495972, + "learning_rate": 1e-06, + "loss": 0.0209, + "step": 1641 + }, + { + "epoch": 0.27953694245829075, + "grad_norm": 1.7165454626083374, + "learning_rate": 1e-06, + "loss": 0.0301, + "step": 1642 + }, + { + "epoch": 0.2797071842015662, + "grad_norm": 2.1626944541931152, + "learning_rate": 1e-06, + "loss": 0.0354, + "step": 1643 + }, + { + "epoch": 0.27987742594484166, + "grad_norm": 1.480241060256958, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 1644 + }, + { + "epoch": 0.2800476676881171, + "grad_norm": 1.88970947265625, + "learning_rate": 1e-06, + "loss": 0.0319, + "step": 1645 + }, + { + "epoch": 0.2802179094313926, + "grad_norm": 1.82145357131958, + "learning_rate": 1e-06, + "loss": 0.033, + "step": 1646 + }, + { + "epoch": 0.28038815117466803, + "grad_norm": 1.4362519979476929, + "learning_rate": 1e-06, + "loss": 0.0279, + "step": 1647 + }, + { + "epoch": 0.2805583929179435, + "grad_norm": 1.5925521850585938, + "learning_rate": 1e-06, + "loss": 0.0274, + "step": 1648 + }, + { + "epoch": 0.28072863466121895, + "grad_norm": 1.46236252784729, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 1649 + }, + { + "epoch": 0.2808988764044944, + "grad_norm": 2.9916434288024902, + "learning_rate": 1e-06, + "loss": 0.0428, + "step": 1650 + }, + { + "epoch": 0.28106911814776986, + "grad_norm": 1.4459913969039917, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 1651 + }, + { + "epoch": 0.28123935989104526, + "grad_norm": 1.4686808586120605, + "learning_rate": 1e-06, + "loss": 0.0319, + "step": 1652 + }, + { + "epoch": 0.2814096016343207, + "grad_norm": 1.5071327686309814, + "learning_rate": 1e-06, + "loss": 0.0234, + "step": 1653 + }, + { + "epoch": 0.2815798433775962, + "grad_norm": 1.4609589576721191, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 1654 + }, + { + "epoch": 0.28175008512087163, + "grad_norm": 1.7668788433074951, + "learning_rate": 1e-06, + "loss": 0.0235, + "step": 1655 + }, + { + "epoch": 0.2819203268641471, + "grad_norm": 1.9806582927703857, + "learning_rate": 1e-06, + "loss": 0.0307, + "step": 1656 + }, + { + "epoch": 0.28209056860742254, + "grad_norm": 1.548424243927002, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 1657 + }, + { + "epoch": 0.282260810350698, + "grad_norm": 1.318804144859314, + "learning_rate": 1e-06, + "loss": 0.0148, + "step": 1658 + }, + { + "epoch": 0.28243105209397346, + "grad_norm": 1.7616188526153564, + "learning_rate": 1e-06, + "loss": 0.0308, + "step": 1659 + }, + { + "epoch": 0.2826012938372489, + "grad_norm": 1.4536751508712769, + "learning_rate": 1e-06, + "loss": 0.0226, + "step": 1660 + }, + { + "epoch": 0.28277153558052437, + "grad_norm": 1.8000961542129517, + "learning_rate": 1e-06, + "loss": 0.0287, + "step": 1661 + }, + { + "epoch": 0.28294177732379977, + "grad_norm": 1.4188201427459717, + "learning_rate": 1e-06, + "loss": 0.0202, + "step": 1662 + }, + { + "epoch": 0.2831120190670752, + "grad_norm": 1.6086786985397339, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 1663 + }, + { + "epoch": 0.2832822608103507, + "grad_norm": 1.3086551427841187, + "learning_rate": 1e-06, + "loss": 0.0232, + "step": 1664 + }, + { + "epoch": 0.28345250255362614, + "grad_norm": 1.5026829242706299, + "learning_rate": 1e-06, + "loss": 0.0203, + "step": 1665 + }, + { + "epoch": 0.2836227442969016, + "grad_norm": 1.9546946287155151, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 1666 + }, + { + "epoch": 0.28379298604017705, + "grad_norm": 1.1366575956344604, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 1667 + }, + { + "epoch": 0.2839632277834525, + "grad_norm": 1.8793129920959473, + "learning_rate": 1e-06, + "loss": 0.0443, + "step": 1668 + }, + { + "epoch": 0.28413346952672797, + "grad_norm": 1.4484946727752686, + "learning_rate": 1e-06, + "loss": 0.0312, + "step": 1669 + }, + { + "epoch": 0.2843037112700034, + "grad_norm": 1.7814072370529175, + "learning_rate": 1e-06, + "loss": 0.0346, + "step": 1670 + }, + { + "epoch": 0.2844739530132789, + "grad_norm": 1.5171910524368286, + "learning_rate": 1e-06, + "loss": 0.0231, + "step": 1671 + }, + { + "epoch": 0.2846441947565543, + "grad_norm": 1.7551907300949097, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 1672 + }, + { + "epoch": 0.28481443649982974, + "grad_norm": 1.8677160739898682, + "learning_rate": 1e-06, + "loss": 0.0276, + "step": 1673 + }, + { + "epoch": 0.2849846782431052, + "grad_norm": 1.342197299003601, + "learning_rate": 1e-06, + "loss": 0.0183, + "step": 1674 + }, + { + "epoch": 0.28515491998638065, + "grad_norm": 1.656657338142395, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 1675 + }, + { + "epoch": 0.2853251617296561, + "grad_norm": 1.6236470937728882, + "learning_rate": 1e-06, + "loss": 0.0234, + "step": 1676 + }, + { + "epoch": 0.28549540347293156, + "grad_norm": 1.7253212928771973, + "learning_rate": 1e-06, + "loss": 0.0274, + "step": 1677 + }, + { + "epoch": 0.285665645216207, + "grad_norm": 1.9276634454727173, + "learning_rate": 1e-06, + "loss": 0.0295, + "step": 1678 + }, + { + "epoch": 0.2858358869594825, + "grad_norm": 1.4296467304229736, + "learning_rate": 1e-06, + "loss": 0.0208, + "step": 1679 + }, + { + "epoch": 0.28600612870275793, + "grad_norm": 2.423842430114746, + "learning_rate": 1e-06, + "loss": 0.0329, + "step": 1680 + }, + { + "epoch": 0.2861763704460334, + "grad_norm": 1.48721444606781, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 1681 + }, + { + "epoch": 0.2863466121893088, + "grad_norm": 1.7897123098373413, + "learning_rate": 1e-06, + "loss": 0.0281, + "step": 1682 + }, + { + "epoch": 0.28651685393258425, + "grad_norm": 1.6738888025283813, + "learning_rate": 1e-06, + "loss": 0.0332, + "step": 1683 + }, + { + "epoch": 0.2866870956758597, + "grad_norm": 1.615144968032837, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 1684 + }, + { + "epoch": 0.28685733741913516, + "grad_norm": 1.4232816696166992, + "learning_rate": 1e-06, + "loss": 0.0209, + "step": 1685 + }, + { + "epoch": 0.2870275791624106, + "grad_norm": 1.6573175191879272, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 1686 + }, + { + "epoch": 0.2871978209056861, + "grad_norm": 1.7181310653686523, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 1687 + }, + { + "epoch": 0.28736806264896153, + "grad_norm": 1.8144760131835938, + "learning_rate": 1e-06, + "loss": 0.0373, + "step": 1688 + }, + { + "epoch": 0.287538304392237, + "grad_norm": 1.360830545425415, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 1689 + }, + { + "epoch": 0.28770854613551244, + "grad_norm": 1.4611048698425293, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 1690 + }, + { + "epoch": 0.2878787878787879, + "grad_norm": 1.525647759437561, + "learning_rate": 1e-06, + "loss": 0.0283, + "step": 1691 + }, + { + "epoch": 0.28804902962206336, + "grad_norm": 1.4439351558685303, + "learning_rate": 1e-06, + "loss": 0.0274, + "step": 1692 + }, + { + "epoch": 0.28821927136533876, + "grad_norm": 1.6593124866485596, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 1693 + }, + { + "epoch": 0.2883895131086142, + "grad_norm": 1.6211251020431519, + "learning_rate": 1e-06, + "loss": 0.0388, + "step": 1694 + }, + { + "epoch": 0.28855975485188967, + "grad_norm": 1.6730520725250244, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 1695 + }, + { + "epoch": 0.28872999659516513, + "grad_norm": 1.850048542022705, + "learning_rate": 1e-06, + "loss": 0.0202, + "step": 1696 + }, + { + "epoch": 0.2889002383384406, + "grad_norm": 1.4730114936828613, + "learning_rate": 1e-06, + "loss": 0.0235, + "step": 1697 + }, + { + "epoch": 0.28907048008171604, + "grad_norm": 1.8073022365570068, + "learning_rate": 1e-06, + "loss": 0.04, + "step": 1698 + }, + { + "epoch": 0.2892407218249915, + "grad_norm": 1.8779622316360474, + "learning_rate": 1e-06, + "loss": 0.0437, + "step": 1699 + }, + { + "epoch": 0.28941096356826695, + "grad_norm": 1.5742496252059937, + "learning_rate": 1e-06, + "loss": 0.0293, + "step": 1700 + }, + { + "epoch": 0.2895812053115424, + "grad_norm": 1.4416614770889282, + "learning_rate": 1e-06, + "loss": 0.023, + "step": 1701 + }, + { + "epoch": 0.28975144705481787, + "grad_norm": 1.4189447164535522, + "learning_rate": 1e-06, + "loss": 0.0281, + "step": 1702 + }, + { + "epoch": 0.28992168879809327, + "grad_norm": 1.4489812850952148, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 1703 + }, + { + "epoch": 0.2900919305413687, + "grad_norm": 1.590919852256775, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1704 + }, + { + "epoch": 0.2902621722846442, + "grad_norm": 1.528601884841919, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 1705 + }, + { + "epoch": 0.29043241402791964, + "grad_norm": 1.3978060483932495, + "learning_rate": 1e-06, + "loss": 0.0202, + "step": 1706 + }, + { + "epoch": 0.2906026557711951, + "grad_norm": 1.4895422458648682, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 1707 + }, + { + "epoch": 0.29077289751447055, + "grad_norm": 1.5982104539871216, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 1708 + }, + { + "epoch": 0.290943139257746, + "grad_norm": 1.5358942747116089, + "learning_rate": 1e-06, + "loss": 0.0223, + "step": 1709 + }, + { + "epoch": 0.29111338100102147, + "grad_norm": 1.491877794265747, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 1710 + }, + { + "epoch": 0.2912836227442969, + "grad_norm": 1.6114052534103394, + "learning_rate": 1e-06, + "loss": 0.0226, + "step": 1711 + }, + { + "epoch": 0.2914538644875724, + "grad_norm": 1.6193110942840576, + "learning_rate": 1e-06, + "loss": 0.0221, + "step": 1712 + }, + { + "epoch": 0.2916241062308478, + "grad_norm": 1.40981924533844, + "learning_rate": 1e-06, + "loss": 0.0209, + "step": 1713 + }, + { + "epoch": 0.29179434797412324, + "grad_norm": 2.308316469192505, + "learning_rate": 1e-06, + "loss": 0.0314, + "step": 1714 + }, + { + "epoch": 0.2919645897173987, + "grad_norm": 1.631408929824829, + "learning_rate": 1e-06, + "loss": 0.0307, + "step": 1715 + }, + { + "epoch": 0.29213483146067415, + "grad_norm": 1.4540927410125732, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 1716 + }, + { + "epoch": 0.2923050732039496, + "grad_norm": 1.7663581371307373, + "learning_rate": 1e-06, + "loss": 0.0332, + "step": 1717 + }, + { + "epoch": 0.29247531494722506, + "grad_norm": 1.2516728639602661, + "learning_rate": 1e-06, + "loss": 0.021, + "step": 1718 + }, + { + "epoch": 0.2926455566905005, + "grad_norm": 1.9072999954223633, + "learning_rate": 1e-06, + "loss": 0.0308, + "step": 1719 + }, + { + "epoch": 0.292815798433776, + "grad_norm": 1.8111470937728882, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1720 + }, + { + "epoch": 0.29298604017705143, + "grad_norm": 1.5507558584213257, + "learning_rate": 1e-06, + "loss": 0.022, + "step": 1721 + }, + { + "epoch": 0.2931562819203269, + "grad_norm": 1.2865614891052246, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 1722 + }, + { + "epoch": 0.2933265236636023, + "grad_norm": 1.6672788858413696, + "learning_rate": 1e-06, + "loss": 0.0191, + "step": 1723 + }, + { + "epoch": 0.29349676540687775, + "grad_norm": 1.670101284980774, + "learning_rate": 1e-06, + "loss": 0.03, + "step": 1724 + }, + { + "epoch": 0.2936670071501532, + "grad_norm": 1.2135088443756104, + "learning_rate": 1e-06, + "loss": 0.0219, + "step": 1725 + }, + { + "epoch": 0.29383724889342866, + "grad_norm": 1.2077770233154297, + "learning_rate": 1e-06, + "loss": 0.0179, + "step": 1726 + }, + { + "epoch": 0.2940074906367041, + "grad_norm": 1.3592512607574463, + "learning_rate": 1e-06, + "loss": 0.0235, + "step": 1727 + }, + { + "epoch": 0.2941777323799796, + "grad_norm": 1.661252737045288, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 1728 + }, + { + "epoch": 0.29434797412325503, + "grad_norm": 1.8278785943984985, + "learning_rate": 1e-06, + "loss": 0.0403, + "step": 1729 + }, + { + "epoch": 0.2945182158665305, + "grad_norm": 1.3359884023666382, + "learning_rate": 1e-06, + "loss": 0.0206, + "step": 1730 + }, + { + "epoch": 0.29468845760980594, + "grad_norm": 1.8488993644714355, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 1731 + }, + { + "epoch": 0.2948586993530814, + "grad_norm": 1.6495951414108276, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 1732 + }, + { + "epoch": 0.2950289410963568, + "grad_norm": 2.0835161209106445, + "learning_rate": 1e-06, + "loss": 0.0373, + "step": 1733 + }, + { + "epoch": 0.29519918283963226, + "grad_norm": 1.2408726215362549, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 1734 + }, + { + "epoch": 0.2953694245829077, + "grad_norm": 1.3629958629608154, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 1735 + }, + { + "epoch": 0.29553966632618317, + "grad_norm": 1.1778775453567505, + "learning_rate": 1e-06, + "loss": 0.0185, + "step": 1736 + }, + { + "epoch": 0.2957099080694586, + "grad_norm": 1.1877045631408691, + "learning_rate": 1e-06, + "loss": 0.0216, + "step": 1737 + }, + { + "epoch": 0.2958801498127341, + "grad_norm": 1.8210134506225586, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 1738 + }, + { + "epoch": 0.29605039155600954, + "grad_norm": 1.5456937551498413, + "learning_rate": 1e-06, + "loss": 0.0188, + "step": 1739 + }, + { + "epoch": 0.296220633299285, + "grad_norm": 1.523476004600525, + "learning_rate": 1e-06, + "loss": 0.0195, + "step": 1740 + }, + { + "epoch": 0.29639087504256045, + "grad_norm": 1.5979241132736206, + "learning_rate": 1e-06, + "loss": 0.0226, + "step": 1741 + }, + { + "epoch": 0.2965611167858359, + "grad_norm": 1.4582843780517578, + "learning_rate": 1e-06, + "loss": 0.0227, + "step": 1742 + }, + { + "epoch": 0.2967313585291113, + "grad_norm": 1.4900776147842407, + "learning_rate": 1e-06, + "loss": 0.0283, + "step": 1743 + }, + { + "epoch": 0.29690160027238677, + "grad_norm": 1.4199949502944946, + "learning_rate": 1e-06, + "loss": 0.031, + "step": 1744 + }, + { + "epoch": 0.2970718420156622, + "grad_norm": 1.9383891820907593, + "learning_rate": 1e-06, + "loss": 0.0351, + "step": 1745 + }, + { + "epoch": 0.2972420837589377, + "grad_norm": 1.6513484716415405, + "learning_rate": 1e-06, + "loss": 0.0283, + "step": 1746 + }, + { + "epoch": 0.29741232550221314, + "grad_norm": 1.4408584833145142, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 1747 + }, + { + "epoch": 0.2975825672454886, + "grad_norm": 1.4601348638534546, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 1748 + }, + { + "epoch": 0.29775280898876405, + "grad_norm": 2.014059066772461, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 1749 + }, + { + "epoch": 0.2979230507320395, + "grad_norm": 1.4093255996704102, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 1750 + }, + { + "epoch": 0.29809329247531496, + "grad_norm": 1.8597747087478638, + "learning_rate": 1e-06, + "loss": 0.0305, + "step": 1751 + }, + { + "epoch": 0.2982635342185904, + "grad_norm": 1.4685695171356201, + "learning_rate": 1e-06, + "loss": 0.0226, + "step": 1752 + }, + { + "epoch": 0.2984337759618659, + "grad_norm": 1.4107317924499512, + "learning_rate": 1e-06, + "loss": 0.0278, + "step": 1753 + }, + { + "epoch": 0.2986040177051413, + "grad_norm": 1.1390619277954102, + "learning_rate": 1e-06, + "loss": 0.0272, + "step": 1754 + }, + { + "epoch": 0.29877425944841673, + "grad_norm": 1.5630130767822266, + "learning_rate": 1e-06, + "loss": 0.03, + "step": 1755 + }, + { + "epoch": 0.2989445011916922, + "grad_norm": 1.6469208002090454, + "learning_rate": 1e-06, + "loss": 0.02, + "step": 1756 + }, + { + "epoch": 0.29911474293496765, + "grad_norm": 1.4508461952209473, + "learning_rate": 1e-06, + "loss": 0.028, + "step": 1757 + }, + { + "epoch": 0.2992849846782431, + "grad_norm": 1.506699800491333, + "learning_rate": 1e-06, + "loss": 0.0292, + "step": 1758 + }, + { + "epoch": 0.29945522642151856, + "grad_norm": 1.0725475549697876, + "learning_rate": 1e-06, + "loss": 0.0173, + "step": 1759 + }, + { + "epoch": 0.299625468164794, + "grad_norm": 1.733446717262268, + "learning_rate": 1e-06, + "loss": 0.0375, + "step": 1760 + }, + { + "epoch": 0.2997957099080695, + "grad_norm": 1.4227663278579712, + "learning_rate": 1e-06, + "loss": 0.0203, + "step": 1761 + }, + { + "epoch": 0.29996595165134493, + "grad_norm": 1.4566940069198608, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 1762 + }, + { + "epoch": 0.3001361933946204, + "grad_norm": 1.6807788610458374, + "learning_rate": 1e-06, + "loss": 0.0281, + "step": 1763 + }, + { + "epoch": 0.3003064351378958, + "grad_norm": 1.8362802267074585, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 1764 + }, + { + "epoch": 0.30047667688117125, + "grad_norm": 1.8242303133010864, + "learning_rate": 1e-06, + "loss": 0.0274, + "step": 1765 + }, + { + "epoch": 0.3006469186244467, + "grad_norm": 1.639448881149292, + "learning_rate": 1e-06, + "loss": 0.0282, + "step": 1766 + }, + { + "epoch": 0.30081716036772216, + "grad_norm": 1.406978726387024, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 1767 + }, + { + "epoch": 0.3009874021109976, + "grad_norm": 1.5948950052261353, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 1768 + }, + { + "epoch": 0.30115764385427307, + "grad_norm": 1.1911594867706299, + "learning_rate": 1e-06, + "loss": 0.016, + "step": 1769 + }, + { + "epoch": 0.30132788559754853, + "grad_norm": 1.3455018997192383, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 1770 + }, + { + "epoch": 0.301498127340824, + "grad_norm": 1.5457820892333984, + "learning_rate": 1e-06, + "loss": 0.021, + "step": 1771 + }, + { + "epoch": 0.30166836908409944, + "grad_norm": 1.7167928218841553, + "learning_rate": 1e-06, + "loss": 0.0282, + "step": 1772 + }, + { + "epoch": 0.3018386108273749, + "grad_norm": 1.1246789693832397, + "learning_rate": 1e-06, + "loss": 0.0225, + "step": 1773 + }, + { + "epoch": 0.3020088525706503, + "grad_norm": 1.7185958623886108, + "learning_rate": 1e-06, + "loss": 0.0319, + "step": 1774 + }, + { + "epoch": 0.30217909431392576, + "grad_norm": 1.2617838382720947, + "learning_rate": 1e-06, + "loss": 0.0219, + "step": 1775 + }, + { + "epoch": 0.3023493360572012, + "grad_norm": 1.5254855155944824, + "learning_rate": 1e-06, + "loss": 0.0202, + "step": 1776 + }, + { + "epoch": 0.30251957780047667, + "grad_norm": 1.1870532035827637, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 1777 + }, + { + "epoch": 0.3026898195437521, + "grad_norm": 1.7242926359176636, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 1778 + }, + { + "epoch": 0.3028600612870276, + "grad_norm": 1.8268846273422241, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 1779 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 1.4591509103775024, + "learning_rate": 1e-06, + "loss": 0.0193, + "step": 1780 + }, + { + "epoch": 0.3032005447735785, + "grad_norm": 1.7261550426483154, + "learning_rate": 1e-06, + "loss": 0.0299, + "step": 1781 + }, + { + "epoch": 0.30337078651685395, + "grad_norm": 1.9499040842056274, + "learning_rate": 1e-06, + "loss": 0.0374, + "step": 1782 + }, + { + "epoch": 0.3035410282601294, + "grad_norm": 1.2761621475219727, + "learning_rate": 1e-06, + "loss": 0.0215, + "step": 1783 + }, + { + "epoch": 0.3037112700034048, + "grad_norm": 1.402917742729187, + "learning_rate": 1e-06, + "loss": 0.0223, + "step": 1784 + }, + { + "epoch": 0.30388151174668027, + "grad_norm": 3.2693288326263428, + "learning_rate": 1e-06, + "loss": 0.0334, + "step": 1785 + }, + { + "epoch": 0.3040517534899557, + "grad_norm": 1.3207221031188965, + "learning_rate": 1e-06, + "loss": 0.0174, + "step": 1786 + }, + { + "epoch": 0.3042219952332312, + "grad_norm": 2.6473004817962646, + "learning_rate": 1e-06, + "loss": 0.0406, + "step": 1787 + }, + { + "epoch": 0.30439223697650664, + "grad_norm": 1.6894813776016235, + "learning_rate": 1e-06, + "loss": 0.0283, + "step": 1788 + }, + { + "epoch": 0.3045624787197821, + "grad_norm": 1.3546245098114014, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 1789 + }, + { + "epoch": 0.30473272046305755, + "grad_norm": 1.6987773180007935, + "learning_rate": 1e-06, + "loss": 0.0301, + "step": 1790 + }, + { + "epoch": 0.304902962206333, + "grad_norm": 1.6079025268554688, + "learning_rate": 1e-06, + "loss": 0.0222, + "step": 1791 + }, + { + "epoch": 0.30507320394960846, + "grad_norm": 1.5735406875610352, + "learning_rate": 1e-06, + "loss": 0.0222, + "step": 1792 + }, + { + "epoch": 0.3052434456928839, + "grad_norm": 1.862619400024414, + "learning_rate": 1e-06, + "loss": 0.0432, + "step": 1793 + }, + { + "epoch": 0.3054136874361593, + "grad_norm": 1.18687903881073, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 1794 + }, + { + "epoch": 0.3055839291794348, + "grad_norm": 1.4821864366531372, + "learning_rate": 1e-06, + "loss": 0.0234, + "step": 1795 + }, + { + "epoch": 0.30575417092271023, + "grad_norm": 1.6538077592849731, + "learning_rate": 1e-06, + "loss": 0.0225, + "step": 1796 + }, + { + "epoch": 0.3059244126659857, + "grad_norm": 1.546225666999817, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1797 + }, + { + "epoch": 0.30609465440926115, + "grad_norm": 1.4453809261322021, + "learning_rate": 1e-06, + "loss": 0.0216, + "step": 1798 + }, + { + "epoch": 0.3062648961525366, + "grad_norm": 2.1485629081726074, + "learning_rate": 1e-06, + "loss": 0.0324, + "step": 1799 + }, + { + "epoch": 0.30643513789581206, + "grad_norm": 1.6263507604599, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 1800 + }, + { + "epoch": 0.3066053796390875, + "grad_norm": 1.6911835670471191, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 1801 + }, + { + "epoch": 0.306775621382363, + "grad_norm": 1.4971544742584229, + "learning_rate": 1e-06, + "loss": 0.0215, + "step": 1802 + }, + { + "epoch": 0.30694586312563843, + "grad_norm": 1.3353301286697388, + "learning_rate": 1e-06, + "loss": 0.0167, + "step": 1803 + }, + { + "epoch": 0.30711610486891383, + "grad_norm": 1.5353763103485107, + "learning_rate": 1e-06, + "loss": 0.021, + "step": 1804 + }, + { + "epoch": 0.3072863466121893, + "grad_norm": 1.2348135709762573, + "learning_rate": 1e-06, + "loss": 0.0203, + "step": 1805 + }, + { + "epoch": 0.30745658835546474, + "grad_norm": 1.4471068382263184, + "learning_rate": 1e-06, + "loss": 0.0204, + "step": 1806 + }, + { + "epoch": 0.3076268300987402, + "grad_norm": 1.7940800189971924, + "learning_rate": 1e-06, + "loss": 0.0204, + "step": 1807 + }, + { + "epoch": 0.30779707184201566, + "grad_norm": 1.4490859508514404, + "learning_rate": 1e-06, + "loss": 0.0186, + "step": 1808 + }, + { + "epoch": 0.3079673135852911, + "grad_norm": 1.3849128484725952, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 1809 + }, + { + "epoch": 0.30813755532856657, + "grad_norm": 1.6811866760253906, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 1810 + }, + { + "epoch": 0.308307797071842, + "grad_norm": 1.5585503578186035, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 1811 + }, + { + "epoch": 0.3084780388151175, + "grad_norm": 1.498989462852478, + "learning_rate": 1e-06, + "loss": 0.0215, + "step": 1812 + }, + { + "epoch": 0.30864828055839294, + "grad_norm": 1.5518637895584106, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 1813 + }, + { + "epoch": 0.3088185223016684, + "grad_norm": 1.9985408782958984, + "learning_rate": 1e-06, + "loss": 0.0384, + "step": 1814 + }, + { + "epoch": 0.3089887640449438, + "grad_norm": 1.9101368188858032, + "learning_rate": 1e-06, + "loss": 0.022, + "step": 1815 + }, + { + "epoch": 0.30915900578821925, + "grad_norm": 1.438533902168274, + "learning_rate": 1e-06, + "loss": 0.0206, + "step": 1816 + }, + { + "epoch": 0.3093292475314947, + "grad_norm": 1.6751413345336914, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 1817 + }, + { + "epoch": 0.30949948927477017, + "grad_norm": 1.1888127326965332, + "learning_rate": 1e-06, + "loss": 0.0207, + "step": 1818 + }, + { + "epoch": 0.3096697310180456, + "grad_norm": 2.00555157661438, + "learning_rate": 1e-06, + "loss": 0.0188, + "step": 1819 + }, + { + "epoch": 0.3098399727613211, + "grad_norm": 1.2455949783325195, + "learning_rate": 1e-06, + "loss": 0.021, + "step": 1820 + }, + { + "epoch": 0.31001021450459654, + "grad_norm": 1.2500693798065186, + "learning_rate": 1e-06, + "loss": 0.0188, + "step": 1821 + }, + { + "epoch": 0.310180456247872, + "grad_norm": 1.3195619583129883, + "learning_rate": 1e-06, + "loss": 0.022, + "step": 1822 + }, + { + "epoch": 0.31035069799114745, + "grad_norm": 1.429261326789856, + "learning_rate": 1e-06, + "loss": 0.0294, + "step": 1823 + }, + { + "epoch": 0.3105209397344229, + "grad_norm": 1.3425854444503784, + "learning_rate": 1e-06, + "loss": 0.0209, + "step": 1824 + }, + { + "epoch": 0.3106911814776983, + "grad_norm": 1.5613363981246948, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 1825 + }, + { + "epoch": 0.31086142322097376, + "grad_norm": 1.9861626625061035, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 1826 + }, + { + "epoch": 0.3110316649642492, + "grad_norm": 1.774733304977417, + "learning_rate": 1e-06, + "loss": 0.0295, + "step": 1827 + }, + { + "epoch": 0.3112019067075247, + "grad_norm": 1.4516446590423584, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 1828 + }, + { + "epoch": 0.31137214845080013, + "grad_norm": 1.5618529319763184, + "learning_rate": 1e-06, + "loss": 0.0227, + "step": 1829 + }, + { + "epoch": 0.3115423901940756, + "grad_norm": 1.410374641418457, + "learning_rate": 1e-06, + "loss": 0.019, + "step": 1830 + }, + { + "epoch": 0.31171263193735105, + "grad_norm": 1.3269094228744507, + "learning_rate": 1e-06, + "loss": 0.0232, + "step": 1831 + }, + { + "epoch": 0.3118828736806265, + "grad_norm": 2.6260929107666016, + "learning_rate": 1e-06, + "loss": 0.0334, + "step": 1832 + }, + { + "epoch": 0.31205311542390196, + "grad_norm": 1.4052633047103882, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 1833 + }, + { + "epoch": 0.3122233571671774, + "grad_norm": 1.3996262550354004, + "learning_rate": 1e-06, + "loss": 0.0231, + "step": 1834 + }, + { + "epoch": 0.3123935989104528, + "grad_norm": 1.7314420938491821, + "learning_rate": 1e-06, + "loss": 0.0289, + "step": 1835 + }, + { + "epoch": 0.3125638406537283, + "grad_norm": 1.2537227869033813, + "learning_rate": 1e-06, + "loss": 0.0209, + "step": 1836 + }, + { + "epoch": 0.31273408239700373, + "grad_norm": 1.3801499605178833, + "learning_rate": 1e-06, + "loss": 0.023, + "step": 1837 + }, + { + "epoch": 0.3129043241402792, + "grad_norm": 1.6672334671020508, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1838 + }, + { + "epoch": 0.31307456588355465, + "grad_norm": 1.7810288667678833, + "learning_rate": 1e-06, + "loss": 0.031, + "step": 1839 + }, + { + "epoch": 0.3132448076268301, + "grad_norm": 1.435265064239502, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 1840 + }, + { + "epoch": 0.31341504937010556, + "grad_norm": 1.537177562713623, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 1841 + }, + { + "epoch": 0.313585291113381, + "grad_norm": 1.7948952913284302, + "learning_rate": 1e-06, + "loss": 0.0289, + "step": 1842 + }, + { + "epoch": 0.31375553285665647, + "grad_norm": 1.2194470167160034, + "learning_rate": 1e-06, + "loss": 0.0172, + "step": 1843 + }, + { + "epoch": 0.31392577459993193, + "grad_norm": 1.6376681327819824, + "learning_rate": 1e-06, + "loss": 0.0291, + "step": 1844 + }, + { + "epoch": 0.31409601634320733, + "grad_norm": 1.389352798461914, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 1845 + }, + { + "epoch": 0.3142662580864828, + "grad_norm": 1.5804452896118164, + "learning_rate": 1e-06, + "loss": 0.0386, + "step": 1846 + }, + { + "epoch": 0.31443649982975824, + "grad_norm": 1.9733364582061768, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1847 + }, + { + "epoch": 0.3146067415730337, + "grad_norm": 1.4925074577331543, + "learning_rate": 1e-06, + "loss": 0.0318, + "step": 1848 + }, + { + "epoch": 0.31477698331630916, + "grad_norm": 1.4996342658996582, + "learning_rate": 1e-06, + "loss": 0.0203, + "step": 1849 + }, + { + "epoch": 0.3149472250595846, + "grad_norm": 1.4963217973709106, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 1850 + }, + { + "epoch": 0.31511746680286007, + "grad_norm": 1.6725646257400513, + "learning_rate": 1e-06, + "loss": 0.0176, + "step": 1851 + }, + { + "epoch": 0.3152877085461355, + "grad_norm": 1.3835301399230957, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 1852 + }, + { + "epoch": 0.315457950289411, + "grad_norm": 1.7713634967803955, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 1853 + }, + { + "epoch": 0.31562819203268644, + "grad_norm": 1.1334712505340576, + "learning_rate": 1e-06, + "loss": 0.0175, + "step": 1854 + }, + { + "epoch": 0.31579843377596184, + "grad_norm": 1.6713346242904663, + "learning_rate": 1e-06, + "loss": 0.0192, + "step": 1855 + }, + { + "epoch": 0.3159686755192373, + "grad_norm": 1.463097095489502, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 1856 + }, + { + "epoch": 0.31613891726251275, + "grad_norm": 1.6081440448760986, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 1857 + }, + { + "epoch": 0.3163091590057882, + "grad_norm": 1.7677350044250488, + "learning_rate": 1e-06, + "loss": 0.022, + "step": 1858 + }, + { + "epoch": 0.31647940074906367, + "grad_norm": 2.0143535137176514, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 1859 + }, + { + "epoch": 0.3166496424923391, + "grad_norm": 1.4883443117141724, + "learning_rate": 1e-06, + "loss": 0.0303, + "step": 1860 + }, + { + "epoch": 0.3168198842356146, + "grad_norm": 1.5226455926895142, + "learning_rate": 1e-06, + "loss": 0.0199, + "step": 1861 + }, + { + "epoch": 0.31699012597889004, + "grad_norm": 1.593501091003418, + "learning_rate": 1e-06, + "loss": 0.0232, + "step": 1862 + }, + { + "epoch": 0.3171603677221655, + "grad_norm": 1.3263167142868042, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 1863 + }, + { + "epoch": 0.31733060946544095, + "grad_norm": 1.4621020555496216, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 1864 + }, + { + "epoch": 0.31750085120871635, + "grad_norm": 1.8614875078201294, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 1865 + }, + { + "epoch": 0.3176710929519918, + "grad_norm": 1.5570732355117798, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 1866 + }, + { + "epoch": 0.31784133469526726, + "grad_norm": 1.5920957326889038, + "learning_rate": 1e-06, + "loss": 0.0283, + "step": 1867 + }, + { + "epoch": 0.3180115764385427, + "grad_norm": 1.3949881792068481, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 1868 + }, + { + "epoch": 0.3181818181818182, + "grad_norm": 1.364362359046936, + "learning_rate": 1e-06, + "loss": 0.0292, + "step": 1869 + }, + { + "epoch": 0.31835205992509363, + "grad_norm": 1.4598324298858643, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 1870 + }, + { + "epoch": 0.3185223016683691, + "grad_norm": 1.320860505104065, + "learning_rate": 1e-06, + "loss": 0.0198, + "step": 1871 + }, + { + "epoch": 0.31869254341164455, + "grad_norm": 1.9381211996078491, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 1872 + }, + { + "epoch": 0.31886278515492, + "grad_norm": 1.4324305057525635, + "learning_rate": 1e-06, + "loss": 0.029, + "step": 1873 + }, + { + "epoch": 0.31903302689819546, + "grad_norm": 1.2118055820465088, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 1874 + }, + { + "epoch": 0.3192032686414709, + "grad_norm": 1.200643539428711, + "learning_rate": 1e-06, + "loss": 0.0174, + "step": 1875 + }, + { + "epoch": 0.3193735103847463, + "grad_norm": 1.5699355602264404, + "learning_rate": 1e-06, + "loss": 0.0288, + "step": 1876 + }, + { + "epoch": 0.3195437521280218, + "grad_norm": 1.7179347276687622, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 1877 + }, + { + "epoch": 0.31971399387129723, + "grad_norm": 1.8065325021743774, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 1878 + }, + { + "epoch": 0.3198842356145727, + "grad_norm": 2.4595632553100586, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 1879 + }, + { + "epoch": 0.32005447735784814, + "grad_norm": 1.140232801437378, + "learning_rate": 1e-06, + "loss": 0.0174, + "step": 1880 + }, + { + "epoch": 0.3202247191011236, + "grad_norm": 1.3284320831298828, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 1881 + }, + { + "epoch": 0.32039496084439906, + "grad_norm": 1.7007908821105957, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 1882 + }, + { + "epoch": 0.3205652025876745, + "grad_norm": 1.8130091428756714, + "learning_rate": 1e-06, + "loss": 0.0322, + "step": 1883 + }, + { + "epoch": 0.32073544433094997, + "grad_norm": 1.434629201889038, + "learning_rate": 1e-06, + "loss": 0.0176, + "step": 1884 + }, + { + "epoch": 0.3209056860742254, + "grad_norm": 1.0741052627563477, + "learning_rate": 1e-06, + "loss": 0.0174, + "step": 1885 + }, + { + "epoch": 0.32107592781750083, + "grad_norm": 1.5076146125793457, + "learning_rate": 1e-06, + "loss": 0.0187, + "step": 1886 + }, + { + "epoch": 0.3212461695607763, + "grad_norm": 1.420843243598938, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 1887 + }, + { + "epoch": 0.32141641130405174, + "grad_norm": 1.4577027559280396, + "learning_rate": 1e-06, + "loss": 0.019, + "step": 1888 + }, + { + "epoch": 0.3215866530473272, + "grad_norm": 1.7815074920654297, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 1889 + }, + { + "epoch": 0.32175689479060265, + "grad_norm": 1.2624595165252686, + "learning_rate": 1e-06, + "loss": 0.023, + "step": 1890 + }, + { + "epoch": 0.3219271365338781, + "grad_norm": 1.2014528512954712, + "learning_rate": 1e-06, + "loss": 0.0191, + "step": 1891 + }, + { + "epoch": 0.32209737827715357, + "grad_norm": 1.948778510093689, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 1892 + }, + { + "epoch": 0.322267620020429, + "grad_norm": 1.3787339925765991, + "learning_rate": 1e-06, + "loss": 0.0231, + "step": 1893 + }, + { + "epoch": 0.3224378617637045, + "grad_norm": 1.5238713026046753, + "learning_rate": 1e-06, + "loss": 0.0195, + "step": 1894 + }, + { + "epoch": 0.32260810350697994, + "grad_norm": 1.623833417892456, + "learning_rate": 1e-06, + "loss": 0.0222, + "step": 1895 + }, + { + "epoch": 0.32277834525025534, + "grad_norm": 1.7165459394454956, + "learning_rate": 1e-06, + "loss": 0.0294, + "step": 1896 + }, + { + "epoch": 0.3229485869935308, + "grad_norm": 1.7165459394454956, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1897 + }, + { + "epoch": 0.32311882873680625, + "grad_norm": 1.6914831399917603, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 1898 + }, + { + "epoch": 0.3232890704800817, + "grad_norm": 1.4278634786605835, + "learning_rate": 1e-06, + "loss": 0.0173, + "step": 1899 + }, + { + "epoch": 0.32345931222335716, + "grad_norm": 1.3899396657943726, + "learning_rate": 1e-06, + "loss": 0.0216, + "step": 1900 + }, + { + "epoch": 0.3236295539666326, + "grad_norm": 1.8707932233810425, + "learning_rate": 1e-06, + "loss": 0.0296, + "step": 1901 + }, + { + "epoch": 0.3237997957099081, + "grad_norm": 1.5746642351150513, + "learning_rate": 1e-06, + "loss": 0.0202, + "step": 1902 + }, + { + "epoch": 0.32397003745318353, + "grad_norm": 1.5746642351150513, + "learning_rate": 1e-06, + "loss": 0.0459, + "step": 1903 + }, + { + "epoch": 0.324140279196459, + "grad_norm": 1.444451928138733, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 1904 + }, + { + "epoch": 0.32431052093973445, + "grad_norm": 1.6318027973175049, + "learning_rate": 1e-06, + "loss": 0.0168, + "step": 1905 + }, + { + "epoch": 0.32448076268300985, + "grad_norm": 1.5491737127304077, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 1906 + }, + { + "epoch": 0.3246510044262853, + "grad_norm": 1.1921838521957397, + "learning_rate": 1e-06, + "loss": 0.0217, + "step": 1907 + }, + { + "epoch": 0.32482124616956076, + "grad_norm": 1.1826146841049194, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 1908 + }, + { + "epoch": 0.3249914879128362, + "grad_norm": 1.4094597101211548, + "learning_rate": 1e-06, + "loss": 0.019, + "step": 1909 + }, + { + "epoch": 0.3251617296561117, + "grad_norm": 1.6654062271118164, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 1910 + }, + { + "epoch": 0.32533197139938713, + "grad_norm": 1.497878909111023, + "learning_rate": 1e-06, + "loss": 0.0234, + "step": 1911 + }, + { + "epoch": 0.3255022131426626, + "grad_norm": 1.8206701278686523, + "learning_rate": 1e-06, + "loss": 0.0277, + "step": 1912 + }, + { + "epoch": 0.32567245488593805, + "grad_norm": 1.2228989601135254, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 1913 + }, + { + "epoch": 0.3258426966292135, + "grad_norm": 1.4604421854019165, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 1914 + }, + { + "epoch": 0.32601293837248896, + "grad_norm": 1.377022624015808, + "learning_rate": 1e-06, + "loss": 0.0206, + "step": 1915 + }, + { + "epoch": 0.32618318011576436, + "grad_norm": 3.5428032875061035, + "learning_rate": 1e-06, + "loss": 0.0489, + "step": 1916 + }, + { + "epoch": 0.3263534218590398, + "grad_norm": 1.4174351692199707, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 1917 + }, + { + "epoch": 0.3265236636023153, + "grad_norm": 1.2907001972198486, + "learning_rate": 1e-06, + "loss": 0.0171, + "step": 1918 + }, + { + "epoch": 0.32669390534559073, + "grad_norm": 1.2209053039550781, + "learning_rate": 1e-06, + "loss": 0.019, + "step": 1919 + }, + { + "epoch": 0.3268641470888662, + "grad_norm": 1.1538865566253662, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 1920 + }, + { + "epoch": 0.32703438883214164, + "grad_norm": 1.5341880321502686, + "learning_rate": 1e-06, + "loss": 0.019, + "step": 1921 + }, + { + "epoch": 0.3272046305754171, + "grad_norm": 1.3872793912887573, + "learning_rate": 1e-06, + "loss": 0.0181, + "step": 1922 + }, + { + "epoch": 0.32737487231869256, + "grad_norm": 1.6427528858184814, + "learning_rate": 1e-06, + "loss": 0.0374, + "step": 1923 + }, + { + "epoch": 0.327545114061968, + "grad_norm": 1.2254210710525513, + "learning_rate": 1e-06, + "loss": 0.0155, + "step": 1924 + }, + { + "epoch": 0.32771535580524347, + "grad_norm": 1.4030553102493286, + "learning_rate": 1e-06, + "loss": 0.0207, + "step": 1925 + }, + { + "epoch": 0.32788559754851887, + "grad_norm": 1.9961392879486084, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 1926 + }, + { + "epoch": 0.3280558392917943, + "grad_norm": 1.2701325416564941, + "learning_rate": 1e-06, + "loss": 0.0194, + "step": 1927 + }, + { + "epoch": 0.3282260810350698, + "grad_norm": 1.761595606803894, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 1928 + }, + { + "epoch": 0.32839632277834524, + "grad_norm": 1.5316126346588135, + "learning_rate": 1e-06, + "loss": 0.0224, + "step": 1929 + }, + { + "epoch": 0.3285665645216207, + "grad_norm": 1.808524489402771, + "learning_rate": 1e-06, + "loss": 0.0289, + "step": 1930 + }, + { + "epoch": 0.32873680626489615, + "grad_norm": 1.5288134813308716, + "learning_rate": 1e-06, + "loss": 0.023, + "step": 1931 + }, + { + "epoch": 0.3289070480081716, + "grad_norm": 1.089560627937317, + "learning_rate": 1e-06, + "loss": 0.0148, + "step": 1932 + }, + { + "epoch": 0.32907728975144707, + "grad_norm": 1.5515779256820679, + "learning_rate": 1e-06, + "loss": 0.0187, + "step": 1933 + }, + { + "epoch": 0.3292475314947225, + "grad_norm": 1.5481963157653809, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 1934 + }, + { + "epoch": 0.329417773237998, + "grad_norm": 1.1329618692398071, + "learning_rate": 1e-06, + "loss": 0.0175, + "step": 1935 + }, + { + "epoch": 0.3295880149812734, + "grad_norm": 1.2369213104248047, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 1936 + }, + { + "epoch": 0.32975825672454884, + "grad_norm": 1.7791813611984253, + "learning_rate": 1e-06, + "loss": 0.028, + "step": 1937 + }, + { + "epoch": 0.3299284984678243, + "grad_norm": 1.6558544635772705, + "learning_rate": 1e-06, + "loss": 0.0208, + "step": 1938 + }, + { + "epoch": 0.33009874021109975, + "grad_norm": 1.5044019222259521, + "learning_rate": 1e-06, + "loss": 0.0194, + "step": 1939 + }, + { + "epoch": 0.3302689819543752, + "grad_norm": 1.4375008344650269, + "learning_rate": 1e-06, + "loss": 0.021, + "step": 1940 + }, + { + "epoch": 0.33043922369765066, + "grad_norm": 1.3709217309951782, + "learning_rate": 1e-06, + "loss": 0.0155, + "step": 1941 + }, + { + "epoch": 0.3306094654409261, + "grad_norm": 1.812625765800476, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1942 + }, + { + "epoch": 0.3307797071842016, + "grad_norm": 1.2849189043045044, + "learning_rate": 1e-06, + "loss": 0.0187, + "step": 1943 + }, + { + "epoch": 0.33094994892747703, + "grad_norm": 1.6139867305755615, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 1944 + }, + { + "epoch": 0.3311201906707525, + "grad_norm": 1.5526776313781738, + "learning_rate": 1e-06, + "loss": 0.0236, + "step": 1945 + }, + { + "epoch": 0.33129043241402795, + "grad_norm": 1.4651141166687012, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 1946 + }, + { + "epoch": 0.33146067415730335, + "grad_norm": 2.1171090602874756, + "learning_rate": 1e-06, + "loss": 0.0345, + "step": 1947 + }, + { + "epoch": 0.3316309159005788, + "grad_norm": 1.532902717590332, + "learning_rate": 1e-06, + "loss": 0.0234, + "step": 1948 + }, + { + "epoch": 0.33180115764385426, + "grad_norm": 1.4364941120147705, + "learning_rate": 1e-06, + "loss": 0.0224, + "step": 1949 + }, + { + "epoch": 0.3319713993871297, + "grad_norm": 1.2812119722366333, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 1950 + }, + { + "epoch": 0.3321416411304052, + "grad_norm": 1.3794339895248413, + "learning_rate": 1e-06, + "loss": 0.02, + "step": 1951 + }, + { + "epoch": 0.33231188287368063, + "grad_norm": 1.5946048498153687, + "learning_rate": 1e-06, + "loss": 0.0179, + "step": 1952 + }, + { + "epoch": 0.3324821246169561, + "grad_norm": 2.0443038940429688, + "learning_rate": 1e-06, + "loss": 0.0215, + "step": 1953 + }, + { + "epoch": 0.33265236636023154, + "grad_norm": 1.368597149848938, + "learning_rate": 1e-06, + "loss": 0.0158, + "step": 1954 + }, + { + "epoch": 0.332822608103507, + "grad_norm": 1.441415786743164, + "learning_rate": 1e-06, + "loss": 0.0197, + "step": 1955 + }, + { + "epoch": 0.33299284984678246, + "grad_norm": 1.6092345714569092, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 1956 + }, + { + "epoch": 0.33316309159005786, + "grad_norm": 1.4012231826782227, + "learning_rate": 1e-06, + "loss": 0.0217, + "step": 1957 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.6141374111175537, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 1958 + }, + { + "epoch": 0.33350357507660877, + "grad_norm": 1.4171791076660156, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 1959 + }, + { + "epoch": 0.33367381681988423, + "grad_norm": 1.7821341753005981, + "learning_rate": 1e-06, + "loss": 0.0288, + "step": 1960 + }, + { + "epoch": 0.3338440585631597, + "grad_norm": 1.6299926042556763, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 1961 + }, + { + "epoch": 0.33401430030643514, + "grad_norm": 1.5912959575653076, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 1962 + }, + { + "epoch": 0.3341845420497106, + "grad_norm": 1.282433271408081, + "learning_rate": 1e-06, + "loss": 0.0169, + "step": 1963 + }, + { + "epoch": 0.33435478379298605, + "grad_norm": 1.5464577674865723, + "learning_rate": 1e-06, + "loss": 0.027, + "step": 1964 + }, + { + "epoch": 0.3345250255362615, + "grad_norm": 1.4779020547866821, + "learning_rate": 1e-06, + "loss": 0.0301, + "step": 1965 + }, + { + "epoch": 0.33469526727953697, + "grad_norm": 1.4697922468185425, + "learning_rate": 1e-06, + "loss": 0.0234, + "step": 1966 + }, + { + "epoch": 0.33486550902281237, + "grad_norm": 1.5022433996200562, + "learning_rate": 1e-06, + "loss": 0.0189, + "step": 1967 + }, + { + "epoch": 0.3350357507660878, + "grad_norm": 1.5426301956176758, + "learning_rate": 1e-06, + "loss": 0.0221, + "step": 1968 + }, + { + "epoch": 0.3352059925093633, + "grad_norm": 1.9386368989944458, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 1969 + }, + { + "epoch": 0.33537623425263874, + "grad_norm": 1.8475077152252197, + "learning_rate": 1e-06, + "loss": 0.0194, + "step": 1970 + }, + { + "epoch": 0.3355464759959142, + "grad_norm": 1.6919015645980835, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 1971 + }, + { + "epoch": 0.33571671773918965, + "grad_norm": 1.3801902532577515, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 1972 + }, + { + "epoch": 0.3358869594824651, + "grad_norm": 1.8841779232025146, + "learning_rate": 1e-06, + "loss": 0.0382, + "step": 1973 + }, + { + "epoch": 0.33605720122574056, + "grad_norm": 1.4700556993484497, + "learning_rate": 1e-06, + "loss": 0.0222, + "step": 1974 + }, + { + "epoch": 0.336227442969016, + "grad_norm": 1.4399062395095825, + "learning_rate": 1e-06, + "loss": 0.0233, + "step": 1975 + }, + { + "epoch": 0.3363976847122915, + "grad_norm": 1.2615573406219482, + "learning_rate": 1e-06, + "loss": 0.0149, + "step": 1976 + }, + { + "epoch": 0.3365679264555669, + "grad_norm": 1.5497113466262817, + "learning_rate": 1e-06, + "loss": 0.0234, + "step": 1977 + }, + { + "epoch": 0.33673816819884234, + "grad_norm": 1.5951131582260132, + "learning_rate": 1e-06, + "loss": 0.0189, + "step": 1978 + }, + { + "epoch": 0.3369084099421178, + "grad_norm": 1.7845149040222168, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 1979 + }, + { + "epoch": 0.33707865168539325, + "grad_norm": 1.608124017715454, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 1980 + }, + { + "epoch": 0.3372488934286687, + "grad_norm": 1.1790499687194824, + "learning_rate": 1e-06, + "loss": 0.0163, + "step": 1981 + }, + { + "epoch": 0.33741913517194416, + "grad_norm": 1.192211627960205, + "learning_rate": 1e-06, + "loss": 0.0215, + "step": 1982 + }, + { + "epoch": 0.3375893769152196, + "grad_norm": 1.3114814758300781, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 1983 + }, + { + "epoch": 0.3377596186584951, + "grad_norm": 1.1542383432388306, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 1984 + }, + { + "epoch": 0.33792986040177053, + "grad_norm": 1.2791386842727661, + "learning_rate": 1e-06, + "loss": 0.0155, + "step": 1985 + }, + { + "epoch": 0.338100102145046, + "grad_norm": 1.5622920989990234, + "learning_rate": 1e-06, + "loss": 0.0191, + "step": 1986 + }, + { + "epoch": 0.3382703438883214, + "grad_norm": 1.3029829263687134, + "learning_rate": 1e-06, + "loss": 0.0188, + "step": 1987 + }, + { + "epoch": 0.33844058563159685, + "grad_norm": 1.5938515663146973, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1988 + }, + { + "epoch": 0.3386108273748723, + "grad_norm": 1.4531913995742798, + "learning_rate": 1e-06, + "loss": 0.0189, + "step": 1989 + }, + { + "epoch": 0.33878106911814776, + "grad_norm": 1.8602267503738403, + "learning_rate": 1e-06, + "loss": 0.021, + "step": 1990 + }, + { + "epoch": 0.3389513108614232, + "grad_norm": 1.388229489326477, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 1991 + }, + { + "epoch": 0.3391215526046987, + "grad_norm": 1.3557769060134888, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 1992 + }, + { + "epoch": 0.33929179434797413, + "grad_norm": 1.5094513893127441, + "learning_rate": 1e-06, + "loss": 0.0186, + "step": 1993 + }, + { + "epoch": 0.3394620360912496, + "grad_norm": 1.5209040641784668, + "learning_rate": 1e-06, + "loss": 0.0234, + "step": 1994 + }, + { + "epoch": 0.33963227783452504, + "grad_norm": 2.0958139896392822, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 1995 + }, + { + "epoch": 0.3398025195778005, + "grad_norm": 1.7948811054229736, + "learning_rate": 1e-06, + "loss": 0.0195, + "step": 1996 + }, + { + "epoch": 0.3399727613210759, + "grad_norm": 1.6881929636001587, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 1997 + }, + { + "epoch": 0.34014300306435136, + "grad_norm": 1.1193768978118896, + "learning_rate": 1e-06, + "loss": 0.0157, + "step": 1998 + }, + { + "epoch": 0.3403132448076268, + "grad_norm": 2.3566431999206543, + "learning_rate": 1e-06, + "loss": 0.0536, + "step": 1999 + }, + { + "epoch": 0.34048348655090227, + "grad_norm": 1.4195446968078613, + "learning_rate": 1e-06, + "loss": 0.0161, + "step": 2000 + }, + { + "epoch": 0.34048348655090227, + "eval_loss": 0.27453577518463135, + "eval_runtime": 21.0487, + "eval_samples_per_second": 14.253, + "eval_steps_per_second": 0.38, + "step": 2000 + }, + { + "epoch": 0.3406537282941777, + "grad_norm": 1.61619234085083, + "learning_rate": 1e-06, + "loss": 0.0193, + "step": 2001 + }, + { + "epoch": 0.3408239700374532, + "grad_norm": 1.2892980575561523, + "learning_rate": 1e-06, + "loss": 0.0173, + "step": 2002 + }, + { + "epoch": 0.34099421178072864, + "grad_norm": 1.3476999998092651, + "learning_rate": 1e-06, + "loss": 0.0194, + "step": 2003 + }, + { + "epoch": 0.3411644535240041, + "grad_norm": 1.6146029233932495, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 2004 + }, + { + "epoch": 0.34133469526727955, + "grad_norm": 1.2237156629562378, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 2005 + }, + { + "epoch": 0.341504937010555, + "grad_norm": 1.6599453687667847, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 2006 + }, + { + "epoch": 0.34167517875383047, + "grad_norm": 4.322335720062256, + "learning_rate": 1e-06, + "loss": 0.0276, + "step": 2007 + }, + { + "epoch": 0.34184542049710587, + "grad_norm": 1.4886971712112427, + "learning_rate": 1e-06, + "loss": 0.0307, + "step": 2008 + }, + { + "epoch": 0.3420156622403813, + "grad_norm": 1.504891037940979, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 2009 + }, + { + "epoch": 0.3421859039836568, + "grad_norm": 1.3250677585601807, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 2010 + }, + { + "epoch": 0.34235614572693224, + "grad_norm": 1.3704164028167725, + "learning_rate": 1e-06, + "loss": 0.0175, + "step": 2011 + }, + { + "epoch": 0.3425263874702077, + "grad_norm": 1.270844578742981, + "learning_rate": 1e-06, + "loss": 0.0188, + "step": 2012 + }, + { + "epoch": 0.34269662921348315, + "grad_norm": 1.2841618061065674, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 2013 + }, + { + "epoch": 0.3428668709567586, + "grad_norm": 2.089329242706299, + "learning_rate": 1e-06, + "loss": 0.0362, + "step": 2014 + }, + { + "epoch": 0.34303711270003406, + "grad_norm": 2.0143887996673584, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 2015 + }, + { + "epoch": 0.3432073544433095, + "grad_norm": 1.7846940755844116, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 2016 + }, + { + "epoch": 0.343377596186585, + "grad_norm": 1.5742454528808594, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 2017 + }, + { + "epoch": 0.3435478379298604, + "grad_norm": 1.1249366998672485, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 2018 + }, + { + "epoch": 0.34371807967313583, + "grad_norm": 1.3523153066635132, + "learning_rate": 1e-06, + "loss": 0.0234, + "step": 2019 + }, + { + "epoch": 0.3438883214164113, + "grad_norm": 1.611651062965393, + "learning_rate": 1e-06, + "loss": 0.0179, + "step": 2020 + }, + { + "epoch": 0.34405856315968675, + "grad_norm": 1.476097583770752, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 2021 + }, + { + "epoch": 0.3442288049029622, + "grad_norm": 1.2952362298965454, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 2022 + }, + { + "epoch": 0.34439904664623766, + "grad_norm": 1.3099634647369385, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2023 + }, + { + "epoch": 0.3445692883895131, + "grad_norm": 1.4053715467453003, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 2024 + }, + { + "epoch": 0.3447395301327886, + "grad_norm": 1.7385302782058716, + "learning_rate": 1e-06, + "loss": 0.022, + "step": 2025 + }, + { + "epoch": 0.34490977187606403, + "grad_norm": 1.365162968635559, + "learning_rate": 1e-06, + "loss": 0.0171, + "step": 2026 + }, + { + "epoch": 0.3450800136193395, + "grad_norm": 1.7785168886184692, + "learning_rate": 1e-06, + "loss": 0.0202, + "step": 2027 + }, + { + "epoch": 0.3452502553626149, + "grad_norm": 1.1140302419662476, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 2028 + }, + { + "epoch": 0.34542049710589035, + "grad_norm": 1.29301917552948, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 2029 + }, + { + "epoch": 0.3455907388491658, + "grad_norm": 1.9448950290679932, + "learning_rate": 1e-06, + "loss": 0.0285, + "step": 2030 + }, + { + "epoch": 0.34576098059244126, + "grad_norm": 1.5493046045303345, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 2031 + }, + { + "epoch": 0.3459312223357167, + "grad_norm": 1.561806321144104, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 2032 + }, + { + "epoch": 0.34610146407899217, + "grad_norm": 1.5834723711013794, + "learning_rate": 1e-06, + "loss": 0.0336, + "step": 2033 + }, + { + "epoch": 0.34627170582226763, + "grad_norm": 1.1366455554962158, + "learning_rate": 1e-06, + "loss": 0.0182, + "step": 2034 + }, + { + "epoch": 0.3464419475655431, + "grad_norm": 1.152408242225647, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 2035 + }, + { + "epoch": 0.34661218930881854, + "grad_norm": 1.2690989971160889, + "learning_rate": 1e-06, + "loss": 0.014, + "step": 2036 + }, + { + "epoch": 0.346782431052094, + "grad_norm": 1.0005087852478027, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2037 + }, + { + "epoch": 0.3469526727953694, + "grad_norm": 1.1189073324203491, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 2038 + }, + { + "epoch": 0.34712291453864486, + "grad_norm": 1.6101375818252563, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 2039 + }, + { + "epoch": 0.3472931562819203, + "grad_norm": 1.4370726346969604, + "learning_rate": 1e-06, + "loss": 0.0193, + "step": 2040 + }, + { + "epoch": 0.34746339802519577, + "grad_norm": 1.6193634271621704, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 2041 + }, + { + "epoch": 0.3476336397684712, + "grad_norm": 1.6027393341064453, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 2042 + }, + { + "epoch": 0.3478038815117467, + "grad_norm": 1.3268249034881592, + "learning_rate": 1e-06, + "loss": 0.0194, + "step": 2043 + }, + { + "epoch": 0.34797412325502214, + "grad_norm": 1.6447010040283203, + "learning_rate": 1e-06, + "loss": 0.016, + "step": 2044 + }, + { + "epoch": 0.3481443649982976, + "grad_norm": 1.3561208248138428, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 2045 + }, + { + "epoch": 0.34831460674157305, + "grad_norm": 1.494844913482666, + "learning_rate": 1e-06, + "loss": 0.0288, + "step": 2046 + }, + { + "epoch": 0.3484848484848485, + "grad_norm": 1.2338416576385498, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 2047 + }, + { + "epoch": 0.3486550902281239, + "grad_norm": 1.3532445430755615, + "learning_rate": 1e-06, + "loss": 0.0208, + "step": 2048 + }, + { + "epoch": 0.34882533197139937, + "grad_norm": 1.3149325847625732, + "learning_rate": 1e-06, + "loss": 0.02, + "step": 2049 + }, + { + "epoch": 0.3489955737146748, + "grad_norm": 1.2256778478622437, + "learning_rate": 1e-06, + "loss": 0.0188, + "step": 2050 + }, + { + "epoch": 0.3491658154579503, + "grad_norm": 1.6831778287887573, + "learning_rate": 1e-06, + "loss": 0.0164, + "step": 2051 + }, + { + "epoch": 0.34933605720122574, + "grad_norm": 1.2866861820220947, + "learning_rate": 1e-06, + "loss": 0.0197, + "step": 2052 + }, + { + "epoch": 0.3495062989445012, + "grad_norm": 1.3010034561157227, + "learning_rate": 1e-06, + "loss": 0.0218, + "step": 2053 + }, + { + "epoch": 0.34967654068777665, + "grad_norm": 1.460037350654602, + "learning_rate": 1e-06, + "loss": 0.021, + "step": 2054 + }, + { + "epoch": 0.3498467824310521, + "grad_norm": 1.2930116653442383, + "learning_rate": 1e-06, + "loss": 0.0189, + "step": 2055 + }, + { + "epoch": 0.35001702417432756, + "grad_norm": 1.4779536724090576, + "learning_rate": 1e-06, + "loss": 0.0195, + "step": 2056 + }, + { + "epoch": 0.350187265917603, + "grad_norm": 1.1668182611465454, + "learning_rate": 1e-06, + "loss": 0.0168, + "step": 2057 + }, + { + "epoch": 0.3503575076608784, + "grad_norm": 1.5473500490188599, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 2058 + }, + { + "epoch": 0.3505277494041539, + "grad_norm": 1.5026984214782715, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 2059 + }, + { + "epoch": 0.35069799114742933, + "grad_norm": 1.4979209899902344, + "learning_rate": 1e-06, + "loss": 0.0192, + "step": 2060 + }, + { + "epoch": 0.3508682328907048, + "grad_norm": 1.6945948600769043, + "learning_rate": 1e-06, + "loss": 0.0215, + "step": 2061 + }, + { + "epoch": 0.35103847463398025, + "grad_norm": 1.4094072580337524, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 2062 + }, + { + "epoch": 0.3512087163772557, + "grad_norm": 1.5919791460037231, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 2063 + }, + { + "epoch": 0.35137895812053116, + "grad_norm": 1.1257418394088745, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 2064 + }, + { + "epoch": 0.3515491998638066, + "grad_norm": 1.4687309265136719, + "learning_rate": 1e-06, + "loss": 0.0161, + "step": 2065 + }, + { + "epoch": 0.3517194416070821, + "grad_norm": 1.5042623281478882, + "learning_rate": 1e-06, + "loss": 0.0206, + "step": 2066 + }, + { + "epoch": 0.35188968335035753, + "grad_norm": 1.5676453113555908, + "learning_rate": 1e-06, + "loss": 0.0191, + "step": 2067 + }, + { + "epoch": 0.352059925093633, + "grad_norm": 1.368834137916565, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 2068 + }, + { + "epoch": 0.3522301668369084, + "grad_norm": 1.1905148029327393, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 2069 + }, + { + "epoch": 0.35240040858018384, + "grad_norm": 1.3578230142593384, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 2070 + }, + { + "epoch": 0.3525706503234593, + "grad_norm": 1.6715102195739746, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 2071 + }, + { + "epoch": 0.35274089206673476, + "grad_norm": 1.2917587757110596, + "learning_rate": 1e-06, + "loss": 0.0231, + "step": 2072 + }, + { + "epoch": 0.3529111338100102, + "grad_norm": 1.7715479135513306, + "learning_rate": 1e-06, + "loss": 0.0151, + "step": 2073 + }, + { + "epoch": 0.35308137555328567, + "grad_norm": 1.1060619354248047, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 2074 + }, + { + "epoch": 0.3532516172965611, + "grad_norm": 2.2120845317840576, + "learning_rate": 1e-06, + "loss": 0.0233, + "step": 2075 + }, + { + "epoch": 0.3534218590398366, + "grad_norm": 1.4091529846191406, + "learning_rate": 1e-06, + "loss": 0.027, + "step": 2076 + }, + { + "epoch": 0.35359210078311204, + "grad_norm": 1.3464224338531494, + "learning_rate": 1e-06, + "loss": 0.0183, + "step": 2077 + }, + { + "epoch": 0.3537623425263875, + "grad_norm": 1.6281346082687378, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 2078 + }, + { + "epoch": 0.3539325842696629, + "grad_norm": 1.2976142168045044, + "learning_rate": 1e-06, + "loss": 0.0157, + "step": 2079 + }, + { + "epoch": 0.35410282601293835, + "grad_norm": 1.4444133043289185, + "learning_rate": 1e-06, + "loss": 0.0186, + "step": 2080 + }, + { + "epoch": 0.3542730677562138, + "grad_norm": 1.5797218084335327, + "learning_rate": 1e-06, + "loss": 0.0183, + "step": 2081 + }, + { + "epoch": 0.35444330949948927, + "grad_norm": 1.4098842144012451, + "learning_rate": 1e-06, + "loss": 0.0211, + "step": 2082 + }, + { + "epoch": 0.3546135512427647, + "grad_norm": 1.34474778175354, + "learning_rate": 1e-06, + "loss": 0.0163, + "step": 2083 + }, + { + "epoch": 0.3547837929860402, + "grad_norm": 1.0983127355575562, + "learning_rate": 1e-06, + "loss": 0.0163, + "step": 2084 + }, + { + "epoch": 0.35495403472931564, + "grad_norm": 1.8528269529342651, + "learning_rate": 1e-06, + "loss": 0.0211, + "step": 2085 + }, + { + "epoch": 0.3551242764725911, + "grad_norm": 1.4813817739486694, + "learning_rate": 1e-06, + "loss": 0.0148, + "step": 2086 + }, + { + "epoch": 0.35529451821586655, + "grad_norm": 1.7108453512191772, + "learning_rate": 1e-06, + "loss": 0.0221, + "step": 2087 + }, + { + "epoch": 0.355464759959142, + "grad_norm": 1.5667725801467896, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 2088 + }, + { + "epoch": 0.3556350017024174, + "grad_norm": 1.5677913427352905, + "learning_rate": 1e-06, + "loss": 0.0221, + "step": 2089 + }, + { + "epoch": 0.35580524344569286, + "grad_norm": 0.9959020614624023, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 2090 + }, + { + "epoch": 0.3559754851889683, + "grad_norm": 1.4427545070648193, + "learning_rate": 1e-06, + "loss": 0.018, + "step": 2091 + }, + { + "epoch": 0.3561457269322438, + "grad_norm": 1.162044644355774, + "learning_rate": 1e-06, + "loss": 0.0193, + "step": 2092 + }, + { + "epoch": 0.35631596867551923, + "grad_norm": 1.3956273794174194, + "learning_rate": 1e-06, + "loss": 0.0219, + "step": 2093 + }, + { + "epoch": 0.3564862104187947, + "grad_norm": 1.4350906610488892, + "learning_rate": 1e-06, + "loss": 0.0204, + "step": 2094 + }, + { + "epoch": 0.35665645216207015, + "grad_norm": 1.2975064516067505, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 2095 + }, + { + "epoch": 0.3568266939053456, + "grad_norm": 2.6082117557525635, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 2096 + }, + { + "epoch": 0.35699693564862106, + "grad_norm": 1.2373225688934326, + "learning_rate": 1e-06, + "loss": 0.0158, + "step": 2097 + }, + { + "epoch": 0.3571671773918965, + "grad_norm": 1.4951549768447876, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 2098 + }, + { + "epoch": 0.3573374191351719, + "grad_norm": 1.339722752571106, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 2099 + }, + { + "epoch": 0.3575076608784474, + "grad_norm": 1.1775256395339966, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 2100 + }, + { + "epoch": 0.35767790262172283, + "grad_norm": 1.7469888925552368, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 2101 + }, + { + "epoch": 0.3578481443649983, + "grad_norm": 1.4239497184753418, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2102 + }, + { + "epoch": 0.35801838610827375, + "grad_norm": 1.3292256593704224, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 2103 + }, + { + "epoch": 0.3581886278515492, + "grad_norm": 1.902808427810669, + "learning_rate": 1e-06, + "loss": 0.0274, + "step": 2104 + }, + { + "epoch": 0.35835886959482466, + "grad_norm": 1.5837531089782715, + "learning_rate": 1e-06, + "loss": 0.0222, + "step": 2105 + }, + { + "epoch": 0.3585291113381001, + "grad_norm": 1.3720051050186157, + "learning_rate": 1e-06, + "loss": 0.0207, + "step": 2106 + }, + { + "epoch": 0.35869935308137557, + "grad_norm": 1.7276686429977417, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 2107 + }, + { + "epoch": 0.35886959482465103, + "grad_norm": 1.7324738502502441, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 2108 + }, + { + "epoch": 0.35903983656792643, + "grad_norm": 1.5119861364364624, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 2109 + }, + { + "epoch": 0.3592100783112019, + "grad_norm": 1.672288417816162, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 2110 + }, + { + "epoch": 0.35938032005447734, + "grad_norm": 3.63991379737854, + "learning_rate": 1e-06, + "loss": 0.0522, + "step": 2111 + }, + { + "epoch": 0.3595505617977528, + "grad_norm": 1.3657325506210327, + "learning_rate": 1e-06, + "loss": 0.0194, + "step": 2112 + }, + { + "epoch": 0.35972080354102826, + "grad_norm": 1.196782112121582, + "learning_rate": 1e-06, + "loss": 0.0213, + "step": 2113 + }, + { + "epoch": 0.3598910452843037, + "grad_norm": 1.248388648033142, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 2114 + }, + { + "epoch": 0.36006128702757917, + "grad_norm": 1.267297387123108, + "learning_rate": 1e-06, + "loss": 0.0182, + "step": 2115 + }, + { + "epoch": 0.3602315287708546, + "grad_norm": 1.4637548923492432, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 2116 + }, + { + "epoch": 0.3604017705141301, + "grad_norm": 1.5229331254959106, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 2117 + }, + { + "epoch": 0.36057201225740554, + "grad_norm": 1.511386752128601, + "learning_rate": 1e-06, + "loss": 0.0193, + "step": 2118 + }, + { + "epoch": 0.36074225400068094, + "grad_norm": 1.6698894500732422, + "learning_rate": 1e-06, + "loss": 0.0272, + "step": 2119 + }, + { + "epoch": 0.3609124957439564, + "grad_norm": 2.0563039779663086, + "learning_rate": 1e-06, + "loss": 0.0234, + "step": 2120 + }, + { + "epoch": 0.36108273748723185, + "grad_norm": 2.0152013301849365, + "learning_rate": 1e-06, + "loss": 0.0169, + "step": 2121 + }, + { + "epoch": 0.3612529792305073, + "grad_norm": 1.2313683032989502, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 2122 + }, + { + "epoch": 0.36142322097378277, + "grad_norm": 1.654374361038208, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 2123 + }, + { + "epoch": 0.3615934627170582, + "grad_norm": 1.3135615587234497, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 2124 + }, + { + "epoch": 0.3617637044603337, + "grad_norm": 1.285529613494873, + "learning_rate": 1e-06, + "loss": 0.0205, + "step": 2125 + }, + { + "epoch": 0.36193394620360914, + "grad_norm": 1.392561912536621, + "learning_rate": 1e-06, + "loss": 0.0174, + "step": 2126 + }, + { + "epoch": 0.3621041879468846, + "grad_norm": 1.8578132390975952, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 2127 + }, + { + "epoch": 0.36227442969016005, + "grad_norm": 1.368056297302246, + "learning_rate": 1e-06, + "loss": 0.0215, + "step": 2128 + }, + { + "epoch": 0.3624446714334355, + "grad_norm": 1.1833387613296509, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2129 + }, + { + "epoch": 0.3626149131767109, + "grad_norm": 1.4502012729644775, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2130 + }, + { + "epoch": 0.36278515491998636, + "grad_norm": 1.522831678390503, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 2131 + }, + { + "epoch": 0.3629553966632618, + "grad_norm": 1.4958367347717285, + "learning_rate": 1e-06, + "loss": 0.0168, + "step": 2132 + }, + { + "epoch": 0.3631256384065373, + "grad_norm": 1.1259353160858154, + "learning_rate": 1e-06, + "loss": 0.0168, + "step": 2133 + }, + { + "epoch": 0.36329588014981273, + "grad_norm": 2.0761754512786865, + "learning_rate": 1e-06, + "loss": 0.0311, + "step": 2134 + }, + { + "epoch": 0.3634661218930882, + "grad_norm": 1.4105058908462524, + "learning_rate": 1e-06, + "loss": 0.0211, + "step": 2135 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 1.2559282779693604, + "learning_rate": 1e-06, + "loss": 0.0168, + "step": 2136 + }, + { + "epoch": 0.3638066053796391, + "grad_norm": 1.5729042291641235, + "learning_rate": 1e-06, + "loss": 0.0292, + "step": 2137 + }, + { + "epoch": 0.36397684712291456, + "grad_norm": 1.7037521600723267, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 2138 + }, + { + "epoch": 0.36414708886619, + "grad_norm": 1.3164565563201904, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 2139 + }, + { + "epoch": 0.3643173306094654, + "grad_norm": 1.6225502490997314, + "learning_rate": 1e-06, + "loss": 0.0195, + "step": 2140 + }, + { + "epoch": 0.3644875723527409, + "grad_norm": 1.2100952863693237, + "learning_rate": 1e-06, + "loss": 0.0165, + "step": 2141 + }, + { + "epoch": 0.36465781409601633, + "grad_norm": 1.2184245586395264, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 2142 + }, + { + "epoch": 0.3648280558392918, + "grad_norm": 1.6735187768936157, + "learning_rate": 1e-06, + "loss": 0.0227, + "step": 2143 + }, + { + "epoch": 0.36499829758256724, + "grad_norm": 1.3809057474136353, + "learning_rate": 1e-06, + "loss": 0.0211, + "step": 2144 + }, + { + "epoch": 0.3651685393258427, + "grad_norm": 1.2712934017181396, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2145 + }, + { + "epoch": 0.36533878106911816, + "grad_norm": 1.435024619102478, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 2146 + }, + { + "epoch": 0.3655090228123936, + "grad_norm": 1.45426344871521, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 2147 + }, + { + "epoch": 0.36567926455566907, + "grad_norm": 1.9181464910507202, + "learning_rate": 1e-06, + "loss": 0.0285, + "step": 2148 + }, + { + "epoch": 0.3658495062989445, + "grad_norm": 1.4425302743911743, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 2149 + }, + { + "epoch": 0.36601974804221993, + "grad_norm": 1.8058220148086548, + "learning_rate": 1e-06, + "loss": 0.0204, + "step": 2150 + }, + { + "epoch": 0.3661899897854954, + "grad_norm": 1.3857768774032593, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 2151 + }, + { + "epoch": 0.36636023152877084, + "grad_norm": 1.4955687522888184, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 2152 + }, + { + "epoch": 0.3665304732720463, + "grad_norm": 1.6004747152328491, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 2153 + }, + { + "epoch": 0.36670071501532175, + "grad_norm": 1.3243181705474854, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 2154 + }, + { + "epoch": 0.3668709567585972, + "grad_norm": 1.8293629884719849, + "learning_rate": 1e-06, + "loss": 0.0236, + "step": 2155 + }, + { + "epoch": 0.36704119850187267, + "grad_norm": 1.2558562755584717, + "learning_rate": 1e-06, + "loss": 0.0163, + "step": 2156 + }, + { + "epoch": 0.3672114402451481, + "grad_norm": 1.2633802890777588, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 2157 + }, + { + "epoch": 0.3673816819884236, + "grad_norm": 1.4996285438537598, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 2158 + }, + { + "epoch": 0.36755192373169904, + "grad_norm": 1.1702207326889038, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 2159 + }, + { + "epoch": 0.36772216547497444, + "grad_norm": 1.2386761903762817, + "learning_rate": 1e-06, + "loss": 0.0167, + "step": 2160 + }, + { + "epoch": 0.3678924072182499, + "grad_norm": 1.7300621271133423, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 2161 + }, + { + "epoch": 0.36806264896152535, + "grad_norm": 1.3940644264221191, + "learning_rate": 1e-06, + "loss": 0.022, + "step": 2162 + }, + { + "epoch": 0.3682328907048008, + "grad_norm": 1.4714255332946777, + "learning_rate": 1e-06, + "loss": 0.019, + "step": 2163 + }, + { + "epoch": 0.36840313244807626, + "grad_norm": 1.099491834640503, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 2164 + }, + { + "epoch": 0.3685733741913517, + "grad_norm": 1.4436107873916626, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 2165 + }, + { + "epoch": 0.3687436159346272, + "grad_norm": 1.4421783685684204, + "learning_rate": 1e-06, + "loss": 0.0158, + "step": 2166 + }, + { + "epoch": 0.36891385767790263, + "grad_norm": 1.221912145614624, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2167 + }, + { + "epoch": 0.3690840994211781, + "grad_norm": 1.3377691507339478, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 2168 + }, + { + "epoch": 0.36925434116445355, + "grad_norm": 1.5522762537002563, + "learning_rate": 1e-06, + "loss": 0.0217, + "step": 2169 + }, + { + "epoch": 0.36942458290772895, + "grad_norm": 1.452081561088562, + "learning_rate": 1e-06, + "loss": 0.0174, + "step": 2170 + }, + { + "epoch": 0.3695948246510044, + "grad_norm": 1.293049931526184, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 2171 + }, + { + "epoch": 0.36976506639427986, + "grad_norm": 1.621479868888855, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 2172 + }, + { + "epoch": 0.3699353081375553, + "grad_norm": 1.6249638795852661, + "learning_rate": 1e-06, + "loss": 0.0203, + "step": 2173 + }, + { + "epoch": 0.3701055498808308, + "grad_norm": 1.5299254655838013, + "learning_rate": 1e-06, + "loss": 0.0205, + "step": 2174 + }, + { + "epoch": 0.37027579162410623, + "grad_norm": 1.5919313430786133, + "learning_rate": 1e-06, + "loss": 0.0191, + "step": 2175 + }, + { + "epoch": 0.3704460333673817, + "grad_norm": 1.651296615600586, + "learning_rate": 1e-06, + "loss": 0.0235, + "step": 2176 + }, + { + "epoch": 0.37061627511065715, + "grad_norm": 1.3494011163711548, + "learning_rate": 1e-06, + "loss": 0.0202, + "step": 2177 + }, + { + "epoch": 0.3707865168539326, + "grad_norm": 1.4936208724975586, + "learning_rate": 1e-06, + "loss": 0.0174, + "step": 2178 + }, + { + "epoch": 0.37095675859720806, + "grad_norm": 1.3437141180038452, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 2179 + }, + { + "epoch": 0.37112700034048346, + "grad_norm": 1.137948751449585, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 2180 + }, + { + "epoch": 0.3712972420837589, + "grad_norm": 1.409578800201416, + "learning_rate": 1e-06, + "loss": 0.0187, + "step": 2181 + }, + { + "epoch": 0.3714674838270344, + "grad_norm": 1.4696379899978638, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 2182 + }, + { + "epoch": 0.37163772557030983, + "grad_norm": 1.8238104581832886, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 2183 + }, + { + "epoch": 0.3718079673135853, + "grad_norm": 1.643831729888916, + "learning_rate": 1e-06, + "loss": 0.0176, + "step": 2184 + }, + { + "epoch": 0.37197820905686074, + "grad_norm": 1.536838173866272, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 2185 + }, + { + "epoch": 0.3721484508001362, + "grad_norm": 1.572147011756897, + "learning_rate": 1e-06, + "loss": 0.0185, + "step": 2186 + }, + { + "epoch": 0.37231869254341166, + "grad_norm": 1.0589184761047363, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2187 + }, + { + "epoch": 0.3724889342866871, + "grad_norm": 2.404597043991089, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 2188 + }, + { + "epoch": 0.37265917602996257, + "grad_norm": 1.3876649141311646, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2189 + }, + { + "epoch": 0.372829417773238, + "grad_norm": 1.3182275295257568, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2190 + }, + { + "epoch": 0.3729996595165134, + "grad_norm": 1.2899410724639893, + "learning_rate": 1e-06, + "loss": 0.0233, + "step": 2191 + }, + { + "epoch": 0.3731699012597889, + "grad_norm": 1.5908676385879517, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 2192 + }, + { + "epoch": 0.37334014300306434, + "grad_norm": 1.2450308799743652, + "learning_rate": 1e-06, + "loss": 0.018, + "step": 2193 + }, + { + "epoch": 0.3735103847463398, + "grad_norm": 1.247621774673462, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 2194 + }, + { + "epoch": 0.37368062648961525, + "grad_norm": 1.1101661920547485, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 2195 + }, + { + "epoch": 0.3738508682328907, + "grad_norm": 1.9619181156158447, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 2196 + }, + { + "epoch": 0.37402110997616617, + "grad_norm": 1.3228130340576172, + "learning_rate": 1e-06, + "loss": 0.0272, + "step": 2197 + }, + { + "epoch": 0.3741913517194416, + "grad_norm": 1.2315998077392578, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2198 + }, + { + "epoch": 0.3743615934627171, + "grad_norm": 1.2987167835235596, + "learning_rate": 1e-06, + "loss": 0.0151, + "step": 2199 + }, + { + "epoch": 0.37453183520599254, + "grad_norm": 1.2436332702636719, + "learning_rate": 1e-06, + "loss": 0.0175, + "step": 2200 + }, + { + "epoch": 0.37470207694926794, + "grad_norm": 1.6267950534820557, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 2201 + }, + { + "epoch": 0.3748723186925434, + "grad_norm": 1.679490327835083, + "learning_rate": 1e-06, + "loss": 0.0191, + "step": 2202 + }, + { + "epoch": 0.37504256043581885, + "grad_norm": 1.6547735929489136, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 2203 + }, + { + "epoch": 0.3752128021790943, + "grad_norm": 1.4193183183670044, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 2204 + }, + { + "epoch": 0.37538304392236976, + "grad_norm": 1.39267098903656, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 2205 + }, + { + "epoch": 0.3755532856656452, + "grad_norm": 1.2222113609313965, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 2206 + }, + { + "epoch": 0.3757235274089207, + "grad_norm": 1.31162428855896, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 2207 + }, + { + "epoch": 0.37589376915219613, + "grad_norm": 1.1816409826278687, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 2208 + }, + { + "epoch": 0.3760640108954716, + "grad_norm": 1.3292574882507324, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 2209 + }, + { + "epoch": 0.37623425263874705, + "grad_norm": 1.1428711414337158, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 2210 + }, + { + "epoch": 0.37640449438202245, + "grad_norm": 1.3439229726791382, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 2211 + }, + { + "epoch": 0.3765747361252979, + "grad_norm": 1.5189659595489502, + "learning_rate": 1e-06, + "loss": 0.0231, + "step": 2212 + }, + { + "epoch": 0.37674497786857336, + "grad_norm": 1.3098925352096558, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 2213 + }, + { + "epoch": 0.3769152196118488, + "grad_norm": 1.3993158340454102, + "learning_rate": 1e-06, + "loss": 0.0225, + "step": 2214 + }, + { + "epoch": 0.3770854613551243, + "grad_norm": 1.4516868591308594, + "learning_rate": 1e-06, + "loss": 0.0204, + "step": 2215 + }, + { + "epoch": 0.37725570309839973, + "grad_norm": 1.124986171722412, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 2216 + }, + { + "epoch": 0.3774259448416752, + "grad_norm": 1.4332947731018066, + "learning_rate": 1e-06, + "loss": 0.023, + "step": 2217 + }, + { + "epoch": 0.37759618658495064, + "grad_norm": 1.2420907020568848, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 2218 + }, + { + "epoch": 0.3777664283282261, + "grad_norm": 1.521194577217102, + "learning_rate": 1e-06, + "loss": 0.0173, + "step": 2219 + }, + { + "epoch": 0.37793667007150156, + "grad_norm": 1.1087653636932373, + "learning_rate": 1e-06, + "loss": 0.0148, + "step": 2220 + }, + { + "epoch": 0.37810691181477696, + "grad_norm": 1.2441989183425903, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2221 + }, + { + "epoch": 0.3782771535580524, + "grad_norm": 1.500753402709961, + "learning_rate": 1e-06, + "loss": 0.0223, + "step": 2222 + }, + { + "epoch": 0.37844739530132787, + "grad_norm": 1.2610942125320435, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 2223 + }, + { + "epoch": 0.37861763704460333, + "grad_norm": 1.7212361097335815, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 2224 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 5.513698101043701, + "learning_rate": 1e-06, + "loss": 0.0828, + "step": 2225 + }, + { + "epoch": 0.37895812053115424, + "grad_norm": 1.726049542427063, + "learning_rate": 1e-06, + "loss": 0.018, + "step": 2226 + }, + { + "epoch": 0.3791283622744297, + "grad_norm": 1.936808466911316, + "learning_rate": 1e-06, + "loss": 0.0195, + "step": 2227 + }, + { + "epoch": 0.37929860401770515, + "grad_norm": 1.879072666168213, + "learning_rate": 1e-06, + "loss": 0.0226, + "step": 2228 + }, + { + "epoch": 0.3794688457609806, + "grad_norm": 1.6716440916061401, + "learning_rate": 1e-06, + "loss": 0.0294, + "step": 2229 + }, + { + "epoch": 0.37963908750425607, + "grad_norm": 1.3907865285873413, + "learning_rate": 1e-06, + "loss": 0.0205, + "step": 2230 + }, + { + "epoch": 0.37980932924753147, + "grad_norm": 1.4366625547409058, + "learning_rate": 1e-06, + "loss": 0.022, + "step": 2231 + }, + { + "epoch": 0.3799795709908069, + "grad_norm": 1.3679766654968262, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2232 + }, + { + "epoch": 0.3801498127340824, + "grad_norm": 1.318230390548706, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 2233 + }, + { + "epoch": 0.38032005447735784, + "grad_norm": 1.0809030532836914, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 2234 + }, + { + "epoch": 0.3804902962206333, + "grad_norm": 1.47706937789917, + "learning_rate": 1e-06, + "loss": 0.0191, + "step": 2235 + }, + { + "epoch": 0.38066053796390875, + "grad_norm": 1.1768239736557007, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 2236 + }, + { + "epoch": 0.3808307797071842, + "grad_norm": 1.2666648626327515, + "learning_rate": 1e-06, + "loss": 0.0175, + "step": 2237 + }, + { + "epoch": 0.38100102145045966, + "grad_norm": 1.3947467803955078, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 2238 + }, + { + "epoch": 0.3811712631937351, + "grad_norm": 1.308648943901062, + "learning_rate": 1e-06, + "loss": 0.0174, + "step": 2239 + }, + { + "epoch": 0.3813415049370106, + "grad_norm": 1.5134379863739014, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 2240 + }, + { + "epoch": 0.381511746680286, + "grad_norm": 1.6946591138839722, + "learning_rate": 1e-06, + "loss": 0.0219, + "step": 2241 + }, + { + "epoch": 0.38168198842356144, + "grad_norm": 1.2524915933609009, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 2242 + }, + { + "epoch": 0.3818522301668369, + "grad_norm": 1.4766790866851807, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 2243 + }, + { + "epoch": 0.38202247191011235, + "grad_norm": 1.1624605655670166, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 2244 + }, + { + "epoch": 0.3821927136533878, + "grad_norm": 1.6786658763885498, + "learning_rate": 1e-06, + "loss": 0.019, + "step": 2245 + }, + { + "epoch": 0.38236295539666326, + "grad_norm": 1.1682196855545044, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 2246 + }, + { + "epoch": 0.3825331971399387, + "grad_norm": 1.470716953277588, + "learning_rate": 1e-06, + "loss": 0.016, + "step": 2247 + }, + { + "epoch": 0.3827034388832142, + "grad_norm": 1.309889793395996, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 2248 + }, + { + "epoch": 0.38287368062648963, + "grad_norm": 1.0708969831466675, + "learning_rate": 1e-06, + "loss": 0.0138, + "step": 2249 + }, + { + "epoch": 0.3830439223697651, + "grad_norm": 1.1626148223876953, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 2250 + }, + { + "epoch": 0.38321416411304055, + "grad_norm": 1.4477424621582031, + "learning_rate": 1e-06, + "loss": 0.0168, + "step": 2251 + }, + { + "epoch": 0.38338440585631595, + "grad_norm": 1.3030651807785034, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2252 + }, + { + "epoch": 0.3835546475995914, + "grad_norm": 1.7228481769561768, + "learning_rate": 1e-06, + "loss": 0.0206, + "step": 2253 + }, + { + "epoch": 0.38372488934286686, + "grad_norm": 1.1536338329315186, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 2254 + }, + { + "epoch": 0.3838951310861423, + "grad_norm": 1.347286581993103, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 2255 + }, + { + "epoch": 0.3840653728294178, + "grad_norm": 1.1855061054229736, + "learning_rate": 1e-06, + "loss": 0.0186, + "step": 2256 + }, + { + "epoch": 0.38423561457269323, + "grad_norm": 1.283276915550232, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 2257 + }, + { + "epoch": 0.3844058563159687, + "grad_norm": 1.4037493467330933, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 2258 + }, + { + "epoch": 0.38457609805924414, + "grad_norm": 1.4328378438949585, + "learning_rate": 1e-06, + "loss": 0.022, + "step": 2259 + }, + { + "epoch": 0.3847463398025196, + "grad_norm": 1.7087554931640625, + "learning_rate": 1e-06, + "loss": 0.02, + "step": 2260 + }, + { + "epoch": 0.38491658154579506, + "grad_norm": 1.0174990892410278, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 2261 + }, + { + "epoch": 0.38508682328907046, + "grad_norm": 1.4762239456176758, + "learning_rate": 1e-06, + "loss": 0.0168, + "step": 2262 + }, + { + "epoch": 0.3852570650323459, + "grad_norm": 1.040967345237732, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 2263 + }, + { + "epoch": 0.38542730677562137, + "grad_norm": 1.2842293977737427, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 2264 + }, + { + "epoch": 0.3855975485188968, + "grad_norm": 1.1241962909698486, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 2265 + }, + { + "epoch": 0.3857677902621723, + "grad_norm": 1.4264283180236816, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 2266 + }, + { + "epoch": 0.38593803200544774, + "grad_norm": 1.4640333652496338, + "learning_rate": 1e-06, + "loss": 0.0226, + "step": 2267 + }, + { + "epoch": 0.3861082737487232, + "grad_norm": 1.8459786176681519, + "learning_rate": 1e-06, + "loss": 0.0183, + "step": 2268 + }, + { + "epoch": 0.38627851549199865, + "grad_norm": 1.619437575340271, + "learning_rate": 1e-06, + "loss": 0.0322, + "step": 2269 + }, + { + "epoch": 0.3864487572352741, + "grad_norm": 1.2061455249786377, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 2270 + }, + { + "epoch": 0.38661899897854957, + "grad_norm": 2.0068230628967285, + "learning_rate": 1e-06, + "loss": 0.0161, + "step": 2271 + }, + { + "epoch": 0.38678924072182497, + "grad_norm": 1.225205659866333, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 2272 + }, + { + "epoch": 0.3869594824651004, + "grad_norm": 1.4834448099136353, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2273 + }, + { + "epoch": 0.3871297242083759, + "grad_norm": 1.3730425834655762, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 2274 + }, + { + "epoch": 0.38729996595165134, + "grad_norm": 1.3718072175979614, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 2275 + }, + { + "epoch": 0.3874702076949268, + "grad_norm": 1.5890707969665527, + "learning_rate": 1e-06, + "loss": 0.018, + "step": 2276 + }, + { + "epoch": 0.38764044943820225, + "grad_norm": 1.437637209892273, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 2277 + }, + { + "epoch": 0.3878106911814777, + "grad_norm": 1.2185838222503662, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 2278 + }, + { + "epoch": 0.38798093292475316, + "grad_norm": 1.7744706869125366, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 2279 + }, + { + "epoch": 0.3881511746680286, + "grad_norm": 1.0997908115386963, + "learning_rate": 1e-06, + "loss": 0.0128, + "step": 2280 + }, + { + "epoch": 0.3883214164113041, + "grad_norm": 1.7149277925491333, + "learning_rate": 1e-06, + "loss": 0.0189, + "step": 2281 + }, + { + "epoch": 0.3884916581545795, + "grad_norm": 4.9614338874816895, + "learning_rate": 1e-06, + "loss": 0.0606, + "step": 2282 + }, + { + "epoch": 0.38866189989785493, + "grad_norm": 1.2618991136550903, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 2283 + }, + { + "epoch": 0.3888321416411304, + "grad_norm": 1.7077900171279907, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 2284 + }, + { + "epoch": 0.38900238338440585, + "grad_norm": 1.330156683921814, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2285 + }, + { + "epoch": 0.3891726251276813, + "grad_norm": 1.5688471794128418, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 2286 + }, + { + "epoch": 0.38934286687095676, + "grad_norm": 1.4680328369140625, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 2287 + }, + { + "epoch": 0.3895131086142322, + "grad_norm": 1.4984068870544434, + "learning_rate": 1e-06, + "loss": 0.0226, + "step": 2288 + }, + { + "epoch": 0.3896833503575077, + "grad_norm": 1.8151462078094482, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 2289 + }, + { + "epoch": 0.38985359210078313, + "grad_norm": 1.1406170129776, + "learning_rate": 1e-06, + "loss": 0.0151, + "step": 2290 + }, + { + "epoch": 0.3900238338440586, + "grad_norm": 1.425500512123108, + "learning_rate": 1e-06, + "loss": 0.022, + "step": 2291 + }, + { + "epoch": 0.390194075587334, + "grad_norm": 1.5256712436676025, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 2292 + }, + { + "epoch": 0.39036431733060944, + "grad_norm": 1.2895482778549194, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 2293 + }, + { + "epoch": 0.3905345590738849, + "grad_norm": 1.3082205057144165, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 2294 + }, + { + "epoch": 0.39070480081716036, + "grad_norm": 1.8497793674468994, + "learning_rate": 1e-06, + "loss": 0.0213, + "step": 2295 + }, + { + "epoch": 0.3908750425604358, + "grad_norm": 1.3272291421890259, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 2296 + }, + { + "epoch": 0.39104528430371127, + "grad_norm": 1.159996747970581, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 2297 + }, + { + "epoch": 0.39121552604698673, + "grad_norm": 1.58273184299469, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 2298 + }, + { + "epoch": 0.3913857677902622, + "grad_norm": 1.2419120073318481, + "learning_rate": 1e-06, + "loss": 0.0163, + "step": 2299 + }, + { + "epoch": 0.39155600953353764, + "grad_norm": 1.4475661516189575, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 2300 + }, + { + "epoch": 0.3917262512768131, + "grad_norm": 1.270992636680603, + "learning_rate": 1e-06, + "loss": 0.0209, + "step": 2301 + }, + { + "epoch": 0.3918964930200885, + "grad_norm": 1.5242903232574463, + "learning_rate": 1e-06, + "loss": 0.0181, + "step": 2302 + }, + { + "epoch": 0.39206673476336396, + "grad_norm": 1.34809410572052, + "learning_rate": 1e-06, + "loss": 0.0148, + "step": 2303 + }, + { + "epoch": 0.3922369765066394, + "grad_norm": 1.6840362548828125, + "learning_rate": 1e-06, + "loss": 0.019, + "step": 2304 + }, + { + "epoch": 0.39240721824991487, + "grad_norm": 1.350677728652954, + "learning_rate": 1e-06, + "loss": 0.0169, + "step": 2305 + }, + { + "epoch": 0.3925774599931903, + "grad_norm": 1.4453243017196655, + "learning_rate": 1e-06, + "loss": 0.0215, + "step": 2306 + }, + { + "epoch": 0.3927477017364658, + "grad_norm": 1.3742339611053467, + "learning_rate": 1e-06, + "loss": 0.0211, + "step": 2307 + }, + { + "epoch": 0.39291794347974124, + "grad_norm": 1.471127986907959, + "learning_rate": 1e-06, + "loss": 0.0149, + "step": 2308 + }, + { + "epoch": 0.3930881852230167, + "grad_norm": 1.0254688262939453, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 2309 + }, + { + "epoch": 0.39325842696629215, + "grad_norm": 1.232068419456482, + "learning_rate": 1e-06, + "loss": 0.0158, + "step": 2310 + }, + { + "epoch": 0.3934286687095676, + "grad_norm": 1.4690749645233154, + "learning_rate": 1e-06, + "loss": 0.02, + "step": 2311 + }, + { + "epoch": 0.393598910452843, + "grad_norm": 1.1098586320877075, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 2312 + }, + { + "epoch": 0.39376915219611847, + "grad_norm": 1.5079904794692993, + "learning_rate": 1e-06, + "loss": 0.0203, + "step": 2313 + }, + { + "epoch": 0.3939393939393939, + "grad_norm": 0.9211795926094055, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 2314 + }, + { + "epoch": 0.3941096356826694, + "grad_norm": 1.6116832494735718, + "learning_rate": 1e-06, + "loss": 0.0187, + "step": 2315 + }, + { + "epoch": 0.39427987742594484, + "grad_norm": 1.5355298519134521, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 2316 + }, + { + "epoch": 0.3944501191692203, + "grad_norm": 1.1545178890228271, + "learning_rate": 1e-06, + "loss": 0.0164, + "step": 2317 + }, + { + "epoch": 0.39462036091249575, + "grad_norm": 1.8506436347961426, + "learning_rate": 1e-06, + "loss": 0.0176, + "step": 2318 + }, + { + "epoch": 0.3947906026557712, + "grad_norm": 2.0721189975738525, + "learning_rate": 1e-06, + "loss": 0.0176, + "step": 2319 + }, + { + "epoch": 0.39496084439904666, + "grad_norm": 1.3172030448913574, + "learning_rate": 1e-06, + "loss": 0.0173, + "step": 2320 + }, + { + "epoch": 0.3951310861423221, + "grad_norm": 1.3635270595550537, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 2321 + }, + { + "epoch": 0.3953013278855976, + "grad_norm": 1.1138346195220947, + "learning_rate": 1e-06, + "loss": 0.0189, + "step": 2322 + }, + { + "epoch": 0.395471569628873, + "grad_norm": 1.483282446861267, + "learning_rate": 1e-06, + "loss": 0.0185, + "step": 2323 + }, + { + "epoch": 0.39564181137214843, + "grad_norm": 1.542601466178894, + "learning_rate": 1e-06, + "loss": 0.023, + "step": 2324 + }, + { + "epoch": 0.3958120531154239, + "grad_norm": 1.5426020622253418, + "learning_rate": 1e-06, + "loss": 0.0302, + "step": 2325 + }, + { + "epoch": 0.39598229485869935, + "grad_norm": 1.148687720298767, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 2326 + }, + { + "epoch": 0.3961525366019748, + "grad_norm": 1.3703385591506958, + "learning_rate": 1e-06, + "loss": 0.0179, + "step": 2327 + }, + { + "epoch": 0.39632277834525026, + "grad_norm": 1.4654871225357056, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 2328 + }, + { + "epoch": 0.3964930200885257, + "grad_norm": 2.1327321529388428, + "learning_rate": 1e-06, + "loss": 0.028, + "step": 2329 + }, + { + "epoch": 0.3966632618318012, + "grad_norm": 1.7279863357543945, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 2330 + }, + { + "epoch": 0.39683350357507663, + "grad_norm": 1.1947439908981323, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 2331 + }, + { + "epoch": 0.3970037453183521, + "grad_norm": 1.0099948644638062, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 2332 + }, + { + "epoch": 0.3971739870616275, + "grad_norm": 1.3921846151351929, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 2333 + }, + { + "epoch": 0.39734422880490294, + "grad_norm": 1.3162715435028076, + "learning_rate": 1e-06, + "loss": 0.0158, + "step": 2334 + }, + { + "epoch": 0.3975144705481784, + "grad_norm": 1.7758504152297974, + "learning_rate": 1e-06, + "loss": 0.018, + "step": 2335 + }, + { + "epoch": 0.39768471229145386, + "grad_norm": 1.2216438055038452, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 2336 + }, + { + "epoch": 0.3978549540347293, + "grad_norm": 1.6880277395248413, + "learning_rate": 1e-06, + "loss": 0.0149, + "step": 2337 + }, + { + "epoch": 0.39802519577800477, + "grad_norm": 1.6719337701797485, + "learning_rate": 1e-06, + "loss": 0.0175, + "step": 2338 + }, + { + "epoch": 0.3981954375212802, + "grad_norm": 1.5387353897094727, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 2339 + }, + { + "epoch": 0.3983656792645557, + "grad_norm": 1.6653772592544556, + "learning_rate": 1e-06, + "loss": 0.0161, + "step": 2340 + }, + { + "epoch": 0.39853592100783114, + "grad_norm": 1.325972318649292, + "learning_rate": 1e-06, + "loss": 0.0202, + "step": 2341 + }, + { + "epoch": 0.3987061627511066, + "grad_norm": 1.40695321559906, + "learning_rate": 1e-06, + "loss": 0.0182, + "step": 2342 + }, + { + "epoch": 0.398876404494382, + "grad_norm": 1.6472898721694946, + "learning_rate": 1e-06, + "loss": 0.0167, + "step": 2343 + }, + { + "epoch": 0.39904664623765745, + "grad_norm": 1.2183064222335815, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 2344 + }, + { + "epoch": 0.3992168879809329, + "grad_norm": 1.193183183670044, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 2345 + }, + { + "epoch": 0.39938712972420837, + "grad_norm": 1.4041932821273804, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 2346 + }, + { + "epoch": 0.3995573714674838, + "grad_norm": 1.3237617015838623, + "learning_rate": 1e-06, + "loss": 0.0157, + "step": 2347 + }, + { + "epoch": 0.3997276132107593, + "grad_norm": 1.5075606107711792, + "learning_rate": 1e-06, + "loss": 0.0167, + "step": 2348 + }, + { + "epoch": 0.39989785495403474, + "grad_norm": 1.4453775882720947, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 2349 + }, + { + "epoch": 0.4000680966973102, + "grad_norm": 1.4384607076644897, + "learning_rate": 1e-06, + "loss": 0.0171, + "step": 2350 + }, + { + "epoch": 0.40023833844058565, + "grad_norm": 1.3807544708251953, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 2351 + }, + { + "epoch": 0.4004085801838611, + "grad_norm": 1.7280919551849365, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 2352 + }, + { + "epoch": 0.4005788219271365, + "grad_norm": 1.2561084032058716, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 2353 + }, + { + "epoch": 0.40074906367041196, + "grad_norm": 1.4211997985839844, + "learning_rate": 1e-06, + "loss": 0.0174, + "step": 2354 + }, + { + "epoch": 0.4009193054136874, + "grad_norm": 1.79752516746521, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 2355 + }, + { + "epoch": 0.4010895471569629, + "grad_norm": 1.746137022972107, + "learning_rate": 1e-06, + "loss": 0.0208, + "step": 2356 + }, + { + "epoch": 0.40125978890023833, + "grad_norm": 3.5860726833343506, + "learning_rate": 1e-06, + "loss": 0.0709, + "step": 2357 + }, + { + "epoch": 0.4014300306435138, + "grad_norm": 1.5278226137161255, + "learning_rate": 1e-06, + "loss": 0.0161, + "step": 2358 + }, + { + "epoch": 0.40160027238678925, + "grad_norm": 1.536190390586853, + "learning_rate": 1e-06, + "loss": 0.0217, + "step": 2359 + }, + { + "epoch": 0.4017705141300647, + "grad_norm": 1.5676721334457397, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 2360 + }, + { + "epoch": 0.40194075587334016, + "grad_norm": 1.004481554031372, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 2361 + }, + { + "epoch": 0.4021109976166156, + "grad_norm": 1.576690673828125, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 2362 + }, + { + "epoch": 0.402281239359891, + "grad_norm": 1.8953304290771484, + "learning_rate": 1e-06, + "loss": 0.0205, + "step": 2363 + }, + { + "epoch": 0.4024514811031665, + "grad_norm": 1.3143856525421143, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 2364 + }, + { + "epoch": 0.40262172284644193, + "grad_norm": 1.4957778453826904, + "learning_rate": 1e-06, + "loss": 0.0285, + "step": 2365 + }, + { + "epoch": 0.4027919645897174, + "grad_norm": 1.0621047019958496, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 2366 + }, + { + "epoch": 0.40296220633299284, + "grad_norm": 1.4584861993789673, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 2367 + }, + { + "epoch": 0.4031324480762683, + "grad_norm": 1.5141526460647583, + "learning_rate": 1e-06, + "loss": 0.0149, + "step": 2368 + }, + { + "epoch": 0.40330268981954376, + "grad_norm": 1.0880855321884155, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 2369 + }, + { + "epoch": 0.4034729315628192, + "grad_norm": 1.1734309196472168, + "learning_rate": 1e-06, + "loss": 0.0158, + "step": 2370 + }, + { + "epoch": 0.40364317330609467, + "grad_norm": 1.3398406505584717, + "learning_rate": 1e-06, + "loss": 0.0172, + "step": 2371 + }, + { + "epoch": 0.40381341504937013, + "grad_norm": 1.1609718799591064, + "learning_rate": 1e-06, + "loss": 0.0157, + "step": 2372 + }, + { + "epoch": 0.40398365679264553, + "grad_norm": 4.927487373352051, + "learning_rate": 1e-06, + "loss": 0.0723, + "step": 2373 + }, + { + "epoch": 0.404153898535921, + "grad_norm": 1.258873462677002, + "learning_rate": 1e-06, + "loss": 0.018, + "step": 2374 + }, + { + "epoch": 0.40432414027919644, + "grad_norm": 1.1295454502105713, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2375 + }, + { + "epoch": 0.4044943820224719, + "grad_norm": 1.213400959968567, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 2376 + }, + { + "epoch": 0.40466462376574736, + "grad_norm": 1.0463217496871948, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 2377 + }, + { + "epoch": 0.4048348655090228, + "grad_norm": 1.1880186796188354, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 2378 + }, + { + "epoch": 0.40500510725229827, + "grad_norm": 1.3919751644134521, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 2379 + }, + { + "epoch": 0.4051753489955737, + "grad_norm": 1.2092021703720093, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 2380 + }, + { + "epoch": 0.4053455907388492, + "grad_norm": 1.5249980688095093, + "learning_rate": 1e-06, + "loss": 0.016, + "step": 2381 + }, + { + "epoch": 0.40551583248212464, + "grad_norm": 1.3709051609039307, + "learning_rate": 1e-06, + "loss": 0.0167, + "step": 2382 + }, + { + "epoch": 0.4056860742254001, + "grad_norm": 1.3142911195755005, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2383 + }, + { + "epoch": 0.4058563159686755, + "grad_norm": 1.423117995262146, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 2384 + }, + { + "epoch": 0.40602655771195095, + "grad_norm": 1.7531969547271729, + "learning_rate": 1e-06, + "loss": 0.0164, + "step": 2385 + }, + { + "epoch": 0.4061967994552264, + "grad_norm": 1.1912846565246582, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 2386 + }, + { + "epoch": 0.40636704119850187, + "grad_norm": 1.3569473028182983, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 2387 + }, + { + "epoch": 0.4065372829417773, + "grad_norm": 1.3196473121643066, + "learning_rate": 1e-06, + "loss": 0.0183, + "step": 2388 + }, + { + "epoch": 0.4067075246850528, + "grad_norm": 1.8479810953140259, + "learning_rate": 1e-06, + "loss": 0.0224, + "step": 2389 + }, + { + "epoch": 0.40687776642832824, + "grad_norm": 1.048545479774475, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 2390 + }, + { + "epoch": 0.4070480081716037, + "grad_norm": 1.090034008026123, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 2391 + }, + { + "epoch": 0.40721824991487915, + "grad_norm": 1.0449838638305664, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 2392 + }, + { + "epoch": 0.4073884916581546, + "grad_norm": 1.3128901720046997, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 2393 + }, + { + "epoch": 0.40755873340143, + "grad_norm": 1.3702256679534912, + "learning_rate": 1e-06, + "loss": 0.0171, + "step": 2394 + }, + { + "epoch": 0.40772897514470546, + "grad_norm": 1.4796867370605469, + "learning_rate": 1e-06, + "loss": 0.0204, + "step": 2395 + }, + { + "epoch": 0.4078992168879809, + "grad_norm": 1.1662784814834595, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 2396 + }, + { + "epoch": 0.4080694586312564, + "grad_norm": 2.216245651245117, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 2397 + }, + { + "epoch": 0.40823970037453183, + "grad_norm": 1.5245473384857178, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 2398 + }, + { + "epoch": 0.4084099421178073, + "grad_norm": 1.6484607458114624, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 2399 + }, + { + "epoch": 0.40858018386108275, + "grad_norm": 1.326431155204773, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 2400 + }, + { + "epoch": 0.4087504256043582, + "grad_norm": 1.8550671339035034, + "learning_rate": 1e-06, + "loss": 0.0198, + "step": 2401 + }, + { + "epoch": 0.40892066734763366, + "grad_norm": 1.1143696308135986, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 2402 + }, + { + "epoch": 0.4090909090909091, + "grad_norm": 1.190245509147644, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 2403 + }, + { + "epoch": 0.4092611508341845, + "grad_norm": 1.6202400922775269, + "learning_rate": 1e-06, + "loss": 0.019, + "step": 2404 + }, + { + "epoch": 0.40943139257746, + "grad_norm": 1.4784173965454102, + "learning_rate": 1e-06, + "loss": 0.0213, + "step": 2405 + }, + { + "epoch": 0.40960163432073543, + "grad_norm": 1.2796027660369873, + "learning_rate": 1e-06, + "loss": 0.0161, + "step": 2406 + }, + { + "epoch": 0.4097718760640109, + "grad_norm": 1.0544267892837524, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 2407 + }, + { + "epoch": 0.40994211780728634, + "grad_norm": 1.6966776847839355, + "learning_rate": 1e-06, + "loss": 0.0223, + "step": 2408 + }, + { + "epoch": 0.4101123595505618, + "grad_norm": 1.3701565265655518, + "learning_rate": 1e-06, + "loss": 0.0169, + "step": 2409 + }, + { + "epoch": 0.41028260129383726, + "grad_norm": 1.4099042415618896, + "learning_rate": 1e-06, + "loss": 0.0157, + "step": 2410 + }, + { + "epoch": 0.4104528430371127, + "grad_norm": 1.2321785688400269, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 2411 + }, + { + "epoch": 0.41062308478038817, + "grad_norm": 1.3855117559432983, + "learning_rate": 1e-06, + "loss": 0.0216, + "step": 2412 + }, + { + "epoch": 0.4107933265236636, + "grad_norm": 1.1128058433532715, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 2413 + }, + { + "epoch": 0.410963568266939, + "grad_norm": 1.4113258123397827, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 2414 + }, + { + "epoch": 0.4111338100102145, + "grad_norm": 1.4246609210968018, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 2415 + }, + { + "epoch": 0.41130405175348994, + "grad_norm": 1.2959715127944946, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 2416 + }, + { + "epoch": 0.4114742934967654, + "grad_norm": 1.3845857381820679, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2417 + }, + { + "epoch": 0.41164453524004085, + "grad_norm": 1.1727241277694702, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 2418 + }, + { + "epoch": 0.4118147769833163, + "grad_norm": 1.3593071699142456, + "learning_rate": 1e-06, + "loss": 0.0189, + "step": 2419 + }, + { + "epoch": 0.41198501872659177, + "grad_norm": 1.650676965713501, + "learning_rate": 1e-06, + "loss": 0.0179, + "step": 2420 + }, + { + "epoch": 0.4121552604698672, + "grad_norm": 1.345276117324829, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 2421 + }, + { + "epoch": 0.4123255022131427, + "grad_norm": 1.3404996395111084, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 2422 + }, + { + "epoch": 0.41249574395641814, + "grad_norm": 1.4839191436767578, + "learning_rate": 1e-06, + "loss": 0.0176, + "step": 2423 + }, + { + "epoch": 0.41266598569969354, + "grad_norm": 1.3546916246414185, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 2424 + }, + { + "epoch": 0.412836227442969, + "grad_norm": 1.5656921863555908, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 2425 + }, + { + "epoch": 0.41300646918624445, + "grad_norm": 1.348214030265808, + "learning_rate": 1e-06, + "loss": 0.0189, + "step": 2426 + }, + { + "epoch": 0.4131767109295199, + "grad_norm": 1.2620658874511719, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 2427 + }, + { + "epoch": 0.41334695267279536, + "grad_norm": 1.1423382759094238, + "learning_rate": 1e-06, + "loss": 0.0155, + "step": 2428 + }, + { + "epoch": 0.4135171944160708, + "grad_norm": 1.179055094718933, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 2429 + }, + { + "epoch": 0.4136874361593463, + "grad_norm": 1.2674864530563354, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 2430 + }, + { + "epoch": 0.41385767790262173, + "grad_norm": 1.2309482097625732, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 2431 + }, + { + "epoch": 0.4140279196458972, + "grad_norm": 1.4680554866790771, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 2432 + }, + { + "epoch": 0.41419816138917265, + "grad_norm": 1.4172992706298828, + "learning_rate": 1e-06, + "loss": 0.021, + "step": 2433 + }, + { + "epoch": 0.41436840313244805, + "grad_norm": 1.537314534187317, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 2434 + }, + { + "epoch": 0.4145386448757235, + "grad_norm": 1.3401296138763428, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 2435 + }, + { + "epoch": 0.41470888661899896, + "grad_norm": 1.5095027685165405, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 2436 + }, + { + "epoch": 0.4148791283622744, + "grad_norm": 1.1366171836853027, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 2437 + }, + { + "epoch": 0.4150493701055499, + "grad_norm": 1.1384146213531494, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 2438 + }, + { + "epoch": 0.41521961184882533, + "grad_norm": 0.9028575420379639, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 2439 + }, + { + "epoch": 0.4153898535921008, + "grad_norm": 1.199471116065979, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 2440 + }, + { + "epoch": 0.41556009533537625, + "grad_norm": 1.3193436861038208, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 2441 + }, + { + "epoch": 0.4157303370786517, + "grad_norm": 1.3699177503585815, + "learning_rate": 1e-06, + "loss": 0.016, + "step": 2442 + }, + { + "epoch": 0.41590057882192716, + "grad_norm": 1.3745206594467163, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 2443 + }, + { + "epoch": 0.4160708205652026, + "grad_norm": 1.289341926574707, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 2444 + }, + { + "epoch": 0.416241062308478, + "grad_norm": 1.4632568359375, + "learning_rate": 1e-06, + "loss": 0.0169, + "step": 2445 + }, + { + "epoch": 0.4164113040517535, + "grad_norm": 1.4885531663894653, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 2446 + }, + { + "epoch": 0.41658154579502893, + "grad_norm": 1.2330007553100586, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 2447 + }, + { + "epoch": 0.4167517875383044, + "grad_norm": 2.102605104446411, + "learning_rate": 1e-06, + "loss": 0.0458, + "step": 2448 + }, + { + "epoch": 0.41692202928157984, + "grad_norm": 1.2682759761810303, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 2449 + }, + { + "epoch": 0.4170922710248553, + "grad_norm": 1.3345555067062378, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 2450 + }, + { + "epoch": 0.41726251276813076, + "grad_norm": 1.390181064605713, + "learning_rate": 1e-06, + "loss": 0.0203, + "step": 2451 + }, + { + "epoch": 0.4174327545114062, + "grad_norm": 1.3475115299224854, + "learning_rate": 1e-06, + "loss": 0.0199, + "step": 2452 + }, + { + "epoch": 0.41760299625468167, + "grad_norm": 1.5461567640304565, + "learning_rate": 1e-06, + "loss": 0.0179, + "step": 2453 + }, + { + "epoch": 0.4177732379979571, + "grad_norm": 1.3287405967712402, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 2454 + }, + { + "epoch": 0.4179434797412325, + "grad_norm": 1.3914709091186523, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 2455 + }, + { + "epoch": 0.418113721484508, + "grad_norm": 1.5743072032928467, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 2456 + }, + { + "epoch": 0.41828396322778344, + "grad_norm": 1.3226776123046875, + "learning_rate": 1e-06, + "loss": 0.0176, + "step": 2457 + }, + { + "epoch": 0.4184542049710589, + "grad_norm": 1.3625043630599976, + "learning_rate": 1e-06, + "loss": 0.0209, + "step": 2458 + }, + { + "epoch": 0.41862444671433435, + "grad_norm": 0.9081085920333862, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 2459 + }, + { + "epoch": 0.4187946884576098, + "grad_norm": 1.608961820602417, + "learning_rate": 1e-06, + "loss": 0.0182, + "step": 2460 + }, + { + "epoch": 0.41896493020088527, + "grad_norm": 1.2214875221252441, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 2461 + }, + { + "epoch": 0.4191351719441607, + "grad_norm": 1.1907516717910767, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 2462 + }, + { + "epoch": 0.4193054136874362, + "grad_norm": 1.0932464599609375, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 2463 + }, + { + "epoch": 0.41947565543071164, + "grad_norm": 1.2685447931289673, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 2464 + }, + { + "epoch": 0.41964589717398704, + "grad_norm": 1.4846066236495972, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 2465 + }, + { + "epoch": 0.4198161389172625, + "grad_norm": 1.1014434099197388, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2466 + }, + { + "epoch": 0.41998638066053795, + "grad_norm": 1.5888346433639526, + "learning_rate": 1e-06, + "loss": 0.0195, + "step": 2467 + }, + { + "epoch": 0.4201566224038134, + "grad_norm": 1.507070779800415, + "learning_rate": 1e-06, + "loss": 0.0171, + "step": 2468 + }, + { + "epoch": 0.42032686414708886, + "grad_norm": 1.330121636390686, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 2469 + }, + { + "epoch": 0.4204971058903643, + "grad_norm": 1.4843699932098389, + "learning_rate": 1e-06, + "loss": 0.0232, + "step": 2470 + }, + { + "epoch": 0.4206673476336398, + "grad_norm": 1.1940176486968994, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 2471 + }, + { + "epoch": 0.42083758937691523, + "grad_norm": 1.3040800094604492, + "learning_rate": 1e-06, + "loss": 0.0164, + "step": 2472 + }, + { + "epoch": 0.4210078311201907, + "grad_norm": 1.4341161251068115, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 2473 + }, + { + "epoch": 0.42117807286346615, + "grad_norm": 1.1169599294662476, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 2474 + }, + { + "epoch": 0.42134831460674155, + "grad_norm": 1.2243717908859253, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 2475 + }, + { + "epoch": 0.421518556350017, + "grad_norm": 1.5474375486373901, + "learning_rate": 1e-06, + "loss": 0.0188, + "step": 2476 + }, + { + "epoch": 0.42168879809329246, + "grad_norm": 1.487414002418518, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 2477 + }, + { + "epoch": 0.4218590398365679, + "grad_norm": 1.5315251350402832, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2478 + }, + { + "epoch": 0.4220292815798434, + "grad_norm": 4.279947280883789, + "learning_rate": 1e-06, + "loss": 0.0537, + "step": 2479 + }, + { + "epoch": 0.42219952332311883, + "grad_norm": 1.396422266960144, + "learning_rate": 1e-06, + "loss": 0.0219, + "step": 2480 + }, + { + "epoch": 0.4223697650663943, + "grad_norm": 1.583313226699829, + "learning_rate": 1e-06, + "loss": 0.0171, + "step": 2481 + }, + { + "epoch": 0.42254000680966974, + "grad_norm": 1.1320841312408447, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 2482 + }, + { + "epoch": 0.4227102485529452, + "grad_norm": 1.1860630512237549, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 2483 + }, + { + "epoch": 0.42288049029622066, + "grad_norm": 1.2758294343948364, + "learning_rate": 1e-06, + "loss": 0.0168, + "step": 2484 + }, + { + "epoch": 0.42305073203949606, + "grad_norm": 1.084784984588623, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 2485 + }, + { + "epoch": 0.4232209737827715, + "grad_norm": 1.0579909086227417, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 2486 + }, + { + "epoch": 0.42339121552604697, + "grad_norm": 1.5031887292861938, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 2487 + }, + { + "epoch": 0.42356145726932243, + "grad_norm": 1.2599958181381226, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2488 + }, + { + "epoch": 0.4237316990125979, + "grad_norm": 1.1589609384536743, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 2489 + }, + { + "epoch": 0.42390194075587334, + "grad_norm": 1.139289140701294, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 2490 + }, + { + "epoch": 0.4240721824991488, + "grad_norm": 1.1821390390396118, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 2491 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 1.2557470798492432, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 2492 + }, + { + "epoch": 0.4244126659856997, + "grad_norm": 1.3615427017211914, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 2493 + }, + { + "epoch": 0.42458290772897517, + "grad_norm": 1.4991490840911865, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 2494 + }, + { + "epoch": 0.42475314947225057, + "grad_norm": 1.1081904172897339, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 2495 + }, + { + "epoch": 0.424923391215526, + "grad_norm": 1.3824467658996582, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 2496 + }, + { + "epoch": 0.4250936329588015, + "grad_norm": 1.4526976346969604, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 2497 + }, + { + "epoch": 0.42526387470207694, + "grad_norm": 1.1925878524780273, + "learning_rate": 1e-06, + "loss": 0.014, + "step": 2498 + }, + { + "epoch": 0.4254341164453524, + "grad_norm": 1.7539995908737183, + "learning_rate": 1e-06, + "loss": 0.0295, + "step": 2499 + }, + { + "epoch": 0.42560435818862785, + "grad_norm": 1.2335952520370483, + "learning_rate": 1e-06, + "loss": 0.016, + "step": 2500 + }, + { + "epoch": 0.4257745999319033, + "grad_norm": 1.1143063306808472, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 2501 + }, + { + "epoch": 0.42594484167517876, + "grad_norm": 1.0467058420181274, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 2502 + }, + { + "epoch": 0.4261150834184542, + "grad_norm": 1.118386149406433, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 2503 + }, + { + "epoch": 0.4262853251617297, + "grad_norm": 1.4646353721618652, + "learning_rate": 1e-06, + "loss": 0.0148, + "step": 2504 + }, + { + "epoch": 0.42645556690500513, + "grad_norm": 1.2250324487686157, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 2505 + }, + { + "epoch": 0.42662580864828054, + "grad_norm": 1.757466435432434, + "learning_rate": 1e-06, + "loss": 0.0219, + "step": 2506 + }, + { + "epoch": 0.426796050391556, + "grad_norm": 1.2650184631347656, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 2507 + }, + { + "epoch": 0.42696629213483145, + "grad_norm": 1.2739180326461792, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 2508 + }, + { + "epoch": 0.4271365338781069, + "grad_norm": 1.3488551378250122, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 2509 + }, + { + "epoch": 0.42730677562138236, + "grad_norm": 1.4845232963562012, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 2510 + }, + { + "epoch": 0.4274770173646578, + "grad_norm": 1.5139720439910889, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 2511 + }, + { + "epoch": 0.4276472591079333, + "grad_norm": 1.3799965381622314, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 2512 + }, + { + "epoch": 0.42781750085120873, + "grad_norm": 1.0673723220825195, + "learning_rate": 1e-06, + "loss": 0.0093, + "step": 2513 + }, + { + "epoch": 0.4279877425944842, + "grad_norm": 1.4284578561782837, + "learning_rate": 1e-06, + "loss": 0.0185, + "step": 2514 + }, + { + "epoch": 0.42815798433775965, + "grad_norm": 0.9315198659896851, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 2515 + }, + { + "epoch": 0.42832822608103505, + "grad_norm": 1.5946029424667358, + "learning_rate": 1e-06, + "loss": 0.0183, + "step": 2516 + }, + { + "epoch": 0.4284984678243105, + "grad_norm": 1.6120253801345825, + "learning_rate": 1e-06, + "loss": 0.0203, + "step": 2517 + }, + { + "epoch": 0.42866870956758596, + "grad_norm": 1.2495324611663818, + "learning_rate": 1e-06, + "loss": 0.0138, + "step": 2518 + }, + { + "epoch": 0.4288389513108614, + "grad_norm": 1.4058576822280884, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 2519 + }, + { + "epoch": 0.4290091930541369, + "grad_norm": 1.436219573020935, + "learning_rate": 1e-06, + "loss": 0.0151, + "step": 2520 + }, + { + "epoch": 0.42917943479741233, + "grad_norm": 1.9257749319076538, + "learning_rate": 1e-06, + "loss": 0.0199, + "step": 2521 + }, + { + "epoch": 0.4293496765406878, + "grad_norm": 1.195271611213684, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 2522 + }, + { + "epoch": 0.42951991828396324, + "grad_norm": 2.288235664367676, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 2523 + }, + { + "epoch": 0.4296901600272387, + "grad_norm": 1.20006263256073, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 2524 + }, + { + "epoch": 0.42986040177051416, + "grad_norm": 1.8085074424743652, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 2525 + }, + { + "epoch": 0.43003064351378956, + "grad_norm": 1.3439559936523438, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2526 + }, + { + "epoch": 0.430200885257065, + "grad_norm": 1.2932634353637695, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2527 + }, + { + "epoch": 0.43037112700034047, + "grad_norm": 1.3444640636444092, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 2528 + }, + { + "epoch": 0.4305413687436159, + "grad_norm": 1.4149837493896484, + "learning_rate": 1e-06, + "loss": 0.0173, + "step": 2529 + }, + { + "epoch": 0.4307116104868914, + "grad_norm": 1.1546047925949097, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 2530 + }, + { + "epoch": 0.43088185223016684, + "grad_norm": 1.6338205337524414, + "learning_rate": 1e-06, + "loss": 0.0204, + "step": 2531 + }, + { + "epoch": 0.4310520939734423, + "grad_norm": 1.8675543069839478, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 2532 + }, + { + "epoch": 0.43122233571671775, + "grad_norm": 1.2775949239730835, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 2533 + }, + { + "epoch": 0.4313925774599932, + "grad_norm": 1.2907770872116089, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 2534 + }, + { + "epoch": 0.43156281920326867, + "grad_norm": 1.519423484802246, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 2535 + }, + { + "epoch": 0.43173306094654407, + "grad_norm": 1.4875999689102173, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 2536 + }, + { + "epoch": 0.4319033026898195, + "grad_norm": 1.0595074892044067, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 2537 + }, + { + "epoch": 0.432073544433095, + "grad_norm": 1.461539626121521, + "learning_rate": 1e-06, + "loss": 0.0192, + "step": 2538 + }, + { + "epoch": 0.43224378617637044, + "grad_norm": 1.2863537073135376, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 2539 + }, + { + "epoch": 0.4324140279196459, + "grad_norm": 1.447748064994812, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 2540 + }, + { + "epoch": 0.43258426966292135, + "grad_norm": 1.0105310678482056, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 2541 + }, + { + "epoch": 0.4327545114061968, + "grad_norm": 1.367261290550232, + "learning_rate": 1e-06, + "loss": 0.0161, + "step": 2542 + }, + { + "epoch": 0.43292475314947226, + "grad_norm": 1.2634276151657104, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 2543 + }, + { + "epoch": 0.4330949948927477, + "grad_norm": 1.2317240238189697, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 2544 + }, + { + "epoch": 0.4332652366360232, + "grad_norm": 1.2238634824752808, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 2545 + }, + { + "epoch": 0.4334354783792986, + "grad_norm": 1.3249047994613647, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 2546 + }, + { + "epoch": 0.43360572012257403, + "grad_norm": 1.049531102180481, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 2547 + }, + { + "epoch": 0.4337759618658495, + "grad_norm": 1.623108983039856, + "learning_rate": 1e-06, + "loss": 0.0204, + "step": 2548 + }, + { + "epoch": 0.43394620360912495, + "grad_norm": 1.2683053016662598, + "learning_rate": 1e-06, + "loss": 0.0173, + "step": 2549 + }, + { + "epoch": 0.4341164453524004, + "grad_norm": 1.6255425214767456, + "learning_rate": 1e-06, + "loss": 0.0163, + "step": 2550 + }, + { + "epoch": 0.43428668709567586, + "grad_norm": 1.1563000679016113, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 2551 + }, + { + "epoch": 0.4344569288389513, + "grad_norm": 1.3939322233200073, + "learning_rate": 1e-06, + "loss": 0.02, + "step": 2552 + }, + { + "epoch": 0.4346271705822268, + "grad_norm": 1.2949892282485962, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 2553 + }, + { + "epoch": 0.43479741232550223, + "grad_norm": 1.4135187864303589, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 2554 + }, + { + "epoch": 0.4349676540687777, + "grad_norm": 1.1989250183105469, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 2555 + }, + { + "epoch": 0.4351378958120531, + "grad_norm": 1.180803894996643, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 2556 + }, + { + "epoch": 0.43530813755532854, + "grad_norm": 0.8694745302200317, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 2557 + }, + { + "epoch": 0.435478379298604, + "grad_norm": 1.489912986755371, + "learning_rate": 1e-06, + "loss": 0.0158, + "step": 2558 + }, + { + "epoch": 0.43564862104187946, + "grad_norm": 1.1703554391860962, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 2559 + }, + { + "epoch": 0.4358188627851549, + "grad_norm": 1.3352656364440918, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 2560 + }, + { + "epoch": 0.43598910452843037, + "grad_norm": 2.0167603492736816, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 2561 + }, + { + "epoch": 0.43615934627170583, + "grad_norm": 1.5169191360473633, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 2562 + }, + { + "epoch": 0.4363295880149813, + "grad_norm": 1.1504062414169312, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 2563 + }, + { + "epoch": 0.43649982975825674, + "grad_norm": 1.3212984800338745, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 2564 + }, + { + "epoch": 0.4366700715015322, + "grad_norm": 1.149043321609497, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 2565 + }, + { + "epoch": 0.43684031324480765, + "grad_norm": 1.2085059881210327, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 2566 + }, + { + "epoch": 0.43701055498808306, + "grad_norm": 1.2474228143692017, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 2567 + }, + { + "epoch": 0.4371807967313585, + "grad_norm": 1.2562228441238403, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 2568 + }, + { + "epoch": 0.43735103847463397, + "grad_norm": 1.5228354930877686, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 2569 + }, + { + "epoch": 0.4375212802179094, + "grad_norm": 1.6287275552749634, + "learning_rate": 1e-06, + "loss": 0.0179, + "step": 2570 + }, + { + "epoch": 0.4376915219611849, + "grad_norm": 1.3553366661071777, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 2571 + }, + { + "epoch": 0.43786176370446034, + "grad_norm": 1.4978290796279907, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 2572 + }, + { + "epoch": 0.4380320054477358, + "grad_norm": 1.4585553407669067, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2573 + }, + { + "epoch": 0.43820224719101125, + "grad_norm": 1.175268530845642, + "learning_rate": 1e-06, + "loss": 0.0151, + "step": 2574 + }, + { + "epoch": 0.4383724889342867, + "grad_norm": 1.2472771406173706, + "learning_rate": 1e-06, + "loss": 0.0172, + "step": 2575 + }, + { + "epoch": 0.43854273067756216, + "grad_norm": 1.109674096107483, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 2576 + }, + { + "epoch": 0.43871297242083757, + "grad_norm": 0.974772572517395, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 2577 + }, + { + "epoch": 0.438883214164113, + "grad_norm": 1.1081093549728394, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 2578 + }, + { + "epoch": 0.4390534559073885, + "grad_norm": 1.1672372817993164, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 2579 + }, + { + "epoch": 0.43922369765066394, + "grad_norm": 1.2188431024551392, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 2580 + }, + { + "epoch": 0.4393939393939394, + "grad_norm": 1.0513874292373657, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 2581 + }, + { + "epoch": 0.43956418113721485, + "grad_norm": 1.2327890396118164, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 2582 + }, + { + "epoch": 0.4397344228804903, + "grad_norm": 2.443394422531128, + "learning_rate": 1e-06, + "loss": 0.0233, + "step": 2583 + }, + { + "epoch": 0.43990466462376576, + "grad_norm": 1.189650535583496, + "learning_rate": 1e-06, + "loss": 0.0211, + "step": 2584 + }, + { + "epoch": 0.4400749063670412, + "grad_norm": 1.22821843624115, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 2585 + }, + { + "epoch": 0.4402451481103167, + "grad_norm": 1.223922848701477, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 2586 + }, + { + "epoch": 0.4404153898535921, + "grad_norm": 1.3211407661437988, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 2587 + }, + { + "epoch": 0.44058563159686753, + "grad_norm": 1.2087403535842896, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 2588 + }, + { + "epoch": 0.440755873340143, + "grad_norm": 1.6649106740951538, + "learning_rate": 1e-06, + "loss": 0.0225, + "step": 2589 + }, + { + "epoch": 0.44092611508341845, + "grad_norm": 1.302064061164856, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 2590 + }, + { + "epoch": 0.4410963568266939, + "grad_norm": 1.2485460042953491, + "learning_rate": 1e-06, + "loss": 0.014, + "step": 2591 + }, + { + "epoch": 0.44126659856996936, + "grad_norm": 1.1560567617416382, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 2592 + }, + { + "epoch": 0.4414368403132448, + "grad_norm": 1.436091661453247, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 2593 + }, + { + "epoch": 0.4416070820565203, + "grad_norm": 1.1101467609405518, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 2594 + }, + { + "epoch": 0.44177732379979573, + "grad_norm": 1.2781106233596802, + "learning_rate": 1e-06, + "loss": 0.0217, + "step": 2595 + }, + { + "epoch": 0.4419475655430712, + "grad_norm": 1.1032869815826416, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 2596 + }, + { + "epoch": 0.4421178072863466, + "grad_norm": 1.4114645719528198, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 2597 + }, + { + "epoch": 0.44228804902962204, + "grad_norm": 1.307861566543579, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 2598 + }, + { + "epoch": 0.4424582907728975, + "grad_norm": 1.3546010255813599, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 2599 + }, + { + "epoch": 0.44262853251617296, + "grad_norm": 1.1858961582183838, + "learning_rate": 1e-06, + "loss": 0.0181, + "step": 2600 + }, + { + "epoch": 0.4427987742594484, + "grad_norm": 1.7051303386688232, + "learning_rate": 1e-06, + "loss": 0.0155, + "step": 2601 + }, + { + "epoch": 0.44296901600272387, + "grad_norm": 1.2881419658660889, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 2602 + }, + { + "epoch": 0.4431392577459993, + "grad_norm": 1.2599304914474487, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 2603 + }, + { + "epoch": 0.4433094994892748, + "grad_norm": 1.2731688022613525, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2604 + }, + { + "epoch": 0.44347974123255024, + "grad_norm": 1.2084245681762695, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 2605 + }, + { + "epoch": 0.4436499829758257, + "grad_norm": 1.2445482015609741, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 2606 + }, + { + "epoch": 0.4438202247191011, + "grad_norm": 1.8854734897613525, + "learning_rate": 1e-06, + "loss": 0.0193, + "step": 2607 + }, + { + "epoch": 0.44399046646237655, + "grad_norm": 1.4180908203125, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2608 + }, + { + "epoch": 0.444160708205652, + "grad_norm": 1.7596639394760132, + "learning_rate": 1e-06, + "loss": 0.0215, + "step": 2609 + }, + { + "epoch": 0.44433094994892747, + "grad_norm": 1.8333537578582764, + "learning_rate": 1e-06, + "loss": 0.0171, + "step": 2610 + }, + { + "epoch": 0.4445011916922029, + "grad_norm": 1.5384416580200195, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 2611 + }, + { + "epoch": 0.4446714334354784, + "grad_norm": 1.6996245384216309, + "learning_rate": 1e-06, + "loss": 0.0296, + "step": 2612 + }, + { + "epoch": 0.44484167517875384, + "grad_norm": 1.2037662267684937, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 2613 + }, + { + "epoch": 0.4450119169220293, + "grad_norm": 1.2231796979904175, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 2614 + }, + { + "epoch": 0.44518215866530475, + "grad_norm": 1.1597298383712769, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 2615 + }, + { + "epoch": 0.4453524004085802, + "grad_norm": 1.1221568584442139, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 2616 + }, + { + "epoch": 0.4455226421518556, + "grad_norm": 3.223909378051758, + "learning_rate": 1e-06, + "loss": 0.0231, + "step": 2617 + }, + { + "epoch": 0.44569288389513106, + "grad_norm": 1.6201415061950684, + "learning_rate": 1e-06, + "loss": 0.0174, + "step": 2618 + }, + { + "epoch": 0.4458631256384065, + "grad_norm": 1.248681902885437, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 2619 + }, + { + "epoch": 0.446033367381682, + "grad_norm": 1.1186449527740479, + "learning_rate": 1e-06, + "loss": 0.0093, + "step": 2620 + }, + { + "epoch": 0.44620360912495743, + "grad_norm": 2.316514253616333, + "learning_rate": 1e-06, + "loss": 0.0223, + "step": 2621 + }, + { + "epoch": 0.4463738508682329, + "grad_norm": 1.214141607284546, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 2622 + }, + { + "epoch": 0.44654409261150835, + "grad_norm": 1.4486373662948608, + "learning_rate": 1e-06, + "loss": 0.0292, + "step": 2623 + }, + { + "epoch": 0.4467143343547838, + "grad_norm": 1.2913445234298706, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 2624 + }, + { + "epoch": 0.44688457609805926, + "grad_norm": 1.2563960552215576, + "learning_rate": 1e-06, + "loss": 0.0222, + "step": 2625 + }, + { + "epoch": 0.4470548178413347, + "grad_norm": 1.4476072788238525, + "learning_rate": 1e-06, + "loss": 0.0188, + "step": 2626 + }, + { + "epoch": 0.4472250595846102, + "grad_norm": 1.5944808721542358, + "learning_rate": 1e-06, + "loss": 0.0161, + "step": 2627 + }, + { + "epoch": 0.4473953013278856, + "grad_norm": 1.1989669799804688, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 2628 + }, + { + "epoch": 0.44756554307116103, + "grad_norm": 1.5682440996170044, + "learning_rate": 1e-06, + "loss": 0.021, + "step": 2629 + }, + { + "epoch": 0.4477357848144365, + "grad_norm": 1.1739362478256226, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 2630 + }, + { + "epoch": 0.44790602655771194, + "grad_norm": 1.2502210140228271, + "learning_rate": 1e-06, + "loss": 0.019, + "step": 2631 + }, + { + "epoch": 0.4480762683009874, + "grad_norm": 1.1751649379730225, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 2632 + }, + { + "epoch": 0.44824651004426286, + "grad_norm": 1.1609954833984375, + "learning_rate": 1e-06, + "loss": 0.0186, + "step": 2633 + }, + { + "epoch": 0.4484167517875383, + "grad_norm": 1.4711358547210693, + "learning_rate": 1e-06, + "loss": 0.0197, + "step": 2634 + }, + { + "epoch": 0.44858699353081377, + "grad_norm": 1.154991865158081, + "learning_rate": 1e-06, + "loss": 0.0182, + "step": 2635 + }, + { + "epoch": 0.44875723527408923, + "grad_norm": 1.0405231714248657, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 2636 + }, + { + "epoch": 0.4489274770173647, + "grad_norm": 1.2849845886230469, + "learning_rate": 1e-06, + "loss": 0.0148, + "step": 2637 + }, + { + "epoch": 0.4490977187606401, + "grad_norm": 1.4233804941177368, + "learning_rate": 1e-06, + "loss": 0.0149, + "step": 2638 + }, + { + "epoch": 0.44926796050391554, + "grad_norm": 1.2203373908996582, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 2639 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 1.1618257761001587, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 2640 + }, + { + "epoch": 0.44960844399046646, + "grad_norm": 1.307241439819336, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 2641 + }, + { + "epoch": 0.4497786857337419, + "grad_norm": 1.3096569776535034, + "learning_rate": 1e-06, + "loss": 0.016, + "step": 2642 + }, + { + "epoch": 0.44994892747701737, + "grad_norm": 2.0565989017486572, + "learning_rate": 1e-06, + "loss": 0.0164, + "step": 2643 + }, + { + "epoch": 0.4501191692202928, + "grad_norm": 1.3968826532363892, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 2644 + }, + { + "epoch": 0.4502894109635683, + "grad_norm": 1.3210657835006714, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 2645 + }, + { + "epoch": 0.45045965270684374, + "grad_norm": 1.1253306865692139, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 2646 + }, + { + "epoch": 0.4506298944501192, + "grad_norm": 1.6668893098831177, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2647 + }, + { + "epoch": 0.4508001361933946, + "grad_norm": 1.3010493516921997, + "learning_rate": 1e-06, + "loss": 0.0151, + "step": 2648 + }, + { + "epoch": 0.45097037793667005, + "grad_norm": 1.4834797382354736, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 2649 + }, + { + "epoch": 0.4511406196799455, + "grad_norm": 1.3294862508773804, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 2650 + }, + { + "epoch": 0.45131086142322097, + "grad_norm": 1.2867186069488525, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 2651 + }, + { + "epoch": 0.4514811031664964, + "grad_norm": 1.1677923202514648, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 2652 + }, + { + "epoch": 0.4516513449097719, + "grad_norm": 1.1350395679473877, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 2653 + }, + { + "epoch": 0.45182158665304734, + "grad_norm": 1.4789958000183105, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 2654 + }, + { + "epoch": 0.4519918283963228, + "grad_norm": 1.325280785560608, + "learning_rate": 1e-06, + "loss": 0.0194, + "step": 2655 + }, + { + "epoch": 0.45216207013959825, + "grad_norm": 1.2727978229522705, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 2656 + }, + { + "epoch": 0.4523323118828737, + "grad_norm": 1.313781976699829, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 2657 + }, + { + "epoch": 0.4525025536261491, + "grad_norm": 1.6075435876846313, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 2658 + }, + { + "epoch": 0.45267279536942456, + "grad_norm": 1.2357505559921265, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 2659 + }, + { + "epoch": 0.4528430371127, + "grad_norm": 1.182496428489685, + "learning_rate": 1e-06, + "loss": 0.0181, + "step": 2660 + }, + { + "epoch": 0.4530132788559755, + "grad_norm": 0.940984845161438, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 2661 + }, + { + "epoch": 0.45318352059925093, + "grad_norm": 1.304612636566162, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 2662 + }, + { + "epoch": 0.4533537623425264, + "grad_norm": 1.482349157333374, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2663 + }, + { + "epoch": 0.45352400408580185, + "grad_norm": 1.1997534036636353, + "learning_rate": 1e-06, + "loss": 0.0157, + "step": 2664 + }, + { + "epoch": 0.4536942458290773, + "grad_norm": 1.2524585723876953, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 2665 + }, + { + "epoch": 0.45386448757235276, + "grad_norm": 1.139270544052124, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 2666 + }, + { + "epoch": 0.4540347293156282, + "grad_norm": 1.2429019212722778, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2667 + }, + { + "epoch": 0.4542049710589036, + "grad_norm": 1.0963373184204102, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 2668 + }, + { + "epoch": 0.4543752128021791, + "grad_norm": 1.0817755460739136, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 2669 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 1.246390700340271, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 2670 + }, + { + "epoch": 0.45471569628873, + "grad_norm": 1.3719927072525024, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 2671 + }, + { + "epoch": 0.45488593803200544, + "grad_norm": 1.0424156188964844, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 2672 + }, + { + "epoch": 0.4550561797752809, + "grad_norm": 1.1549937725067139, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 2673 + }, + { + "epoch": 0.45522642151855636, + "grad_norm": 1.3112993240356445, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 2674 + }, + { + "epoch": 0.4553966632618318, + "grad_norm": 1.9749765396118164, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 2675 + }, + { + "epoch": 0.45556690500510727, + "grad_norm": 1.4129223823547363, + "learning_rate": 1e-06, + "loss": 0.0171, + "step": 2676 + }, + { + "epoch": 0.4557371467483827, + "grad_norm": 1.2779748439788818, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2677 + }, + { + "epoch": 0.4559073884916581, + "grad_norm": 4.037950038909912, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 2678 + }, + { + "epoch": 0.4560776302349336, + "grad_norm": 1.1884809732437134, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 2679 + }, + { + "epoch": 0.45624787197820904, + "grad_norm": 1.5136353969573975, + "learning_rate": 1e-06, + "loss": 0.0176, + "step": 2680 + }, + { + "epoch": 0.4564181137214845, + "grad_norm": 5.290273189544678, + "learning_rate": 1e-06, + "loss": 0.0812, + "step": 2681 + }, + { + "epoch": 0.45658835546475995, + "grad_norm": 1.0113693475723267, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 2682 + }, + { + "epoch": 0.4567585972080354, + "grad_norm": 1.279430627822876, + "learning_rate": 1e-06, + "loss": 0.0182, + "step": 2683 + }, + { + "epoch": 0.45692883895131087, + "grad_norm": 1.4505813121795654, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 2684 + }, + { + "epoch": 0.4570990806945863, + "grad_norm": 1.6365201473236084, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 2685 + }, + { + "epoch": 0.4572693224378618, + "grad_norm": 1.1942694187164307, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 2686 + }, + { + "epoch": 0.45743956418113724, + "grad_norm": 1.1934460401535034, + "learning_rate": 1e-06, + "loss": 0.0172, + "step": 2687 + }, + { + "epoch": 0.4576098059244127, + "grad_norm": 1.3042709827423096, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2688 + }, + { + "epoch": 0.4577800476676881, + "grad_norm": 1.2046012878417969, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 2689 + }, + { + "epoch": 0.45795028941096355, + "grad_norm": 1.5803378820419312, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 2690 + }, + { + "epoch": 0.458120531154239, + "grad_norm": 1.2354884147644043, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 2691 + }, + { + "epoch": 0.45829077289751446, + "grad_norm": 2.588909864425659, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 2692 + }, + { + "epoch": 0.4584610146407899, + "grad_norm": 1.2800123691558838, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 2693 + }, + { + "epoch": 0.4586312563840654, + "grad_norm": 1.6324779987335205, + "learning_rate": 1e-06, + "loss": 0.0157, + "step": 2694 + }, + { + "epoch": 0.45880149812734083, + "grad_norm": 1.0287292003631592, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 2695 + }, + { + "epoch": 0.4589717398706163, + "grad_norm": 1.4863289594650269, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 2696 + }, + { + "epoch": 0.45914198161389175, + "grad_norm": 1.7314716577529907, + "learning_rate": 1e-06, + "loss": 0.0222, + "step": 2697 + }, + { + "epoch": 0.4593122233571672, + "grad_norm": 1.315182089805603, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 2698 + }, + { + "epoch": 0.4594824651004426, + "grad_norm": 1.8149510622024536, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 2699 + }, + { + "epoch": 0.45965270684371806, + "grad_norm": 1.528834581375122, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 2700 + }, + { + "epoch": 0.4598229485869935, + "grad_norm": 1.3489001989364624, + "learning_rate": 1e-06, + "loss": 0.021, + "step": 2701 + }, + { + "epoch": 0.459993190330269, + "grad_norm": 1.1386735439300537, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 2702 + }, + { + "epoch": 0.46016343207354443, + "grad_norm": 0.9807332754135132, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 2703 + }, + { + "epoch": 0.4603336738168199, + "grad_norm": 1.237453579902649, + "learning_rate": 1e-06, + "loss": 0.018, + "step": 2704 + }, + { + "epoch": 0.46050391556009534, + "grad_norm": 1.3836872577667236, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 2705 + }, + { + "epoch": 0.4606741573033708, + "grad_norm": 1.1257848739624023, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 2706 + }, + { + "epoch": 0.46084439904664626, + "grad_norm": 1.412534475326538, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2707 + }, + { + "epoch": 0.4610146407899217, + "grad_norm": 1.4209263324737549, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 2708 + }, + { + "epoch": 0.4611848825331971, + "grad_norm": 1.466731309890747, + "learning_rate": 1e-06, + "loss": 0.0163, + "step": 2709 + }, + { + "epoch": 0.4613551242764726, + "grad_norm": 2.445279121398926, + "learning_rate": 1e-06, + "loss": 0.0225, + "step": 2710 + }, + { + "epoch": 0.46152536601974803, + "grad_norm": 1.6007717847824097, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 2711 + }, + { + "epoch": 0.4616956077630235, + "grad_norm": 4.2332844734191895, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 2712 + }, + { + "epoch": 0.46186584950629894, + "grad_norm": 1.627395749092102, + "learning_rate": 1e-06, + "loss": 0.0187, + "step": 2713 + }, + { + "epoch": 0.4620360912495744, + "grad_norm": 1.2684941291809082, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 2714 + }, + { + "epoch": 0.46220633299284986, + "grad_norm": 1.287532925605774, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 2715 + }, + { + "epoch": 0.4623765747361253, + "grad_norm": 1.6071957349777222, + "learning_rate": 1e-06, + "loss": 0.0171, + "step": 2716 + }, + { + "epoch": 0.46254681647940077, + "grad_norm": 1.3653993606567383, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2717 + }, + { + "epoch": 0.4627170582226762, + "grad_norm": 0.9938444495201111, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 2718 + }, + { + "epoch": 0.4628872999659516, + "grad_norm": 1.3033769130706787, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 2719 + }, + { + "epoch": 0.4630575417092271, + "grad_norm": 1.2116801738739014, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 2720 + }, + { + "epoch": 0.46322778345250254, + "grad_norm": 0.969497561454773, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 2721 + }, + { + "epoch": 0.463398025195778, + "grad_norm": 1.1007535457611084, + "learning_rate": 1e-06, + "loss": 0.0151, + "step": 2722 + }, + { + "epoch": 0.46356826693905345, + "grad_norm": 1.2838865518569946, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 2723 + }, + { + "epoch": 0.4637385086823289, + "grad_norm": 1.1375126838684082, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 2724 + }, + { + "epoch": 0.46390875042560437, + "grad_norm": 0.9952174425125122, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 2725 + }, + { + "epoch": 0.4640789921688798, + "grad_norm": 1.0825743675231934, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 2726 + }, + { + "epoch": 0.4642492339121553, + "grad_norm": 1.658443808555603, + "learning_rate": 1e-06, + "loss": 0.0286, + "step": 2727 + }, + { + "epoch": 0.46441947565543074, + "grad_norm": 1.2224160432815552, + "learning_rate": 1e-06, + "loss": 0.0171, + "step": 2728 + }, + { + "epoch": 0.46458971739870614, + "grad_norm": 1.210042119026184, + "learning_rate": 1e-06, + "loss": 0.0128, + "step": 2729 + }, + { + "epoch": 0.4647599591419816, + "grad_norm": 1.4320639371871948, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 2730 + }, + { + "epoch": 0.46493020088525705, + "grad_norm": 1.2222720384597778, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 2731 + }, + { + "epoch": 0.4651004426285325, + "grad_norm": 1.5248254537582397, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 2732 + }, + { + "epoch": 0.46527068437180796, + "grad_norm": 1.599933385848999, + "learning_rate": 1e-06, + "loss": 0.0173, + "step": 2733 + }, + { + "epoch": 0.4654409261150834, + "grad_norm": 1.824179768562317, + "learning_rate": 1e-06, + "loss": 0.0158, + "step": 2734 + }, + { + "epoch": 0.4656111678583589, + "grad_norm": 1.516343355178833, + "learning_rate": 1e-06, + "loss": 0.0209, + "step": 2735 + }, + { + "epoch": 0.46578140960163433, + "grad_norm": 1.768630862236023, + "learning_rate": 1e-06, + "loss": 0.0191, + "step": 2736 + }, + { + "epoch": 0.4659516513449098, + "grad_norm": 1.7268173694610596, + "learning_rate": 1e-06, + "loss": 0.0312, + "step": 2737 + }, + { + "epoch": 0.46612189308818525, + "grad_norm": 1.7336803674697876, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 2738 + }, + { + "epoch": 0.46629213483146065, + "grad_norm": 1.4071462154388428, + "learning_rate": 1e-06, + "loss": 0.0165, + "step": 2739 + }, + { + "epoch": 0.4664623765747361, + "grad_norm": 1.6221487522125244, + "learning_rate": 1e-06, + "loss": 0.0204, + "step": 2740 + }, + { + "epoch": 0.46663261831801156, + "grad_norm": 1.0625100135803223, + "learning_rate": 1e-06, + "loss": 0.018, + "step": 2741 + }, + { + "epoch": 0.466802860061287, + "grad_norm": 3.000713348388672, + "learning_rate": 1e-06, + "loss": 0.0445, + "step": 2742 + }, + { + "epoch": 0.4669731018045625, + "grad_norm": 1.5852463245391846, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 2743 + }, + { + "epoch": 0.46714334354783793, + "grad_norm": 1.2370667457580566, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 2744 + }, + { + "epoch": 0.4673135852911134, + "grad_norm": 1.1240363121032715, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 2745 + }, + { + "epoch": 0.46748382703438884, + "grad_norm": 1.7612371444702148, + "learning_rate": 1e-06, + "loss": 0.0312, + "step": 2746 + }, + { + "epoch": 0.4676540687776643, + "grad_norm": 1.0981045961380005, + "learning_rate": 1e-06, + "loss": 0.016, + "step": 2747 + }, + { + "epoch": 0.46782431052093976, + "grad_norm": 1.3208484649658203, + "learning_rate": 1e-06, + "loss": 0.0167, + "step": 2748 + }, + { + "epoch": 0.46799455226421516, + "grad_norm": 1.1048396825790405, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 2749 + }, + { + "epoch": 0.4681647940074906, + "grad_norm": 1.2207963466644287, + "learning_rate": 1e-06, + "loss": 0.0188, + "step": 2750 + }, + { + "epoch": 0.46833503575076607, + "grad_norm": 1.3159321546554565, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 2751 + }, + { + "epoch": 0.4685052774940415, + "grad_norm": 1.4273492097854614, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 2752 + }, + { + "epoch": 0.468675519237317, + "grad_norm": 1.4401754140853882, + "learning_rate": 1e-06, + "loss": 0.0151, + "step": 2753 + }, + { + "epoch": 0.46884576098059244, + "grad_norm": 1.262450098991394, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 2754 + }, + { + "epoch": 0.4690160027238679, + "grad_norm": 1.0684226751327515, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 2755 + }, + { + "epoch": 0.46918624446714335, + "grad_norm": 1.1081675291061401, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 2756 + }, + { + "epoch": 0.4693564862104188, + "grad_norm": 1.0820542573928833, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 2757 + }, + { + "epoch": 0.46952672795369427, + "grad_norm": 1.4552348852157593, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 2758 + }, + { + "epoch": 0.4696969696969697, + "grad_norm": 1.2042700052261353, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 2759 + }, + { + "epoch": 0.4698672114402451, + "grad_norm": 1.2401795387268066, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 2760 + }, + { + "epoch": 0.4700374531835206, + "grad_norm": 1.3959617614746094, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 2761 + }, + { + "epoch": 0.47020769492679604, + "grad_norm": 1.220746636390686, + "learning_rate": 1e-06, + "loss": 0.0172, + "step": 2762 + }, + { + "epoch": 0.4703779366700715, + "grad_norm": 1.241463303565979, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 2763 + }, + { + "epoch": 0.47054817841334695, + "grad_norm": 1.2252508401870728, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 2764 + }, + { + "epoch": 0.4707184201566224, + "grad_norm": 1.9142897129058838, + "learning_rate": 1e-06, + "loss": 0.028, + "step": 2765 + }, + { + "epoch": 0.47088866189989786, + "grad_norm": 1.1304916143417358, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 2766 + }, + { + "epoch": 0.4710589036431733, + "grad_norm": 1.247910499572754, + "learning_rate": 1e-06, + "loss": 0.0179, + "step": 2767 + }, + { + "epoch": 0.4712291453864488, + "grad_norm": 1.3291040658950806, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2768 + }, + { + "epoch": 0.47139938712972423, + "grad_norm": 1.139250636100769, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 2769 + }, + { + "epoch": 0.47156962887299964, + "grad_norm": 1.6703858375549316, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 2770 + }, + { + "epoch": 0.4717398706162751, + "grad_norm": 1.6940549612045288, + "learning_rate": 1e-06, + "loss": 0.0164, + "step": 2771 + }, + { + "epoch": 0.47191011235955055, + "grad_norm": 1.24887216091156, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2772 + }, + { + "epoch": 0.472080354102826, + "grad_norm": 1.0762343406677246, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 2773 + }, + { + "epoch": 0.47225059584610146, + "grad_norm": 1.1544651985168457, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 2774 + }, + { + "epoch": 0.4724208375893769, + "grad_norm": 1.3130018711090088, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 2775 + }, + { + "epoch": 0.4725910793326524, + "grad_norm": 1.620080590248108, + "learning_rate": 1e-06, + "loss": 0.0181, + "step": 2776 + }, + { + "epoch": 0.47276132107592783, + "grad_norm": 1.6613308191299438, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 2777 + }, + { + "epoch": 0.4729315628192033, + "grad_norm": 1.066480040550232, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 2778 + }, + { + "epoch": 0.47310180456247874, + "grad_norm": 1.109659194946289, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 2779 + }, + { + "epoch": 0.47327204630575415, + "grad_norm": 1.551084041595459, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 2780 + }, + { + "epoch": 0.4734422880490296, + "grad_norm": 1.2061805725097656, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 2781 + }, + { + "epoch": 0.47361252979230506, + "grad_norm": 1.9850165843963623, + "learning_rate": 1e-06, + "loss": 0.0128, + "step": 2782 + }, + { + "epoch": 0.4737827715355805, + "grad_norm": 1.3873836994171143, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 2783 + }, + { + "epoch": 0.473953013278856, + "grad_norm": 1.0683759450912476, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 2784 + }, + { + "epoch": 0.47412325502213143, + "grad_norm": 0.9684468507766724, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 2785 + }, + { + "epoch": 0.4742934967654069, + "grad_norm": 1.323331594467163, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 2786 + }, + { + "epoch": 0.47446373850868234, + "grad_norm": 1.1693027019500732, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 2787 + }, + { + "epoch": 0.4746339802519578, + "grad_norm": 0.9231655597686768, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 2788 + }, + { + "epoch": 0.47480422199523326, + "grad_norm": 1.9782781600952148, + "learning_rate": 1e-06, + "loss": 0.0199, + "step": 2789 + }, + { + "epoch": 0.47497446373850866, + "grad_norm": 1.5112477540969849, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 2790 + }, + { + "epoch": 0.4751447054817841, + "grad_norm": 1.178739309310913, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 2791 + }, + { + "epoch": 0.47531494722505957, + "grad_norm": 2.3978826999664307, + "learning_rate": 1e-06, + "loss": 0.0155, + "step": 2792 + }, + { + "epoch": 0.475485188968335, + "grad_norm": 1.3973369598388672, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 2793 + }, + { + "epoch": 0.4756554307116105, + "grad_norm": 1.1762423515319824, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 2794 + }, + { + "epoch": 0.47582567245488594, + "grad_norm": 1.2299197912216187, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 2795 + }, + { + "epoch": 0.4759959141981614, + "grad_norm": 1.3088619709014893, + "learning_rate": 1e-06, + "loss": 0.0174, + "step": 2796 + }, + { + "epoch": 0.47616615594143685, + "grad_norm": 1.0132900476455688, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 2797 + }, + { + "epoch": 0.4763363976847123, + "grad_norm": 1.129879355430603, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 2798 + }, + { + "epoch": 0.47650663942798777, + "grad_norm": 0.9757645130157471, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 2799 + }, + { + "epoch": 0.47667688117126317, + "grad_norm": 1.4585630893707275, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 2800 + }, + { + "epoch": 0.4768471229145386, + "grad_norm": 1.0814754962921143, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 2801 + }, + { + "epoch": 0.4770173646578141, + "grad_norm": 1.095276117324829, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 2802 + }, + { + "epoch": 0.47718760640108954, + "grad_norm": 1.1605823040008545, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 2803 + }, + { + "epoch": 0.477357848144365, + "grad_norm": 0.9512302279472351, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 2804 + }, + { + "epoch": 0.47752808988764045, + "grad_norm": 0.8988835215568542, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 2805 + }, + { + "epoch": 0.4776983316309159, + "grad_norm": 1.4442946910858154, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 2806 + }, + { + "epoch": 0.47786857337419136, + "grad_norm": 1.1615439653396606, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 2807 + }, + { + "epoch": 0.4780388151174668, + "grad_norm": 1.2446208000183105, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 2808 + }, + { + "epoch": 0.4782090568607423, + "grad_norm": 1.1819733381271362, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 2809 + }, + { + "epoch": 0.4783792986040177, + "grad_norm": 1.3760933876037598, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 2810 + }, + { + "epoch": 0.47854954034729313, + "grad_norm": 1.2199891805648804, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 2811 + }, + { + "epoch": 0.4787197820905686, + "grad_norm": 1.2610541582107544, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 2812 + }, + { + "epoch": 0.47889002383384405, + "grad_norm": 1.4175294637680054, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 2813 + }, + { + "epoch": 0.4790602655771195, + "grad_norm": 1.0820069313049316, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 2814 + }, + { + "epoch": 0.47923050732039496, + "grad_norm": 1.236606478691101, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 2815 + }, + { + "epoch": 0.4794007490636704, + "grad_norm": 1.2256829738616943, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 2816 + }, + { + "epoch": 0.4795709908069459, + "grad_norm": 1.4265694618225098, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 2817 + }, + { + "epoch": 0.47974123255022133, + "grad_norm": 1.2129721641540527, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 2818 + }, + { + "epoch": 0.4799114742934968, + "grad_norm": 1.3204402923583984, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 2819 + }, + { + "epoch": 0.48008171603677224, + "grad_norm": 2.682424783706665, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 2820 + }, + { + "epoch": 0.48025195778004764, + "grad_norm": 1.4030735492706299, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 2821 + }, + { + "epoch": 0.4804221995233231, + "grad_norm": 2.13521671295166, + "learning_rate": 1e-06, + "loss": 0.02, + "step": 2822 + }, + { + "epoch": 0.48059244126659856, + "grad_norm": 1.1667373180389404, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 2823 + }, + { + "epoch": 0.480762683009874, + "grad_norm": 1.3074268102645874, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 2824 + }, + { + "epoch": 0.48093292475314947, + "grad_norm": 1.321334719657898, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 2825 + }, + { + "epoch": 0.4811031664964249, + "grad_norm": 0.9559478759765625, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 2826 + }, + { + "epoch": 0.4812734082397004, + "grad_norm": 0.8174387812614441, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 2827 + }, + { + "epoch": 0.48144364998297584, + "grad_norm": 1.3920475244522095, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 2828 + }, + { + "epoch": 0.4816138917262513, + "grad_norm": 1.5686707496643066, + "learning_rate": 1e-06, + "loss": 0.0187, + "step": 2829 + }, + { + "epoch": 0.48178413346952675, + "grad_norm": 1.0519627332687378, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 2830 + }, + { + "epoch": 0.48195437521280216, + "grad_norm": 1.2499573230743408, + "learning_rate": 1e-06, + "loss": 0.0158, + "step": 2831 + }, + { + "epoch": 0.4821246169560776, + "grad_norm": 1.9623509645462036, + "learning_rate": 1e-06, + "loss": 0.0192, + "step": 2832 + }, + { + "epoch": 0.48229485869935307, + "grad_norm": 1.0774935483932495, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 2833 + }, + { + "epoch": 0.4824651004426285, + "grad_norm": 1.0941082239151, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 2834 + }, + { + "epoch": 0.482635342185904, + "grad_norm": 1.3845523595809937, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 2835 + }, + { + "epoch": 0.48280558392917944, + "grad_norm": 1.569908857345581, + "learning_rate": 1e-06, + "loss": 0.018, + "step": 2836 + }, + { + "epoch": 0.4829758256724549, + "grad_norm": 1.1683586835861206, + "learning_rate": 1e-06, + "loss": 0.0151, + "step": 2837 + }, + { + "epoch": 0.48314606741573035, + "grad_norm": 1.277343511581421, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 2838 + }, + { + "epoch": 0.4833163091590058, + "grad_norm": 1.2092841863632202, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 2839 + }, + { + "epoch": 0.48348655090228126, + "grad_norm": 1.1916158199310303, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 2840 + }, + { + "epoch": 0.48365679264555667, + "grad_norm": 1.280978798866272, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 2841 + }, + { + "epoch": 0.4838270343888321, + "grad_norm": 1.2461068630218506, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 2842 + }, + { + "epoch": 0.4839972761321076, + "grad_norm": 1.471252202987671, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2843 + }, + { + "epoch": 0.48416751787538304, + "grad_norm": 1.4099452495574951, + "learning_rate": 1e-06, + "loss": 0.0151, + "step": 2844 + }, + { + "epoch": 0.4843377596186585, + "grad_norm": 1.3784700632095337, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 2845 + }, + { + "epoch": 0.48450800136193395, + "grad_norm": 1.322296142578125, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 2846 + }, + { + "epoch": 0.4846782431052094, + "grad_norm": 1.3607927560806274, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 2847 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 0.7826800346374512, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 2848 + }, + { + "epoch": 0.4850187265917603, + "grad_norm": 1.2559735774993896, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 2849 + }, + { + "epoch": 0.4851889683350358, + "grad_norm": 1.423887848854065, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 2850 + }, + { + "epoch": 0.4853592100783112, + "grad_norm": 0.9544010758399963, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 2851 + }, + { + "epoch": 0.48552945182158663, + "grad_norm": 1.8954381942749023, + "learning_rate": 1e-06, + "loss": 0.0208, + "step": 2852 + }, + { + "epoch": 0.4856996935648621, + "grad_norm": 1.5164066553115845, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 2853 + }, + { + "epoch": 0.48586993530813755, + "grad_norm": 1.3428789377212524, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 2854 + }, + { + "epoch": 0.486040177051413, + "grad_norm": 1.2199963331222534, + "learning_rate": 1e-06, + "loss": 0.0168, + "step": 2855 + }, + { + "epoch": 0.48621041879468846, + "grad_norm": 1.103547215461731, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 2856 + }, + { + "epoch": 0.4863806605379639, + "grad_norm": 1.19355309009552, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 2857 + }, + { + "epoch": 0.4865509022812394, + "grad_norm": 1.5257407426834106, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 2858 + }, + { + "epoch": 0.48672114402451483, + "grad_norm": 1.4707660675048828, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 2859 + }, + { + "epoch": 0.4868913857677903, + "grad_norm": 1.3224432468414307, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 2860 + }, + { + "epoch": 0.4870616275110657, + "grad_norm": 1.5693126916885376, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 2861 + }, + { + "epoch": 0.48723186925434114, + "grad_norm": 1.315860629081726, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 2862 + }, + { + "epoch": 0.4874021109976166, + "grad_norm": 1.2941914796829224, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 2863 + }, + { + "epoch": 0.48757235274089206, + "grad_norm": 1.343846321105957, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2864 + }, + { + "epoch": 0.4877425944841675, + "grad_norm": 1.0334171056747437, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 2865 + }, + { + "epoch": 0.48791283622744297, + "grad_norm": 1.295588493347168, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 2866 + }, + { + "epoch": 0.4880830779707184, + "grad_norm": 1.4225810766220093, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 2867 + }, + { + "epoch": 0.4882533197139939, + "grad_norm": 1.4328070878982544, + "learning_rate": 1e-06, + "loss": 0.0186, + "step": 2868 + }, + { + "epoch": 0.48842356145726934, + "grad_norm": 1.6347346305847168, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 2869 + }, + { + "epoch": 0.4885938032005448, + "grad_norm": 1.3069541454315186, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 2870 + }, + { + "epoch": 0.4887640449438202, + "grad_norm": 1.1802916526794434, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 2871 + }, + { + "epoch": 0.48893428668709565, + "grad_norm": 1.2617202997207642, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 2872 + }, + { + "epoch": 0.4891045284303711, + "grad_norm": 1.1735727787017822, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 2873 + }, + { + "epoch": 0.48927477017364657, + "grad_norm": 1.4026036262512207, + "learning_rate": 1e-06, + "loss": 0.0163, + "step": 2874 + }, + { + "epoch": 0.489445011916922, + "grad_norm": 1.503218173980713, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2875 + }, + { + "epoch": 0.4896152536601975, + "grad_norm": 1.212363362312317, + "learning_rate": 1e-06, + "loss": 0.0138, + "step": 2876 + }, + { + "epoch": 0.48978549540347294, + "grad_norm": 1.4121812582015991, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 2877 + }, + { + "epoch": 0.4899557371467484, + "grad_norm": 1.3481481075286865, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 2878 + }, + { + "epoch": 0.49012597889002385, + "grad_norm": 1.1686102151870728, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2879 + }, + { + "epoch": 0.4902962206332993, + "grad_norm": 1.311177372932434, + "learning_rate": 1e-06, + "loss": 0.0189, + "step": 2880 + }, + { + "epoch": 0.49046646237657476, + "grad_norm": 1.1763383150100708, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 2881 + }, + { + "epoch": 0.49063670411985016, + "grad_norm": 1.164952039718628, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 2882 + }, + { + "epoch": 0.4908069458631256, + "grad_norm": 1.2506550550460815, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 2883 + }, + { + "epoch": 0.4909771876064011, + "grad_norm": 1.4701337814331055, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 2884 + }, + { + "epoch": 0.49114742934967653, + "grad_norm": 1.318747639656067, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 2885 + }, + { + "epoch": 0.491317671092952, + "grad_norm": 1.1562262773513794, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 2886 + }, + { + "epoch": 0.49148791283622745, + "grad_norm": 1.6450200080871582, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 2887 + }, + { + "epoch": 0.4916581545795029, + "grad_norm": 1.0631816387176514, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 2888 + }, + { + "epoch": 0.49182839632277836, + "grad_norm": 0.9874283671379089, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 2889 + }, + { + "epoch": 0.4919986380660538, + "grad_norm": 1.4185420274734497, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 2890 + }, + { + "epoch": 0.4921688798093293, + "grad_norm": 0.9347524642944336, + "learning_rate": 1e-06, + "loss": 0.0069, + "step": 2891 + }, + { + "epoch": 0.4923391215526047, + "grad_norm": 0.9645169973373413, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 2892 + }, + { + "epoch": 0.49250936329588013, + "grad_norm": 1.236021876335144, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 2893 + }, + { + "epoch": 0.4926796050391556, + "grad_norm": 1.1450927257537842, + "learning_rate": 1e-06, + "loss": 0.0149, + "step": 2894 + }, + { + "epoch": 0.49284984678243104, + "grad_norm": 1.5991291999816895, + "learning_rate": 1e-06, + "loss": 0.0155, + "step": 2895 + }, + { + "epoch": 0.4930200885257065, + "grad_norm": 1.1790274381637573, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 2896 + }, + { + "epoch": 0.49319033026898196, + "grad_norm": 1.1705747842788696, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 2897 + }, + { + "epoch": 0.4933605720122574, + "grad_norm": 0.9381265640258789, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 2898 + }, + { + "epoch": 0.49353081375553287, + "grad_norm": 1.261146903038025, + "learning_rate": 1e-06, + "loss": 0.014, + "step": 2899 + }, + { + "epoch": 0.4937010554988083, + "grad_norm": 1.0365086793899536, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 2900 + }, + { + "epoch": 0.4938712972420838, + "grad_norm": 1.20415198802948, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 2901 + }, + { + "epoch": 0.4940415389853592, + "grad_norm": 1.1824898719787598, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 2902 + }, + { + "epoch": 0.49421178072863464, + "grad_norm": 1.3033336400985718, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2903 + }, + { + "epoch": 0.4943820224719101, + "grad_norm": 1.490906834602356, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 2904 + }, + { + "epoch": 0.49455226421518556, + "grad_norm": 1.298890471458435, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 2905 + }, + { + "epoch": 0.494722505958461, + "grad_norm": 1.805892825126648, + "learning_rate": 1e-06, + "loss": 0.028, + "step": 2906 + }, + { + "epoch": 0.49489274770173647, + "grad_norm": 1.0890419483184814, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 2907 + }, + { + "epoch": 0.4950629894450119, + "grad_norm": 1.0342925786972046, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 2908 + }, + { + "epoch": 0.4952332311882874, + "grad_norm": 1.5081183910369873, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 2909 + }, + { + "epoch": 0.49540347293156284, + "grad_norm": 2.6579811573028564, + "learning_rate": 1e-06, + "loss": 0.0148, + "step": 2910 + }, + { + "epoch": 0.4955737146748383, + "grad_norm": 1.3652353286743164, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 2911 + }, + { + "epoch": 0.4957439564181137, + "grad_norm": 1.6202207803726196, + "learning_rate": 1e-06, + "loss": 0.0186, + "step": 2912 + }, + { + "epoch": 0.49591419816138915, + "grad_norm": 1.1754422187805176, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 2913 + }, + { + "epoch": 0.4960844399046646, + "grad_norm": 1.2409707307815552, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2914 + }, + { + "epoch": 0.49625468164794007, + "grad_norm": 1.7688099145889282, + "learning_rate": 1e-06, + "loss": 0.0294, + "step": 2915 + }, + { + "epoch": 0.4964249233912155, + "grad_norm": 1.4038777351379395, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 2916 + }, + { + "epoch": 0.496595165134491, + "grad_norm": 0.8886030316352844, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 2917 + }, + { + "epoch": 0.49676540687776644, + "grad_norm": 2.1861026287078857, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 2918 + }, + { + "epoch": 0.4969356486210419, + "grad_norm": 1.338416576385498, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 2919 + }, + { + "epoch": 0.49710589036431735, + "grad_norm": 1.1682435274124146, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 2920 + }, + { + "epoch": 0.4972761321075928, + "grad_norm": 1.538461446762085, + "learning_rate": 1e-06, + "loss": 0.0213, + "step": 2921 + }, + { + "epoch": 0.4974463738508682, + "grad_norm": 1.5773953199386597, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 2922 + }, + { + "epoch": 0.49761661559414366, + "grad_norm": 1.3020201921463013, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 2923 + }, + { + "epoch": 0.4977868573374191, + "grad_norm": 1.1434447765350342, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 2924 + }, + { + "epoch": 0.4979570990806946, + "grad_norm": 1.2094712257385254, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 2925 + }, + { + "epoch": 0.49812734082397003, + "grad_norm": 1.0923223495483398, + "learning_rate": 1e-06, + "loss": 0.0066, + "step": 2926 + }, + { + "epoch": 0.4982975825672455, + "grad_norm": 1.6678919792175293, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 2927 + }, + { + "epoch": 0.49846782431052095, + "grad_norm": 1.2051362991333008, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 2928 + }, + { + "epoch": 0.4986380660537964, + "grad_norm": 1.1779026985168457, + "learning_rate": 1e-06, + "loss": 0.0148, + "step": 2929 + }, + { + "epoch": 0.49880830779707186, + "grad_norm": 1.1191192865371704, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 2930 + }, + { + "epoch": 0.4989785495403473, + "grad_norm": 1.4694077968597412, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 2931 + }, + { + "epoch": 0.4991487912836227, + "grad_norm": 1.1187613010406494, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 2932 + }, + { + "epoch": 0.4993190330268982, + "grad_norm": 1.3665251731872559, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 2933 + }, + { + "epoch": 0.49948927477017363, + "grad_norm": 2.8328680992126465, + "learning_rate": 1e-06, + "loss": 0.0335, + "step": 2934 + }, + { + "epoch": 0.4996595165134491, + "grad_norm": 1.5214513540267944, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 2935 + }, + { + "epoch": 0.49982975825672454, + "grad_norm": 1.1730064153671265, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 2936 + }, + { + "epoch": 0.5, + "grad_norm": 1.2666690349578857, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 2937 + }, + { + "epoch": 0.5001702417432754, + "grad_norm": 1.3443002700805664, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 2938 + }, + { + "epoch": 0.5003404834865509, + "grad_norm": 1.225641131401062, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 2939 + }, + { + "epoch": 0.5005107252298263, + "grad_norm": 1.4913207292556763, + "learning_rate": 1e-06, + "loss": 0.0155, + "step": 2940 + }, + { + "epoch": 0.5006809669731018, + "grad_norm": 1.65010666847229, + "learning_rate": 1e-06, + "loss": 0.0235, + "step": 2941 + }, + { + "epoch": 0.5008512087163772, + "grad_norm": 0.9611451625823975, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 2942 + }, + { + "epoch": 0.5010214504596527, + "grad_norm": 1.4015928506851196, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 2943 + }, + { + "epoch": 0.5011916922029281, + "grad_norm": 0.9811087250709534, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 2944 + }, + { + "epoch": 0.5013619339462037, + "grad_norm": 1.1719774007797241, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 2945 + }, + { + "epoch": 0.501532175689479, + "grad_norm": 1.2584707736968994, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 2946 + }, + { + "epoch": 0.5017024174327546, + "grad_norm": 2.9327192306518555, + "learning_rate": 1e-06, + "loss": 0.0499, + "step": 2947 + }, + { + "epoch": 0.50187265917603, + "grad_norm": 1.3452236652374268, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2948 + }, + { + "epoch": 0.5020429009193054, + "grad_norm": 1.2244794368743896, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 2949 + }, + { + "epoch": 0.5022131426625809, + "grad_norm": 1.3074755668640137, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 2950 + }, + { + "epoch": 0.5023833844058563, + "grad_norm": 1.3225749731063843, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 2951 + }, + { + "epoch": 0.5025536261491318, + "grad_norm": 1.5591984987258911, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 2952 + }, + { + "epoch": 0.5027238678924072, + "grad_norm": 1.4079065322875977, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 2953 + }, + { + "epoch": 0.5028941096356827, + "grad_norm": 1.699751377105713, + "learning_rate": 1e-06, + "loss": 0.0138, + "step": 2954 + }, + { + "epoch": 0.5030643513789581, + "grad_norm": 1.5341709852218628, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 2955 + }, + { + "epoch": 0.5032345931222336, + "grad_norm": 1.3966187238693237, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 2956 + }, + { + "epoch": 0.503404834865509, + "grad_norm": 1.1383638381958008, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 2957 + }, + { + "epoch": 0.5035750766087844, + "grad_norm": 1.2133922576904297, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 2958 + }, + { + "epoch": 0.5037453183520599, + "grad_norm": 1.335353970527649, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 2959 + }, + { + "epoch": 0.5039155600953353, + "grad_norm": 1.509978175163269, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 2960 + }, + { + "epoch": 0.5040858018386108, + "grad_norm": 1.110413670539856, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 2961 + }, + { + "epoch": 0.5042560435818862, + "grad_norm": 1.325568437576294, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 2962 + }, + { + "epoch": 0.5044262853251618, + "grad_norm": 1.420128345489502, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 2963 + }, + { + "epoch": 0.5045965270684372, + "grad_norm": 0.9586927890777588, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 2964 + }, + { + "epoch": 0.5047667688117127, + "grad_norm": 1.4876079559326172, + "learning_rate": 1e-06, + "loss": 0.0169, + "step": 2965 + }, + { + "epoch": 0.5049370105549881, + "grad_norm": 1.4042977094650269, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 2966 + }, + { + "epoch": 0.5051072522982636, + "grad_norm": 1.723920226097107, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 2967 + }, + { + "epoch": 0.505277494041539, + "grad_norm": 1.1755249500274658, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2968 + }, + { + "epoch": 0.5054477357848144, + "grad_norm": 1.0757211446762085, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 2969 + }, + { + "epoch": 0.5056179775280899, + "grad_norm": 1.336470603942871, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 2970 + }, + { + "epoch": 0.5057882192713653, + "grad_norm": 1.0691769123077393, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 2971 + }, + { + "epoch": 0.5059584610146408, + "grad_norm": 1.1425819396972656, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 2972 + }, + { + "epoch": 0.5061287027579162, + "grad_norm": 1.1680139303207397, + "learning_rate": 1e-06, + "loss": 0.0128, + "step": 2973 + }, + { + "epoch": 0.5062989445011917, + "grad_norm": 0.9332813024520874, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 2974 + }, + { + "epoch": 0.5064691862444671, + "grad_norm": 0.983639121055603, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 2975 + }, + { + "epoch": 0.5066394279877426, + "grad_norm": 1.32431161403656, + "learning_rate": 1e-06, + "loss": 0.0165, + "step": 2976 + }, + { + "epoch": 0.506809669731018, + "grad_norm": 1.2261784076690674, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 2977 + }, + { + "epoch": 0.5069799114742934, + "grad_norm": 3.250417470932007, + "learning_rate": 1e-06, + "loss": 0.0345, + "step": 2978 + }, + { + "epoch": 0.507150153217569, + "grad_norm": 1.4886819124221802, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 2979 + }, + { + "epoch": 0.5073203949608444, + "grad_norm": 1.391330361366272, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2980 + }, + { + "epoch": 0.5074906367041199, + "grad_norm": 1.0295233726501465, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 2981 + }, + { + "epoch": 0.5076608784473953, + "grad_norm": 1.6181305646896362, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 2982 + }, + { + "epoch": 0.5078311201906708, + "grad_norm": 1.2847509384155273, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 2983 + }, + { + "epoch": 0.5080013619339462, + "grad_norm": 1.1932363510131836, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 2984 + }, + { + "epoch": 0.5081716036772217, + "grad_norm": 1.5506092309951782, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 2985 + }, + { + "epoch": 0.5083418454204971, + "grad_norm": 1.11466646194458, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 2986 + }, + { + "epoch": 0.5085120871637726, + "grad_norm": 1.4007675647735596, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 2987 + }, + { + "epoch": 0.508682328907048, + "grad_norm": 1.0601606369018555, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 2988 + }, + { + "epoch": 0.5088525706503234, + "grad_norm": 1.1195155382156372, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 2989 + }, + { + "epoch": 0.5090228123935989, + "grad_norm": 1.2109330892562866, + "learning_rate": 1e-06, + "loss": 0.0176, + "step": 2990 + }, + { + "epoch": 0.5091930541368743, + "grad_norm": 1.20180082321167, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 2991 + }, + { + "epoch": 0.5093632958801498, + "grad_norm": 1.3377587795257568, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 2992 + }, + { + "epoch": 0.5095335376234252, + "grad_norm": 1.0854805707931519, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 2993 + }, + { + "epoch": 0.5097037793667007, + "grad_norm": 1.8077919483184814, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 2994 + }, + { + "epoch": 0.5098740211099761, + "grad_norm": 1.4350749254226685, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 2995 + }, + { + "epoch": 0.5100442628532517, + "grad_norm": 1.6307591199874878, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 2996 + }, + { + "epoch": 0.5102145045965271, + "grad_norm": 1.2001097202301025, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 2997 + }, + { + "epoch": 0.5103847463398025, + "grad_norm": 0.9367316365242004, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 2998 + }, + { + "epoch": 0.510554988083078, + "grad_norm": 1.4046683311462402, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 2999 + }, + { + "epoch": 0.5107252298263534, + "grad_norm": 1.2139997482299805, + "learning_rate": 1e-06, + "loss": 0.0213, + "step": 3000 + }, + { + "epoch": 0.5107252298263534, + "eval_loss": 0.31468451023101807, + "eval_runtime": 21.1588, + "eval_samples_per_second": 14.178, + "eval_steps_per_second": 0.378, + "step": 3000 + }, + { + "epoch": 0.5108954715696289, + "grad_norm": 1.300648808479309, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 3001 + }, + { + "epoch": 0.5110657133129043, + "grad_norm": 1.0096200704574585, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3002 + }, + { + "epoch": 0.5112359550561798, + "grad_norm": 1.1899932622909546, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3003 + }, + { + "epoch": 0.5114061967994552, + "grad_norm": 0.9526229500770569, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3004 + }, + { + "epoch": 0.5115764385427307, + "grad_norm": 1.8259762525558472, + "learning_rate": 1e-06, + "loss": 0.0231, + "step": 3005 + }, + { + "epoch": 0.5117466802860061, + "grad_norm": 1.3105390071868896, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 3006 + }, + { + "epoch": 0.5119169220292816, + "grad_norm": 1.3401422500610352, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3007 + }, + { + "epoch": 0.512087163772557, + "grad_norm": 1.4094361066818237, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 3008 + }, + { + "epoch": 0.5122574055158324, + "grad_norm": 1.5171374082565308, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 3009 + }, + { + "epoch": 0.5124276472591079, + "grad_norm": 1.069426417350769, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 3010 + }, + { + "epoch": 0.5125978890023833, + "grad_norm": 1.3579994440078735, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 3011 + }, + { + "epoch": 0.5127681307456589, + "grad_norm": 0.9840942621231079, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 3012 + }, + { + "epoch": 0.5129383724889343, + "grad_norm": 1.523397445678711, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 3013 + }, + { + "epoch": 0.5131086142322098, + "grad_norm": 1.2539904117584229, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 3014 + }, + { + "epoch": 0.5132788559754852, + "grad_norm": 1.0365042686462402, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3015 + }, + { + "epoch": 0.5134490977187607, + "grad_norm": 0.8410796523094177, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 3016 + }, + { + "epoch": 0.5136193394620361, + "grad_norm": 1.3191943168640137, + "learning_rate": 1e-06, + "loss": 0.0182, + "step": 3017 + }, + { + "epoch": 0.5137895812053116, + "grad_norm": 0.8363417387008667, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3018 + }, + { + "epoch": 0.513959822948587, + "grad_norm": 1.1895376443862915, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 3019 + }, + { + "epoch": 0.5141300646918624, + "grad_norm": 1.4793716669082642, + "learning_rate": 1e-06, + "loss": 0.0191, + "step": 3020 + }, + { + "epoch": 0.5143003064351379, + "grad_norm": 1.583125114440918, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 3021 + }, + { + "epoch": 0.5144705481784133, + "grad_norm": 1.3003768920898438, + "learning_rate": 1e-06, + "loss": 0.0138, + "step": 3022 + }, + { + "epoch": 0.5146407899216888, + "grad_norm": 1.2707653045654297, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 3023 + }, + { + "epoch": 0.5148110316649642, + "grad_norm": 1.094768762588501, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3024 + }, + { + "epoch": 0.5149812734082397, + "grad_norm": 0.997310221195221, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 3025 + }, + { + "epoch": 0.5151515151515151, + "grad_norm": 1.2261766195297241, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 3026 + }, + { + "epoch": 0.5153217568947907, + "grad_norm": 1.513581395149231, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 3027 + }, + { + "epoch": 0.515491998638066, + "grad_norm": 1.1396069526672363, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 3028 + }, + { + "epoch": 0.5156622403813415, + "grad_norm": 1.0320284366607666, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3029 + }, + { + "epoch": 0.515832482124617, + "grad_norm": 1.4781116247177124, + "learning_rate": 1e-06, + "loss": 0.0167, + "step": 3030 + }, + { + "epoch": 0.5160027238678924, + "grad_norm": 1.4949243068695068, + "learning_rate": 1e-06, + "loss": 0.0176, + "step": 3031 + }, + { + "epoch": 0.5161729656111679, + "grad_norm": 1.2848188877105713, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 3032 + }, + { + "epoch": 0.5163432073544433, + "grad_norm": 1.240247368812561, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3033 + }, + { + "epoch": 0.5165134490977188, + "grad_norm": 0.9559020400047302, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3034 + }, + { + "epoch": 0.5166836908409942, + "grad_norm": 1.2412127256393433, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 3035 + }, + { + "epoch": 0.5168539325842697, + "grad_norm": 1.1315244436264038, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 3036 + }, + { + "epoch": 0.5170241743275451, + "grad_norm": 1.2756315469741821, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 3037 + }, + { + "epoch": 0.5171944160708206, + "grad_norm": 1.620142936706543, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3038 + }, + { + "epoch": 0.517364657814096, + "grad_norm": 1.0165133476257324, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3039 + }, + { + "epoch": 0.5175348995573714, + "grad_norm": 1.0977479219436646, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3040 + }, + { + "epoch": 0.5177051413006469, + "grad_norm": 1.153247356414795, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3041 + }, + { + "epoch": 0.5178753830439223, + "grad_norm": 1.3911783695220947, + "learning_rate": 1e-06, + "loss": 0.0167, + "step": 3042 + }, + { + "epoch": 0.5180456247871978, + "grad_norm": 1.2800657749176025, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 3043 + }, + { + "epoch": 0.5182158665304732, + "grad_norm": 1.216147780418396, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 3044 + }, + { + "epoch": 0.5183861082737488, + "grad_norm": 1.263554573059082, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 3045 + }, + { + "epoch": 0.5185563500170242, + "grad_norm": 1.123915672302246, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3046 + }, + { + "epoch": 0.5187265917602997, + "grad_norm": 1.1444989442825317, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 3047 + }, + { + "epoch": 0.5188968335035751, + "grad_norm": 1.2586398124694824, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 3048 + }, + { + "epoch": 0.5190670752468505, + "grad_norm": 1.3242063522338867, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 3049 + }, + { + "epoch": 0.519237316990126, + "grad_norm": 1.1678466796875, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3050 + }, + { + "epoch": 0.5194075587334014, + "grad_norm": 1.157733678817749, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 3051 + }, + { + "epoch": 0.5195778004766769, + "grad_norm": 1.495124340057373, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3052 + }, + { + "epoch": 0.5197480422199523, + "grad_norm": 1.0295337438583374, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 3053 + }, + { + "epoch": 0.5199182839632278, + "grad_norm": 0.9237085580825806, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3054 + }, + { + "epoch": 0.5200885257065032, + "grad_norm": 1.2110072374343872, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 3055 + }, + { + "epoch": 0.5202587674497787, + "grad_norm": 1.4104846715927124, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 3056 + }, + { + "epoch": 0.5204290091930541, + "grad_norm": 1.0618224143981934, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 3057 + }, + { + "epoch": 0.5205992509363296, + "grad_norm": 1.23426353931427, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 3058 + }, + { + "epoch": 0.520769492679605, + "grad_norm": 1.1079343557357788, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 3059 + }, + { + "epoch": 0.5209397344228804, + "grad_norm": 1.472641110420227, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 3060 + }, + { + "epoch": 0.521109976166156, + "grad_norm": 1.1031280755996704, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3061 + }, + { + "epoch": 0.5212802179094314, + "grad_norm": 1.4007097482681274, + "learning_rate": 1e-06, + "loss": 0.0093, + "step": 3062 + }, + { + "epoch": 0.5214504596527069, + "grad_norm": 1.2048598527908325, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 3063 + }, + { + "epoch": 0.5216207013959823, + "grad_norm": 1.1933079957962036, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 3064 + }, + { + "epoch": 0.5217909431392578, + "grad_norm": 1.139629602432251, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 3065 + }, + { + "epoch": 0.5219611848825332, + "grad_norm": 1.330549716949463, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3066 + }, + { + "epoch": 0.5221314266258087, + "grad_norm": 0.8479977250099182, + "learning_rate": 1e-06, + "loss": 0.009, + "step": 3067 + }, + { + "epoch": 0.5223016683690841, + "grad_norm": 1.093963623046875, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3068 + }, + { + "epoch": 0.5224719101123596, + "grad_norm": 1.0407942533493042, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 3069 + }, + { + "epoch": 0.522642151855635, + "grad_norm": 1.4160226583480835, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 3070 + }, + { + "epoch": 0.5228123935989104, + "grad_norm": 1.3179391622543335, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 3071 + }, + { + "epoch": 0.5229826353421859, + "grad_norm": 1.8564910888671875, + "learning_rate": 1e-06, + "loss": 0.0199, + "step": 3072 + }, + { + "epoch": 0.5231528770854613, + "grad_norm": 1.2088946104049683, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 3073 + }, + { + "epoch": 0.5233231188287368, + "grad_norm": 1.1301778554916382, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3074 + }, + { + "epoch": 0.5234933605720122, + "grad_norm": 1.4394501447677612, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 3075 + }, + { + "epoch": 0.5236636023152877, + "grad_norm": 1.0562634468078613, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 3076 + }, + { + "epoch": 0.5238338440585631, + "grad_norm": 1.2691398859024048, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 3077 + }, + { + "epoch": 0.5240040858018387, + "grad_norm": 1.257546305656433, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 3078 + }, + { + "epoch": 0.5241743275451141, + "grad_norm": 1.5691494941711426, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 3079 + }, + { + "epoch": 0.5243445692883895, + "grad_norm": 1.1722148656845093, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3080 + }, + { + "epoch": 0.524514811031665, + "grad_norm": 0.8400589823722839, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3081 + }, + { + "epoch": 0.5246850527749404, + "grad_norm": 1.016576886177063, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 3082 + }, + { + "epoch": 0.5248552945182159, + "grad_norm": 1.129569411277771, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3083 + }, + { + "epoch": 0.5250255362614913, + "grad_norm": 1.5929679870605469, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3084 + }, + { + "epoch": 0.5251957780047668, + "grad_norm": 1.9398027658462524, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 3085 + }, + { + "epoch": 0.5253660197480422, + "grad_norm": 1.6181927919387817, + "learning_rate": 1e-06, + "loss": 0.0182, + "step": 3086 + }, + { + "epoch": 0.5255362614913177, + "grad_norm": 1.1063342094421387, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3087 + }, + { + "epoch": 0.5257065032345931, + "grad_norm": 1.2465970516204834, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 3088 + }, + { + "epoch": 0.5258767449778686, + "grad_norm": 0.9947730302810669, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 3089 + }, + { + "epoch": 0.526046986721144, + "grad_norm": 1.0790003538131714, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 3090 + }, + { + "epoch": 0.5262172284644194, + "grad_norm": 1.3154680728912354, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3091 + }, + { + "epoch": 0.5263874702076949, + "grad_norm": 1.074992060661316, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 3092 + }, + { + "epoch": 0.5265577119509703, + "grad_norm": 1.3531382083892822, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 3093 + }, + { + "epoch": 0.5267279536942459, + "grad_norm": 1.8065028190612793, + "learning_rate": 1e-06, + "loss": 0.0149, + "step": 3094 + }, + { + "epoch": 0.5268981954375213, + "grad_norm": 1.652100920677185, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 3095 + }, + { + "epoch": 0.5270684371807968, + "grad_norm": 1.073090672492981, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 3096 + }, + { + "epoch": 0.5272386789240722, + "grad_norm": 1.0769790410995483, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 3097 + }, + { + "epoch": 0.5274089206673477, + "grad_norm": 1.2385014295578003, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3098 + }, + { + "epoch": 0.5275791624106231, + "grad_norm": 1.9769033193588257, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 3099 + }, + { + "epoch": 0.5277494041538985, + "grad_norm": 1.2743234634399414, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3100 + }, + { + "epoch": 0.527919645897174, + "grad_norm": 0.9275697469711304, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3101 + }, + { + "epoch": 0.5280898876404494, + "grad_norm": 1.1067229509353638, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 3102 + }, + { + "epoch": 0.5282601293837249, + "grad_norm": 1.1381607055664062, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3103 + }, + { + "epoch": 0.5284303711270003, + "grad_norm": 1.1659680604934692, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 3104 + }, + { + "epoch": 0.5286006128702758, + "grad_norm": 1.089293122291565, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3105 + }, + { + "epoch": 0.5287708546135512, + "grad_norm": 1.252436876296997, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 3106 + }, + { + "epoch": 0.5289410963568267, + "grad_norm": 1.4552236795425415, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 3107 + }, + { + "epoch": 0.5291113381001021, + "grad_norm": 1.2212755680084229, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 3108 + }, + { + "epoch": 0.5292815798433776, + "grad_norm": 1.3947482109069824, + "learning_rate": 1e-06, + "loss": 0.0149, + "step": 3109 + }, + { + "epoch": 0.529451821586653, + "grad_norm": 1.2081656455993652, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 3110 + }, + { + "epoch": 0.5296220633299284, + "grad_norm": 1.2081656455993652, + "learning_rate": 1e-06, + "loss": 0.0395, + "step": 3111 + }, + { + "epoch": 0.529792305073204, + "grad_norm": 1.6731754541397095, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 3112 + }, + { + "epoch": 0.5299625468164794, + "grad_norm": 1.3174570798873901, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 3113 + }, + { + "epoch": 0.5301327885597549, + "grad_norm": 1.2845019102096558, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3114 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 1.1987799406051636, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3115 + }, + { + "epoch": 0.5304732720463058, + "grad_norm": 0.9767221808433533, + "learning_rate": 1e-06, + "loss": 0.0066, + "step": 3116 + }, + { + "epoch": 0.5306435137895812, + "grad_norm": 1.1443259716033936, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3117 + }, + { + "epoch": 0.5308137555328567, + "grad_norm": 1.4838154315948486, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 3118 + }, + { + "epoch": 0.5309839972761321, + "grad_norm": 1.5887151956558228, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 3119 + }, + { + "epoch": 0.5311542390194075, + "grad_norm": 1.2178723812103271, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3120 + }, + { + "epoch": 0.531324480762683, + "grad_norm": 1.0301523208618164, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3121 + }, + { + "epoch": 0.5314947225059584, + "grad_norm": 1.1088286638259888, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 3122 + }, + { + "epoch": 0.5316649642492339, + "grad_norm": 1.2651501893997192, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 3123 + }, + { + "epoch": 0.5318352059925093, + "grad_norm": 1.6318732500076294, + "learning_rate": 1e-06, + "loss": 0.0198, + "step": 3124 + }, + { + "epoch": 0.5320054477357848, + "grad_norm": 1.1797488927841187, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 3125 + }, + { + "epoch": 0.5321756894790602, + "grad_norm": 1.480616569519043, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3126 + }, + { + "epoch": 0.5323459312223358, + "grad_norm": 1.1608449220657349, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 3127 + }, + { + "epoch": 0.5325161729656112, + "grad_norm": 1.0857185125350952, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 3128 + }, + { + "epoch": 0.5326864147088867, + "grad_norm": 1.1040613651275635, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3129 + }, + { + "epoch": 0.5328566564521621, + "grad_norm": 1.3238484859466553, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 3130 + }, + { + "epoch": 0.5330268981954375, + "grad_norm": 1.1844618320465088, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 3131 + }, + { + "epoch": 0.533197139938713, + "grad_norm": 1.6046870946884155, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 3132 + }, + { + "epoch": 0.5333673816819884, + "grad_norm": 1.2018859386444092, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 3133 + }, + { + "epoch": 0.5335376234252639, + "grad_norm": 1.0841635465621948, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 3134 + }, + { + "epoch": 0.5337078651685393, + "grad_norm": 1.3309770822525024, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 3135 + }, + { + "epoch": 0.5338781069118148, + "grad_norm": 1.043618083000183, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3136 + }, + { + "epoch": 0.5340483486550902, + "grad_norm": 1.2823837995529175, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 3137 + }, + { + "epoch": 0.5342185903983657, + "grad_norm": 1.3309781551361084, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 3138 + }, + { + "epoch": 0.5343888321416411, + "grad_norm": 1.053794264793396, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 3139 + }, + { + "epoch": 0.5345590738849166, + "grad_norm": 1.136751413345337, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3140 + }, + { + "epoch": 0.534729315628192, + "grad_norm": 1.0801527500152588, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 3141 + }, + { + "epoch": 0.5348995573714674, + "grad_norm": 1.2816129922866821, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 3142 + }, + { + "epoch": 0.535069799114743, + "grad_norm": 1.2644041776657104, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3143 + }, + { + "epoch": 0.5352400408580184, + "grad_norm": 1.4873162508010864, + "learning_rate": 1e-06, + "loss": 0.0158, + "step": 3144 + }, + { + "epoch": 0.5354102826012939, + "grad_norm": 1.1576414108276367, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 3145 + }, + { + "epoch": 0.5355805243445693, + "grad_norm": 1.1078191995620728, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 3146 + }, + { + "epoch": 0.5357507660878448, + "grad_norm": 1.2426562309265137, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3147 + }, + { + "epoch": 0.5359210078311202, + "grad_norm": 1.977630615234375, + "learning_rate": 1e-06, + "loss": 0.0157, + "step": 3148 + }, + { + "epoch": 0.5360912495743957, + "grad_norm": 1.5125465393066406, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 3149 + }, + { + "epoch": 0.5362614913176711, + "grad_norm": 1.4426789283752441, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 3150 + }, + { + "epoch": 0.5364317330609465, + "grad_norm": 1.0922977924346924, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 3151 + }, + { + "epoch": 0.536601974804222, + "grad_norm": 1.118429183959961, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 3152 + }, + { + "epoch": 0.5367722165474974, + "grad_norm": 1.0450609922409058, + "learning_rate": 1e-06, + "loss": 0.009, + "step": 3153 + }, + { + "epoch": 0.5369424582907729, + "grad_norm": 1.1401301622390747, + "learning_rate": 1e-06, + "loss": 0.0093, + "step": 3154 + }, + { + "epoch": 0.5371127000340483, + "grad_norm": 0.9356278777122498, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3155 + }, + { + "epoch": 0.5372829417773238, + "grad_norm": 0.9514819979667664, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 3156 + }, + { + "epoch": 0.5374531835205992, + "grad_norm": 1.2086161375045776, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 3157 + }, + { + "epoch": 0.5376234252638747, + "grad_norm": 1.0937926769256592, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 3158 + }, + { + "epoch": 0.5377936670071501, + "grad_norm": 1.2354090213775635, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 3159 + }, + { + "epoch": 0.5379639087504257, + "grad_norm": 1.7234115600585938, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 3160 + }, + { + "epoch": 0.5381341504937011, + "grad_norm": 1.390219807624817, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 3161 + }, + { + "epoch": 0.5383043922369765, + "grad_norm": 1.2615468502044678, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3162 + }, + { + "epoch": 0.538474633980252, + "grad_norm": 1.4214125871658325, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3163 + }, + { + "epoch": 0.5386448757235274, + "grad_norm": 1.3602176904678345, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 3164 + }, + { + "epoch": 0.5388151174668029, + "grad_norm": 1.4177559614181519, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 3165 + }, + { + "epoch": 0.5389853592100783, + "grad_norm": 0.868809163570404, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3166 + }, + { + "epoch": 0.5391556009533538, + "grad_norm": 1.1446378231048584, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 3167 + }, + { + "epoch": 0.5393258426966292, + "grad_norm": 1.3364777565002441, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 3168 + }, + { + "epoch": 0.5394960844399047, + "grad_norm": 1.2700159549713135, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3169 + }, + { + "epoch": 0.5396663261831801, + "grad_norm": 1.1539145708084106, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3170 + }, + { + "epoch": 0.5398365679264555, + "grad_norm": 1.2744473218917847, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 3171 + }, + { + "epoch": 0.540006809669731, + "grad_norm": 1.206419587135315, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 3172 + }, + { + "epoch": 0.5401770514130064, + "grad_norm": 1.7593352794647217, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 3173 + }, + { + "epoch": 0.5403472931562819, + "grad_norm": 1.1174089908599854, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 3174 + }, + { + "epoch": 0.5405175348995573, + "grad_norm": 1.2210553884506226, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 3175 + }, + { + "epoch": 0.5406877766428329, + "grad_norm": 1.1429721117019653, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3176 + }, + { + "epoch": 0.5408580183861083, + "grad_norm": 1.086243748664856, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 3177 + }, + { + "epoch": 0.5410282601293838, + "grad_norm": 1.1069355010986328, + "learning_rate": 1e-06, + "loss": 0.0148, + "step": 3178 + }, + { + "epoch": 0.5411985018726592, + "grad_norm": 1.1497927904129028, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 3179 + }, + { + "epoch": 0.5413687436159347, + "grad_norm": 1.2102113962173462, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 3180 + }, + { + "epoch": 0.5415389853592101, + "grad_norm": 2.604693651199341, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3181 + }, + { + "epoch": 0.5417092271024855, + "grad_norm": 1.2018678188323975, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 3182 + }, + { + "epoch": 0.541879468845761, + "grad_norm": 1.0666275024414062, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 3183 + }, + { + "epoch": 0.5420497105890364, + "grad_norm": 1.0146986246109009, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 3184 + }, + { + "epoch": 0.5422199523323119, + "grad_norm": 1.2989884614944458, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 3185 + }, + { + "epoch": 0.5423901940755873, + "grad_norm": 1.071435570716858, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 3186 + }, + { + "epoch": 0.5425604358188628, + "grad_norm": 1.090047836303711, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3187 + }, + { + "epoch": 0.5427306775621382, + "grad_norm": 1.1832104921340942, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 3188 + }, + { + "epoch": 0.5429009193054137, + "grad_norm": 1.1112650632858276, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 3189 + }, + { + "epoch": 0.5430711610486891, + "grad_norm": 1.0457887649536133, + "learning_rate": 1e-06, + "loss": 0.0158, + "step": 3190 + }, + { + "epoch": 0.5432414027919646, + "grad_norm": 1.2125804424285889, + "learning_rate": 1e-06, + "loss": 0.0165, + "step": 3191 + }, + { + "epoch": 0.54341164453524, + "grad_norm": 1.0135105848312378, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 3192 + }, + { + "epoch": 0.5435818862785154, + "grad_norm": 1.0347403287887573, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3193 + }, + { + "epoch": 0.543752128021791, + "grad_norm": 1.2791105508804321, + "learning_rate": 1e-06, + "loss": 0.0093, + "step": 3194 + }, + { + "epoch": 0.5439223697650664, + "grad_norm": 1.1392316818237305, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 3195 + }, + { + "epoch": 0.5440926115083419, + "grad_norm": 1.273457646369934, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3196 + }, + { + "epoch": 0.5442628532516173, + "grad_norm": 1.5065120458602905, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 3197 + }, + { + "epoch": 0.5444330949948928, + "grad_norm": 1.0680369138717651, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3198 + }, + { + "epoch": 0.5446033367381682, + "grad_norm": 1.3402657508850098, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 3199 + }, + { + "epoch": 0.5447735784814437, + "grad_norm": 1.0954920053482056, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 3200 + }, + { + "epoch": 0.5449438202247191, + "grad_norm": 1.4809321165084839, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 3201 + }, + { + "epoch": 0.5451140619679945, + "grad_norm": 1.1664936542510986, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3202 + }, + { + "epoch": 0.54528430371127, + "grad_norm": 1.4334688186645508, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 3203 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 1.2824522256851196, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3204 + }, + { + "epoch": 0.5456247871978209, + "grad_norm": 1.2238925695419312, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3205 + }, + { + "epoch": 0.5457950289410963, + "grad_norm": 1.2942602634429932, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 3206 + }, + { + "epoch": 0.5459652706843718, + "grad_norm": 0.9721873998641968, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3207 + }, + { + "epoch": 0.5461355124276472, + "grad_norm": 1.3409883975982666, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 3208 + }, + { + "epoch": 0.5463057541709228, + "grad_norm": 1.283263087272644, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 3209 + }, + { + "epoch": 0.5464759959141982, + "grad_norm": 0.9081169366836548, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 3210 + }, + { + "epoch": 0.5466462376574737, + "grad_norm": 1.0881484746932983, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3211 + }, + { + "epoch": 0.5468164794007491, + "grad_norm": 1.2378934621810913, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3212 + }, + { + "epoch": 0.5469867211440245, + "grad_norm": 1.277190089225769, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 3213 + }, + { + "epoch": 0.5471569628873, + "grad_norm": 1.6451539993286133, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 3214 + }, + { + "epoch": 0.5473272046305754, + "grad_norm": 1.5039424896240234, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 3215 + }, + { + "epoch": 0.5474974463738509, + "grad_norm": 0.7711275219917297, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3216 + }, + { + "epoch": 0.5476676881171263, + "grad_norm": 1.2757947444915771, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 3217 + }, + { + "epoch": 0.5478379298604018, + "grad_norm": 1.2242053747177124, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3218 + }, + { + "epoch": 0.5480081716036772, + "grad_norm": 1.3338735103607178, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 3219 + }, + { + "epoch": 0.5481784133469527, + "grad_norm": 0.9244385361671448, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 3220 + }, + { + "epoch": 0.5483486550902281, + "grad_norm": 1.0538643598556519, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3221 + }, + { + "epoch": 0.5485188968335035, + "grad_norm": 1.0866847038269043, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3222 + }, + { + "epoch": 0.548689138576779, + "grad_norm": 0.8596549034118652, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3223 + }, + { + "epoch": 0.5488593803200544, + "grad_norm": 1.1239334344863892, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3224 + }, + { + "epoch": 0.54902962206333, + "grad_norm": 1.1236169338226318, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3225 + }, + { + "epoch": 0.5491998638066053, + "grad_norm": 1.6141034364700317, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 3226 + }, + { + "epoch": 0.5493701055498809, + "grad_norm": 1.1685870885849, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3227 + }, + { + "epoch": 0.5495403472931563, + "grad_norm": 1.2277820110321045, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 3228 + }, + { + "epoch": 0.5497105890364318, + "grad_norm": 1.1030385494232178, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 3229 + }, + { + "epoch": 0.5498808307797072, + "grad_norm": 1.246597409248352, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3230 + }, + { + "epoch": 0.5500510725229827, + "grad_norm": 1.3571773767471313, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3231 + }, + { + "epoch": 0.5502213142662581, + "grad_norm": 1.283843755722046, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 3232 + }, + { + "epoch": 0.5503915560095335, + "grad_norm": 1.2857335805892944, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 3233 + }, + { + "epoch": 0.550561797752809, + "grad_norm": 1.0135763883590698, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3234 + }, + { + "epoch": 0.5507320394960844, + "grad_norm": 1.3282479047775269, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 3235 + }, + { + "epoch": 0.5509022812393599, + "grad_norm": 1.251345157623291, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 3236 + }, + { + "epoch": 0.5510725229826353, + "grad_norm": 1.0486385822296143, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 3237 + }, + { + "epoch": 0.5512427647259108, + "grad_norm": 0.894183874130249, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3238 + }, + { + "epoch": 0.5514130064691862, + "grad_norm": 1.545824646949768, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 3239 + }, + { + "epoch": 0.5515832482124617, + "grad_norm": 1.3999972343444824, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 3240 + }, + { + "epoch": 0.5517534899557371, + "grad_norm": 1.1850388050079346, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 3241 + }, + { + "epoch": 0.5519237316990125, + "grad_norm": 1.2595492601394653, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 3242 + }, + { + "epoch": 0.552093973442288, + "grad_norm": 1.0089160203933716, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3243 + }, + { + "epoch": 0.5522642151855635, + "grad_norm": 2.293635845184326, + "learning_rate": 1e-06, + "loss": 0.0285, + "step": 3244 + }, + { + "epoch": 0.552434456928839, + "grad_norm": 1.250726342201233, + "learning_rate": 1e-06, + "loss": 0.0168, + "step": 3245 + }, + { + "epoch": 0.5526046986721144, + "grad_norm": 1.1201728582382202, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3246 + }, + { + "epoch": 0.5527749404153899, + "grad_norm": 1.013512134552002, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3247 + }, + { + "epoch": 0.5529451821586653, + "grad_norm": 1.111093282699585, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 3248 + }, + { + "epoch": 0.5531154239019408, + "grad_norm": 0.9368535876274109, + "learning_rate": 1e-06, + "loss": 0.0149, + "step": 3249 + }, + { + "epoch": 0.5532856656452162, + "grad_norm": 1.0745536088943481, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 3250 + }, + { + "epoch": 0.5534559073884917, + "grad_norm": 1.2841395139694214, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3251 + }, + { + "epoch": 0.5536261491317671, + "grad_norm": 1.3827086687088013, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3252 + }, + { + "epoch": 0.5537963908750425, + "grad_norm": 1.475622534751892, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 3253 + }, + { + "epoch": 0.553966632618318, + "grad_norm": 0.9399949312210083, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 3254 + }, + { + "epoch": 0.5541368743615934, + "grad_norm": 1.1346850395202637, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3255 + }, + { + "epoch": 0.5543071161048689, + "grad_norm": 1.2634351253509521, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 3256 + }, + { + "epoch": 0.5544773578481443, + "grad_norm": 0.9699029326438904, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 3257 + }, + { + "epoch": 0.5546475995914198, + "grad_norm": 1.6691503524780273, + "learning_rate": 1e-06, + "loss": 0.0188, + "step": 3258 + }, + { + "epoch": 0.5548178413346952, + "grad_norm": 1.39431893825531, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 3259 + }, + { + "epoch": 0.5549880830779708, + "grad_norm": 1.5263134241104126, + "learning_rate": 1e-06, + "loss": 0.0186, + "step": 3260 + }, + { + "epoch": 0.5551583248212462, + "grad_norm": 1.1584216356277466, + "learning_rate": 1e-06, + "loss": 0.0164, + "step": 3261 + }, + { + "epoch": 0.5553285665645217, + "grad_norm": 1.0202586650848389, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 3262 + }, + { + "epoch": 0.5554988083077971, + "grad_norm": 1.393701434135437, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 3263 + }, + { + "epoch": 0.5556690500510725, + "grad_norm": 1.0818877220153809, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3264 + }, + { + "epoch": 0.555839291794348, + "grad_norm": 1.3524203300476074, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3265 + }, + { + "epoch": 0.5560095335376234, + "grad_norm": 1.1028423309326172, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3266 + }, + { + "epoch": 0.5561797752808989, + "grad_norm": 1.4970299005508423, + "learning_rate": 1e-06, + "loss": 0.0173, + "step": 3267 + }, + { + "epoch": 0.5563500170241743, + "grad_norm": 1.0553059577941895, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 3268 + }, + { + "epoch": 0.5565202587674498, + "grad_norm": 1.0315494537353516, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3269 + }, + { + "epoch": 0.5566905005107252, + "grad_norm": 1.201059341430664, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 3270 + }, + { + "epoch": 0.5568607422540007, + "grad_norm": 1.472936987876892, + "learning_rate": 1e-06, + "loss": 0.0179, + "step": 3271 + }, + { + "epoch": 0.5570309839972761, + "grad_norm": 1.2015950679779053, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3272 + }, + { + "epoch": 0.5572012257405515, + "grad_norm": 1.715442180633545, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 3273 + }, + { + "epoch": 0.557371467483827, + "grad_norm": 0.8229348659515381, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 3274 + }, + { + "epoch": 0.5575417092271024, + "grad_norm": 1.4160443544387817, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 3275 + }, + { + "epoch": 0.557711950970378, + "grad_norm": 1.3626257181167603, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 3276 + }, + { + "epoch": 0.5578821927136534, + "grad_norm": 0.9082250595092773, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 3277 + }, + { + "epoch": 0.5580524344569289, + "grad_norm": 1.0986019372940063, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 3278 + }, + { + "epoch": 0.5582226762002043, + "grad_norm": 1.828899621963501, + "learning_rate": 1e-06, + "loss": 0.0307, + "step": 3279 + }, + { + "epoch": 0.5583929179434798, + "grad_norm": 0.9443734288215637, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3280 + }, + { + "epoch": 0.5585631596867552, + "grad_norm": 0.9179772138595581, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 3281 + }, + { + "epoch": 0.5587334014300307, + "grad_norm": 1.0771028995513916, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 3282 + }, + { + "epoch": 0.5589036431733061, + "grad_norm": 1.1734918355941772, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3283 + }, + { + "epoch": 0.5590738849165815, + "grad_norm": 1.24713134765625, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 3284 + }, + { + "epoch": 0.559244126659857, + "grad_norm": 1.0508451461791992, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3285 + }, + { + "epoch": 0.5594143684031324, + "grad_norm": 1.0896153450012207, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 3286 + }, + { + "epoch": 0.5595846101464079, + "grad_norm": 1.1460729837417603, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 3287 + }, + { + "epoch": 0.5597548518896833, + "grad_norm": 1.5633426904678345, + "learning_rate": 1e-06, + "loss": 0.0148, + "step": 3288 + }, + { + "epoch": 0.5599250936329588, + "grad_norm": 1.5234661102294922, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 3289 + }, + { + "epoch": 0.5600953353762342, + "grad_norm": 0.8329904079437256, + "learning_rate": 1e-06, + "loss": 0.0059, + "step": 3290 + }, + { + "epoch": 0.5602655771195098, + "grad_norm": 1.244648814201355, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 3291 + }, + { + "epoch": 0.5604358188627852, + "grad_norm": 1.2756433486938477, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 3292 + }, + { + "epoch": 0.5606060606060606, + "grad_norm": 0.9432902932167053, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3293 + }, + { + "epoch": 0.5607763023493361, + "grad_norm": 1.3495780229568481, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 3294 + }, + { + "epoch": 0.5609465440926115, + "grad_norm": 1.3346593379974365, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 3295 + }, + { + "epoch": 0.561116785835887, + "grad_norm": 1.12068510055542, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3296 + }, + { + "epoch": 0.5612870275791624, + "grad_norm": 1.0519905090332031, + "learning_rate": 1e-06, + "loss": 0.009, + "step": 3297 + }, + { + "epoch": 0.5614572693224379, + "grad_norm": 1.1385189294815063, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 3298 + }, + { + "epoch": 0.5616275110657133, + "grad_norm": 1.3988077640533447, + "learning_rate": 1e-06, + "loss": 0.0183, + "step": 3299 + }, + { + "epoch": 0.5617977528089888, + "grad_norm": 1.1090185642242432, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 3300 + }, + { + "epoch": 0.5619679945522642, + "grad_norm": 1.2562520503997803, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 3301 + }, + { + "epoch": 0.5621382362955397, + "grad_norm": 1.1855425834655762, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 3302 + }, + { + "epoch": 0.5623084780388151, + "grad_norm": 1.492267370223999, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 3303 + }, + { + "epoch": 0.5624787197820905, + "grad_norm": 1.1446338891983032, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 3304 + }, + { + "epoch": 0.562648961525366, + "grad_norm": 0.9587407112121582, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3305 + }, + { + "epoch": 0.5628192032686414, + "grad_norm": 1.0618356466293335, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3306 + }, + { + "epoch": 0.562989445011917, + "grad_norm": 1.6512682437896729, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 3307 + }, + { + "epoch": 0.5631596867551923, + "grad_norm": 1.2739150524139404, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 3308 + }, + { + "epoch": 0.5633299284984679, + "grad_norm": 1.4853968620300293, + "learning_rate": 1e-06, + "loss": 0.0197, + "step": 3309 + }, + { + "epoch": 0.5635001702417433, + "grad_norm": 1.269023060798645, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3310 + }, + { + "epoch": 0.5636704119850188, + "grad_norm": 1.162980556488037, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3311 + }, + { + "epoch": 0.5638406537282942, + "grad_norm": 2.1068434715270996, + "learning_rate": 1e-06, + "loss": 0.0151, + "step": 3312 + }, + { + "epoch": 0.5640108954715697, + "grad_norm": 1.2134860754013062, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 3313 + }, + { + "epoch": 0.5641811372148451, + "grad_norm": 1.2850688695907593, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 3314 + }, + { + "epoch": 0.5643513789581205, + "grad_norm": 1.0942927598953247, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3315 + }, + { + "epoch": 0.564521620701396, + "grad_norm": 1.20088791847229, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3316 + }, + { + "epoch": 0.5646918624446714, + "grad_norm": 1.0646437406539917, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3317 + }, + { + "epoch": 0.5648621041879469, + "grad_norm": 0.8980295062065125, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3318 + }, + { + "epoch": 0.5650323459312223, + "grad_norm": 1.4501545429229736, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3319 + }, + { + "epoch": 0.5652025876744978, + "grad_norm": 1.4403294324874878, + "learning_rate": 1e-06, + "loss": 0.0204, + "step": 3320 + }, + { + "epoch": 0.5653728294177732, + "grad_norm": 1.6485029458999634, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 3321 + }, + { + "epoch": 0.5655430711610487, + "grad_norm": 1.0820796489715576, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3322 + }, + { + "epoch": 0.5657133129043241, + "grad_norm": 1.0240590572357178, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 3323 + }, + { + "epoch": 0.5658835546475995, + "grad_norm": 1.1677789688110352, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3324 + }, + { + "epoch": 0.566053796390875, + "grad_norm": 1.3193532228469849, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 3325 + }, + { + "epoch": 0.5662240381341505, + "grad_norm": 0.9879570007324219, + "learning_rate": 1e-06, + "loss": 0.009, + "step": 3326 + }, + { + "epoch": 0.566394279877426, + "grad_norm": 1.6975972652435303, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 3327 + }, + { + "epoch": 0.5665645216207014, + "grad_norm": 1.4392386674880981, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 3328 + }, + { + "epoch": 0.5667347633639769, + "grad_norm": 1.341951847076416, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 3329 + }, + { + "epoch": 0.5669050051072523, + "grad_norm": 1.0540547370910645, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3330 + }, + { + "epoch": 0.5670752468505278, + "grad_norm": 1.0311988592147827, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 3331 + }, + { + "epoch": 0.5672454885938032, + "grad_norm": 1.1205745935440063, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3332 + }, + { + "epoch": 0.5674157303370787, + "grad_norm": 0.9916560649871826, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 3333 + }, + { + "epoch": 0.5675859720803541, + "grad_norm": 1.0641469955444336, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 3334 + }, + { + "epoch": 0.5677562138236295, + "grad_norm": 1.4012588262557983, + "learning_rate": 1e-06, + "loss": 0.0185, + "step": 3335 + }, + { + "epoch": 0.567926455566905, + "grad_norm": 1.0425010919570923, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3336 + }, + { + "epoch": 0.5680966973101804, + "grad_norm": 1.2166441679000854, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 3337 + }, + { + "epoch": 0.5682669390534559, + "grad_norm": 1.4279252290725708, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 3338 + }, + { + "epoch": 0.5684371807967313, + "grad_norm": 1.7417263984680176, + "learning_rate": 1e-06, + "loss": 0.0167, + "step": 3339 + }, + { + "epoch": 0.5686074225400068, + "grad_norm": 1.190510869026184, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 3340 + }, + { + "epoch": 0.5687776642832822, + "grad_norm": 1.2967720031738281, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3341 + }, + { + "epoch": 0.5689479060265578, + "grad_norm": 1.2348685264587402, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 3342 + }, + { + "epoch": 0.5691181477698332, + "grad_norm": 1.24281907081604, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 3343 + }, + { + "epoch": 0.5692883895131086, + "grad_norm": 1.3598650693893433, + "learning_rate": 1e-06, + "loss": 0.0173, + "step": 3344 + }, + { + "epoch": 0.5694586312563841, + "grad_norm": 1.173242211341858, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3345 + }, + { + "epoch": 0.5696288729996595, + "grad_norm": 1.2969194650650024, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 3346 + }, + { + "epoch": 0.569799114742935, + "grad_norm": 0.953393280506134, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3347 + }, + { + "epoch": 0.5699693564862104, + "grad_norm": 1.2837368249893188, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3348 + }, + { + "epoch": 0.5701395982294859, + "grad_norm": 0.9645003080368042, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3349 + }, + { + "epoch": 0.5703098399727613, + "grad_norm": 0.9485358595848083, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3350 + }, + { + "epoch": 0.5704800817160368, + "grad_norm": 1.000497817993164, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3351 + }, + { + "epoch": 0.5706503234593122, + "grad_norm": 1.262580394744873, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3352 + }, + { + "epoch": 0.5708205652025877, + "grad_norm": 1.2770668268203735, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 3353 + }, + { + "epoch": 0.5709908069458631, + "grad_norm": 0.9978349804878235, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3354 + }, + { + "epoch": 0.5711610486891385, + "grad_norm": 1.1348481178283691, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 3355 + }, + { + "epoch": 0.571331290432414, + "grad_norm": 1.1138725280761719, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3356 + }, + { + "epoch": 0.5715015321756894, + "grad_norm": 1.243876576423645, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3357 + }, + { + "epoch": 0.571671773918965, + "grad_norm": 2.1010210514068604, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 3358 + }, + { + "epoch": 0.5718420156622404, + "grad_norm": 1.5689268112182617, + "learning_rate": 1e-06, + "loss": 0.0199, + "step": 3359 + }, + { + "epoch": 0.5720122574055159, + "grad_norm": 1.4927451610565186, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 3360 + }, + { + "epoch": 0.5721824991487913, + "grad_norm": 1.2088425159454346, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 3361 + }, + { + "epoch": 0.5723527408920668, + "grad_norm": 3.114051580429077, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 3362 + }, + { + "epoch": 0.5725229826353422, + "grad_norm": 0.9580630660057068, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3363 + }, + { + "epoch": 0.5726932243786176, + "grad_norm": 0.9830418229103088, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 3364 + }, + { + "epoch": 0.5728634661218931, + "grad_norm": 1.2389250993728638, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 3365 + }, + { + "epoch": 0.5730337078651685, + "grad_norm": 1.159663438796997, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 3366 + }, + { + "epoch": 0.573203949608444, + "grad_norm": 1.5765507221221924, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 3367 + }, + { + "epoch": 0.5733741913517194, + "grad_norm": 1.1403703689575195, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3368 + }, + { + "epoch": 0.5735444330949949, + "grad_norm": 1.376774787902832, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 3369 + }, + { + "epoch": 0.5737146748382703, + "grad_norm": 1.386721134185791, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 3370 + }, + { + "epoch": 0.5738849165815458, + "grad_norm": 1.6140387058258057, + "learning_rate": 1e-06, + "loss": 0.014, + "step": 3371 + }, + { + "epoch": 0.5740551583248212, + "grad_norm": 0.9297884702682495, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 3372 + }, + { + "epoch": 0.5742254000680967, + "grad_norm": 1.196844220161438, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3373 + }, + { + "epoch": 0.5743956418113721, + "grad_norm": 0.999453604221344, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 3374 + }, + { + "epoch": 0.5745658835546475, + "grad_norm": 1.378943920135498, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 3375 + }, + { + "epoch": 0.5747361252979231, + "grad_norm": 1.1324682235717773, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3376 + }, + { + "epoch": 0.5749063670411985, + "grad_norm": 1.0931276082992554, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3377 + }, + { + "epoch": 0.575076608784474, + "grad_norm": 1.0368367433547974, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 3378 + }, + { + "epoch": 0.5752468505277494, + "grad_norm": 1.0412521362304688, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 3379 + }, + { + "epoch": 0.5754170922710249, + "grad_norm": 1.5202747583389282, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 3380 + }, + { + "epoch": 0.5755873340143003, + "grad_norm": 0.9431686401367188, + "learning_rate": 1e-06, + "loss": 0.0062, + "step": 3381 + }, + { + "epoch": 0.5757575757575758, + "grad_norm": 1.391595482826233, + "learning_rate": 1e-06, + "loss": 0.0187, + "step": 3382 + }, + { + "epoch": 0.5759278175008512, + "grad_norm": 1.1645992994308472, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3383 + }, + { + "epoch": 0.5760980592441267, + "grad_norm": 1.1957941055297852, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 3384 + }, + { + "epoch": 0.5762683009874021, + "grad_norm": 0.9163592457771301, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3385 + }, + { + "epoch": 0.5764385427306775, + "grad_norm": 1.4192230701446533, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 3386 + }, + { + "epoch": 0.576608784473953, + "grad_norm": 1.128556728363037, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3387 + }, + { + "epoch": 0.5767790262172284, + "grad_norm": 1.0899877548217773, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 3388 + }, + { + "epoch": 0.5769492679605039, + "grad_norm": 1.5892912149429321, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 3389 + }, + { + "epoch": 0.5771195097037793, + "grad_norm": 0.9154031276702881, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3390 + }, + { + "epoch": 0.5772897514470549, + "grad_norm": 1.380571722984314, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 3391 + }, + { + "epoch": 0.5774599931903303, + "grad_norm": 1.0964083671569824, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 3392 + }, + { + "epoch": 0.5776302349336058, + "grad_norm": 1.2165638208389282, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3393 + }, + { + "epoch": 0.5778004766768812, + "grad_norm": 1.234350323677063, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3394 + }, + { + "epoch": 0.5779707184201566, + "grad_norm": 1.2220001220703125, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 3395 + }, + { + "epoch": 0.5781409601634321, + "grad_norm": 1.1645594835281372, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3396 + }, + { + "epoch": 0.5783112019067075, + "grad_norm": 1.3842546939849854, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 3397 + }, + { + "epoch": 0.578481443649983, + "grad_norm": 0.8232072591781616, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3398 + }, + { + "epoch": 0.5786516853932584, + "grad_norm": 1.1387802362442017, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3399 + }, + { + "epoch": 0.5788219271365339, + "grad_norm": 1.1812922954559326, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 3400 + }, + { + "epoch": 0.5789921688798093, + "grad_norm": 1.0787347555160522, + "learning_rate": 1e-06, + "loss": 0.0167, + "step": 3401 + }, + { + "epoch": 0.5791624106230848, + "grad_norm": 1.2608331441879272, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 3402 + }, + { + "epoch": 0.5793326523663602, + "grad_norm": 1.1550672054290771, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3403 + }, + { + "epoch": 0.5795028941096357, + "grad_norm": 1.1368712186813354, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 3404 + }, + { + "epoch": 0.5796731358529111, + "grad_norm": 1.4843052625656128, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3405 + }, + { + "epoch": 0.5798433775961865, + "grad_norm": 1.0383031368255615, + "learning_rate": 1e-06, + "loss": 0.0069, + "step": 3406 + }, + { + "epoch": 0.580013619339462, + "grad_norm": 1.9053559303283691, + "learning_rate": 1e-06, + "loss": 0.0148, + "step": 3407 + }, + { + "epoch": 0.5801838610827375, + "grad_norm": 1.11162531375885, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 3408 + }, + { + "epoch": 0.580354102826013, + "grad_norm": 1.5268166065216064, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 3409 + }, + { + "epoch": 0.5805243445692884, + "grad_norm": 0.8884620070457458, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3410 + }, + { + "epoch": 0.5806945863125639, + "grad_norm": 1.1013774871826172, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3411 + }, + { + "epoch": 0.5808648280558393, + "grad_norm": 0.822043776512146, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3412 + }, + { + "epoch": 0.5810350697991148, + "grad_norm": 0.9275667667388916, + "learning_rate": 1e-06, + "loss": 0.0093, + "step": 3413 + }, + { + "epoch": 0.5812053115423902, + "grad_norm": 1.002583384513855, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 3414 + }, + { + "epoch": 0.5813755532856656, + "grad_norm": 0.8230078816413879, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3415 + }, + { + "epoch": 0.5815457950289411, + "grad_norm": 1.2379169464111328, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3416 + }, + { + "epoch": 0.5817160367722165, + "grad_norm": 0.9411685466766357, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 3417 + }, + { + "epoch": 0.581886278515492, + "grad_norm": 1.2261695861816406, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 3418 + }, + { + "epoch": 0.5820565202587674, + "grad_norm": 1.1066349744796753, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 3419 + }, + { + "epoch": 0.5822267620020429, + "grad_norm": 1.058130145072937, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 3420 + }, + { + "epoch": 0.5823970037453183, + "grad_norm": 2.5851633548736572, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3421 + }, + { + "epoch": 0.5825672454885938, + "grad_norm": 1.2730610370635986, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3422 + }, + { + "epoch": 0.5827374872318692, + "grad_norm": 1.3514959812164307, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3423 + }, + { + "epoch": 0.5829077289751448, + "grad_norm": 1.4177110195159912, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 3424 + }, + { + "epoch": 0.5830779707184202, + "grad_norm": 1.2021658420562744, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 3425 + }, + { + "epoch": 0.5832482124616956, + "grad_norm": 1.4584933519363403, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3426 + }, + { + "epoch": 0.5834184542049711, + "grad_norm": 1.345293402671814, + "learning_rate": 1e-06, + "loss": 0.0128, + "step": 3427 + }, + { + "epoch": 0.5835886959482465, + "grad_norm": 1.2940164804458618, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 3428 + }, + { + "epoch": 0.583758937691522, + "grad_norm": 1.1534115076065063, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3429 + }, + { + "epoch": 0.5839291794347974, + "grad_norm": 0.9437724947929382, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3430 + }, + { + "epoch": 0.5840994211780729, + "grad_norm": 1.2992208003997803, + "learning_rate": 1e-06, + "loss": 0.0164, + "step": 3431 + }, + { + "epoch": 0.5842696629213483, + "grad_norm": 0.95462566614151, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3432 + }, + { + "epoch": 0.5844399046646238, + "grad_norm": 1.0334690809249878, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3433 + }, + { + "epoch": 0.5846101464078992, + "grad_norm": 1.707252025604248, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 3434 + }, + { + "epoch": 0.5847803881511746, + "grad_norm": 0.9804726839065552, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3435 + }, + { + "epoch": 0.5849506298944501, + "grad_norm": 1.5533514022827148, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 3436 + }, + { + "epoch": 0.5851208716377255, + "grad_norm": 1.154433012008667, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 3437 + }, + { + "epoch": 0.585291113381001, + "grad_norm": 1.2386829853057861, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 3438 + }, + { + "epoch": 0.5854613551242764, + "grad_norm": 0.9458797574043274, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3439 + }, + { + "epoch": 0.585631596867552, + "grad_norm": 1.1015353202819824, + "learning_rate": 1e-06, + "loss": 0.0093, + "step": 3440 + }, + { + "epoch": 0.5858018386108274, + "grad_norm": 1.14461350440979, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 3441 + }, + { + "epoch": 0.5859720803541029, + "grad_norm": 1.4417644739151, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 3442 + }, + { + "epoch": 0.5861423220973783, + "grad_norm": 1.3879674673080444, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 3443 + }, + { + "epoch": 0.5863125638406538, + "grad_norm": 1.0989384651184082, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3444 + }, + { + "epoch": 0.5864828055839292, + "grad_norm": 1.2886449098587036, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 3445 + }, + { + "epoch": 0.5866530473272046, + "grad_norm": 0.9253196716308594, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3446 + }, + { + "epoch": 0.5868232890704801, + "grad_norm": 1.2055374383926392, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 3447 + }, + { + "epoch": 0.5869935308137555, + "grad_norm": 0.9988220930099487, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3448 + }, + { + "epoch": 0.587163772557031, + "grad_norm": 0.9941924214363098, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 3449 + }, + { + "epoch": 0.5873340143003064, + "grad_norm": 1.3294003009796143, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 3450 + }, + { + "epoch": 0.5875042560435819, + "grad_norm": 1.015142560005188, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 3451 + }, + { + "epoch": 0.5876744977868573, + "grad_norm": 1.1107553243637085, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3452 + }, + { + "epoch": 0.5878447395301328, + "grad_norm": 1.1980547904968262, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3453 + }, + { + "epoch": 0.5880149812734082, + "grad_norm": 1.1775740385055542, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 3454 + }, + { + "epoch": 0.5881852230166837, + "grad_norm": 1.5608429908752441, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 3455 + }, + { + "epoch": 0.5883554647599591, + "grad_norm": 1.3156591653823853, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 3456 + }, + { + "epoch": 0.5885257065032345, + "grad_norm": 1.0954924821853638, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3457 + }, + { + "epoch": 0.5886959482465101, + "grad_norm": 0.908723771572113, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 3458 + }, + { + "epoch": 0.5888661899897855, + "grad_norm": 0.9518954753875732, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 3459 + }, + { + "epoch": 0.589036431733061, + "grad_norm": 1.0838017463684082, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 3460 + }, + { + "epoch": 0.5892066734763364, + "grad_norm": 1.235297441482544, + "learning_rate": 1e-06, + "loss": 0.0149, + "step": 3461 + }, + { + "epoch": 0.5893769152196119, + "grad_norm": 1.1786282062530518, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3462 + }, + { + "epoch": 0.5895471569628873, + "grad_norm": 1.1594127416610718, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3463 + }, + { + "epoch": 0.5897173987061628, + "grad_norm": 0.8134921193122864, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 3464 + }, + { + "epoch": 0.5898876404494382, + "grad_norm": 1.253074288368225, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 3465 + }, + { + "epoch": 0.5900578821927136, + "grad_norm": 0.9157419800758362, + "learning_rate": 1e-06, + "loss": 0.0062, + "step": 3466 + }, + { + "epoch": 0.5902281239359891, + "grad_norm": 0.8979124426841736, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3467 + }, + { + "epoch": 0.5903983656792645, + "grad_norm": 0.9070771336555481, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3468 + }, + { + "epoch": 0.59056860742254, + "grad_norm": 1.6368584632873535, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 3469 + }, + { + "epoch": 0.5907388491658154, + "grad_norm": 1.0019363164901733, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3470 + }, + { + "epoch": 0.5909090909090909, + "grad_norm": 1.3202464580535889, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 3471 + }, + { + "epoch": 0.5910793326523663, + "grad_norm": 1.0647854804992676, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3472 + }, + { + "epoch": 0.5912495743956419, + "grad_norm": 1.138110637664795, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 3473 + }, + { + "epoch": 0.5914198161389173, + "grad_norm": 0.9572812914848328, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 3474 + }, + { + "epoch": 0.5915900578821928, + "grad_norm": 1.1067378520965576, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3475 + }, + { + "epoch": 0.5917602996254682, + "grad_norm": 1.1624847650527954, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 3476 + }, + { + "epoch": 0.5919305413687436, + "grad_norm": 1.1068007946014404, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 3477 + }, + { + "epoch": 0.5921007831120191, + "grad_norm": 0.9079290628433228, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3478 + }, + { + "epoch": 0.5922710248552945, + "grad_norm": 1.3529939651489258, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 3479 + }, + { + "epoch": 0.59244126659857, + "grad_norm": 0.914438009262085, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 3480 + }, + { + "epoch": 0.5926115083418454, + "grad_norm": 0.9735174775123596, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3481 + }, + { + "epoch": 0.5927817500851209, + "grad_norm": 1.064338207244873, + "learning_rate": 1e-06, + "loss": 0.0061, + "step": 3482 + }, + { + "epoch": 0.5929519918283963, + "grad_norm": 1.145073413848877, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3483 + }, + { + "epoch": 0.5931222335716718, + "grad_norm": 0.9396929144859314, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3484 + }, + { + "epoch": 0.5932924753149472, + "grad_norm": 0.8736780881881714, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 3485 + }, + { + "epoch": 0.5934627170582226, + "grad_norm": 1.157373070716858, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3486 + }, + { + "epoch": 0.5936329588014981, + "grad_norm": 1.1017827987670898, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3487 + }, + { + "epoch": 0.5938032005447735, + "grad_norm": 1.0480536222457886, + "learning_rate": 1e-06, + "loss": 0.0093, + "step": 3488 + }, + { + "epoch": 0.593973442288049, + "grad_norm": 0.9790979027748108, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3489 + }, + { + "epoch": 0.5941436840313244, + "grad_norm": 0.9106320142745972, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 3490 + }, + { + "epoch": 0.5943139257746, + "grad_norm": 0.946974515914917, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3491 + }, + { + "epoch": 0.5944841675178754, + "grad_norm": 1.0634249448776245, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3492 + }, + { + "epoch": 0.5946544092611509, + "grad_norm": 1.1991939544677734, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 3493 + }, + { + "epoch": 0.5948246510044263, + "grad_norm": 0.9980155229568481, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3494 + }, + { + "epoch": 0.5949948927477018, + "grad_norm": 1.0674749612808228, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3495 + }, + { + "epoch": 0.5951651344909772, + "grad_norm": 0.9656354784965515, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 3496 + }, + { + "epoch": 0.5953353762342526, + "grad_norm": 1.1175800561904907, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 3497 + }, + { + "epoch": 0.5955056179775281, + "grad_norm": 0.8521952629089355, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3498 + }, + { + "epoch": 0.5956758597208035, + "grad_norm": 1.266019344329834, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3499 + }, + { + "epoch": 0.595846101464079, + "grad_norm": 1.155662178993225, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3500 + }, + { + "epoch": 0.5960163432073544, + "grad_norm": 1.200804352760315, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 3501 + }, + { + "epoch": 0.5961865849506299, + "grad_norm": 1.0332434177398682, + "learning_rate": 1e-06, + "loss": 0.0093, + "step": 3502 + }, + { + "epoch": 0.5963568266939053, + "grad_norm": 1.114933729171753, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3503 + }, + { + "epoch": 0.5965270684371808, + "grad_norm": 1.211915135383606, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 3504 + }, + { + "epoch": 0.5966973101804562, + "grad_norm": 1.0030415058135986, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3505 + }, + { + "epoch": 0.5968675519237318, + "grad_norm": 1.073610782623291, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3506 + }, + { + "epoch": 0.5970377936670072, + "grad_norm": 0.9142771363258362, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 3507 + }, + { + "epoch": 0.5972080354102826, + "grad_norm": 1.1004252433776855, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3508 + }, + { + "epoch": 0.5973782771535581, + "grad_norm": 1.1081154346466064, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3509 + }, + { + "epoch": 0.5975485188968335, + "grad_norm": 1.1693594455718994, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 3510 + }, + { + "epoch": 0.597718760640109, + "grad_norm": 1.2381548881530762, + "learning_rate": 1e-06, + "loss": 0.009, + "step": 3511 + }, + { + "epoch": 0.5978890023833844, + "grad_norm": 1.1379530429840088, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 3512 + }, + { + "epoch": 0.5980592441266599, + "grad_norm": 1.3537330627441406, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 3513 + }, + { + "epoch": 0.5982294858699353, + "grad_norm": 0.8830147981643677, + "learning_rate": 1e-06, + "loss": 0.009, + "step": 3514 + }, + { + "epoch": 0.5983997276132108, + "grad_norm": 1.09042489528656, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3515 + }, + { + "epoch": 0.5985699693564862, + "grad_norm": 1.1855099201202393, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3516 + }, + { + "epoch": 0.5987402110997616, + "grad_norm": 0.7881447672843933, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3517 + }, + { + "epoch": 0.5989104528430371, + "grad_norm": 0.8492823243141174, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3518 + }, + { + "epoch": 0.5990806945863125, + "grad_norm": 1.2348202466964722, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3519 + }, + { + "epoch": 0.599250936329588, + "grad_norm": 1.514280080795288, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 3520 + }, + { + "epoch": 0.5994211780728634, + "grad_norm": 1.5777965784072876, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 3521 + }, + { + "epoch": 0.599591419816139, + "grad_norm": 1.3330321311950684, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3522 + }, + { + "epoch": 0.5997616615594143, + "grad_norm": 1.630621314048767, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 3523 + }, + { + "epoch": 0.5999319033026899, + "grad_norm": 1.2280832529067993, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 3524 + }, + { + "epoch": 0.6001021450459653, + "grad_norm": 1.187753677368164, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3525 + }, + { + "epoch": 0.6002723867892408, + "grad_norm": 1.6510494947433472, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3526 + }, + { + "epoch": 0.6004426285325162, + "grad_norm": 1.1509414911270142, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 3527 + }, + { + "epoch": 0.6006128702757916, + "grad_norm": 1.300317406654358, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3528 + }, + { + "epoch": 0.6007831120190671, + "grad_norm": 1.013220191001892, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 3529 + }, + { + "epoch": 0.6009533537623425, + "grad_norm": 0.9694526195526123, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3530 + }, + { + "epoch": 0.601123595505618, + "grad_norm": 1.1134835481643677, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 3531 + }, + { + "epoch": 0.6012938372488934, + "grad_norm": 1.0825552940368652, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3532 + }, + { + "epoch": 0.6014640789921689, + "grad_norm": 1.277435064315796, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 3533 + }, + { + "epoch": 0.6016343207354443, + "grad_norm": 1.1507532596588135, + "learning_rate": 1e-06, + "loss": 0.009, + "step": 3534 + }, + { + "epoch": 0.6018045624787198, + "grad_norm": 1.923765778541565, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 3535 + }, + { + "epoch": 0.6019748042219952, + "grad_norm": 1.156172275543213, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 3536 + }, + { + "epoch": 0.6021450459652706, + "grad_norm": 0.9838459491729736, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 3537 + }, + { + "epoch": 0.6023152877085461, + "grad_norm": 1.3179664611816406, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 3538 + }, + { + "epoch": 0.6024855294518215, + "grad_norm": 0.9081220626831055, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3539 + }, + { + "epoch": 0.6026557711950971, + "grad_norm": 1.1752504110336304, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 3540 + }, + { + "epoch": 0.6028260129383725, + "grad_norm": 0.9486705660820007, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 3541 + }, + { + "epoch": 0.602996254681648, + "grad_norm": 0.8370993733406067, + "learning_rate": 1e-06, + "loss": 0.0059, + "step": 3542 + }, + { + "epoch": 0.6031664964249234, + "grad_norm": 0.8764846920967102, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3543 + }, + { + "epoch": 0.6033367381681989, + "grad_norm": 1.19331955909729, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 3544 + }, + { + "epoch": 0.6035069799114743, + "grad_norm": 0.8320559859275818, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3545 + }, + { + "epoch": 0.6036772216547498, + "grad_norm": 1.54590904712677, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 3546 + }, + { + "epoch": 0.6038474633980252, + "grad_norm": 0.878666877746582, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3547 + }, + { + "epoch": 0.6040177051413006, + "grad_norm": 1.5745292901992798, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 3548 + }, + { + "epoch": 0.6041879468845761, + "grad_norm": 0.9554889798164368, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 3549 + }, + { + "epoch": 0.6043581886278515, + "grad_norm": 1.1246119737625122, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3550 + }, + { + "epoch": 0.604528430371127, + "grad_norm": 1.0322120189666748, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 3551 + }, + { + "epoch": 0.6046986721144024, + "grad_norm": 0.9092923998832703, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 3552 + }, + { + "epoch": 0.6048689138576779, + "grad_norm": 1.2549725770950317, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 3553 + }, + { + "epoch": 0.6050391556009533, + "grad_norm": 1.0988869667053223, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 3554 + }, + { + "epoch": 0.6052093973442288, + "grad_norm": 1.0146656036376953, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3555 + }, + { + "epoch": 0.6053796390875043, + "grad_norm": 1.2625722885131836, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3556 + }, + { + "epoch": 0.6055498808307797, + "grad_norm": 1.0923871994018555, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3557 + }, + { + "epoch": 0.6057201225740552, + "grad_norm": 0.8863467574119568, + "learning_rate": 1e-06, + "loss": 0.006, + "step": 3558 + }, + { + "epoch": 0.6058903643173306, + "grad_norm": 1.0210617780685425, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3559 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 2.0322105884552, + "learning_rate": 1e-06, + "loss": 0.02, + "step": 3560 + }, + { + "epoch": 0.6062308478038815, + "grad_norm": 1.0655431747436523, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3561 + }, + { + "epoch": 0.606401089547157, + "grad_norm": 1.0927060842514038, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3562 + }, + { + "epoch": 0.6065713312904324, + "grad_norm": 1.0595030784606934, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3563 + }, + { + "epoch": 0.6067415730337079, + "grad_norm": 1.1079046726226807, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3564 + }, + { + "epoch": 0.6069118147769833, + "grad_norm": 1.0296462774276733, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 3565 + }, + { + "epoch": 0.6070820565202588, + "grad_norm": 0.868334949016571, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 3566 + }, + { + "epoch": 0.6072522982635342, + "grad_norm": 1.4144976139068604, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 3567 + }, + { + "epoch": 0.6074225400068096, + "grad_norm": 1.1668305397033691, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3568 + }, + { + "epoch": 0.6075927817500851, + "grad_norm": 1.2079553604125977, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3569 + }, + { + "epoch": 0.6077630234933605, + "grad_norm": 1.1177006959915161, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 3570 + }, + { + "epoch": 0.607933265236636, + "grad_norm": 1.1665843725204468, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 3571 + }, + { + "epoch": 0.6081035069799114, + "grad_norm": 1.0102547407150269, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3572 + }, + { + "epoch": 0.608273748723187, + "grad_norm": 0.9635432362556458, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3573 + }, + { + "epoch": 0.6084439904664624, + "grad_norm": 1.3346338272094727, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 3574 + }, + { + "epoch": 0.6086142322097379, + "grad_norm": 1.608765721321106, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 3575 + }, + { + "epoch": 0.6087844739530133, + "grad_norm": 0.9008107781410217, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 3576 + }, + { + "epoch": 0.6089547156962888, + "grad_norm": 1.0293365716934204, + "learning_rate": 1e-06, + "loss": 0.0061, + "step": 3577 + }, + { + "epoch": 0.6091249574395642, + "grad_norm": 1.082380771636963, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 3578 + }, + { + "epoch": 0.6092951991828396, + "grad_norm": 0.7760728597640991, + "learning_rate": 1e-06, + "loss": 0.0069, + "step": 3579 + }, + { + "epoch": 0.6094654409261151, + "grad_norm": 0.9977663159370422, + "learning_rate": 1e-06, + "loss": 0.0057, + "step": 3580 + }, + { + "epoch": 0.6096356826693905, + "grad_norm": 0.7965632081031799, + "learning_rate": 1e-06, + "loss": 0.0066, + "step": 3581 + }, + { + "epoch": 0.609805924412666, + "grad_norm": 1.3011534214019775, + "learning_rate": 1e-06, + "loss": 0.0158, + "step": 3582 + }, + { + "epoch": 0.6099761661559414, + "grad_norm": 1.1788239479064941, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 3583 + }, + { + "epoch": 0.6101464078992169, + "grad_norm": 1.0740551948547363, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 3584 + }, + { + "epoch": 0.6103166496424923, + "grad_norm": 1.2038638591766357, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 3585 + }, + { + "epoch": 0.6104868913857678, + "grad_norm": 1.0219711065292358, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 3586 + }, + { + "epoch": 0.6106571331290432, + "grad_norm": 1.2618956565856934, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3587 + }, + { + "epoch": 0.6108273748723186, + "grad_norm": 1.1306350231170654, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3588 + }, + { + "epoch": 0.6109976166155942, + "grad_norm": 0.8603691458702087, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3589 + }, + { + "epoch": 0.6111678583588696, + "grad_norm": 1.2200685739517212, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 3590 + }, + { + "epoch": 0.6113381001021451, + "grad_norm": 1.4106483459472656, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 3591 + }, + { + "epoch": 0.6115083418454205, + "grad_norm": 1.3192464113235474, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 3592 + }, + { + "epoch": 0.611678583588696, + "grad_norm": 1.1287734508514404, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3593 + }, + { + "epoch": 0.6118488253319714, + "grad_norm": 1.193594217300415, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3594 + }, + { + "epoch": 0.6120190670752469, + "grad_norm": 1.3210780620574951, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 3595 + }, + { + "epoch": 0.6121893088185223, + "grad_norm": 1.2266240119934082, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3596 + }, + { + "epoch": 0.6123595505617978, + "grad_norm": 1.1135960817337036, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3597 + }, + { + "epoch": 0.6125297923050732, + "grad_norm": 1.2418755292892456, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 3598 + }, + { + "epoch": 0.6127000340483486, + "grad_norm": 1.2024139165878296, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 3599 + }, + { + "epoch": 0.6128702757916241, + "grad_norm": 0.8348981142044067, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 3600 + }, + { + "epoch": 0.6130405175348995, + "grad_norm": 1.1524181365966797, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 3601 + }, + { + "epoch": 0.613210759278175, + "grad_norm": 1.0412416458129883, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 3602 + }, + { + "epoch": 0.6133810010214504, + "grad_norm": 0.7285078763961792, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 3603 + }, + { + "epoch": 0.613551242764726, + "grad_norm": 0.9448837041854858, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 3604 + }, + { + "epoch": 0.6137214845080013, + "grad_norm": 0.8977110385894775, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3605 + }, + { + "epoch": 0.6138917262512769, + "grad_norm": 1.5285866260528564, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 3606 + }, + { + "epoch": 0.6140619679945523, + "grad_norm": 1.3463897705078125, + "learning_rate": 1e-06, + "loss": 0.0138, + "step": 3607 + }, + { + "epoch": 0.6142322097378277, + "grad_norm": 1.2851765155792236, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 3608 + }, + { + "epoch": 0.6144024514811032, + "grad_norm": 1.5624653100967407, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3609 + }, + { + "epoch": 0.6145726932243786, + "grad_norm": 1.6467691659927368, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3610 + }, + { + "epoch": 0.6147429349676541, + "grad_norm": 1.128787636756897, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3611 + }, + { + "epoch": 0.6149131767109295, + "grad_norm": 1.4503066539764404, + "learning_rate": 1e-06, + "loss": 0.02, + "step": 3612 + }, + { + "epoch": 0.615083418454205, + "grad_norm": 0.8577298521995544, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 3613 + }, + { + "epoch": 0.6152536601974804, + "grad_norm": 1.0738738775253296, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3614 + }, + { + "epoch": 0.6154239019407559, + "grad_norm": 0.769759476184845, + "learning_rate": 1e-06, + "loss": 0.0058, + "step": 3615 + }, + { + "epoch": 0.6155941436840313, + "grad_norm": 0.8557948470115662, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3616 + }, + { + "epoch": 0.6157643854273068, + "grad_norm": 0.8239853382110596, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 3617 + }, + { + "epoch": 0.6159346271705822, + "grad_norm": 0.9858947396278381, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3618 + }, + { + "epoch": 0.6161048689138576, + "grad_norm": 1.341850996017456, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 3619 + }, + { + "epoch": 0.6162751106571331, + "grad_norm": 0.8130425214767456, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 3620 + }, + { + "epoch": 0.6164453524004085, + "grad_norm": 0.826560378074646, + "learning_rate": 1e-06, + "loss": 0.0053, + "step": 3621 + }, + { + "epoch": 0.616615594143684, + "grad_norm": 1.0723116397857666, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3622 + }, + { + "epoch": 0.6167858358869595, + "grad_norm": 0.9984582662582397, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3623 + }, + { + "epoch": 0.616956077630235, + "grad_norm": 1.1177350282669067, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 3624 + }, + { + "epoch": 0.6171263193735104, + "grad_norm": 1.006271243095398, + "learning_rate": 1e-06, + "loss": 0.0061, + "step": 3625 + }, + { + "epoch": 0.6172965611167859, + "grad_norm": 1.1038841009140015, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3626 + }, + { + "epoch": 0.6174668028600613, + "grad_norm": 1.2689024209976196, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 3627 + }, + { + "epoch": 0.6176370446033368, + "grad_norm": 1.8578965663909912, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3628 + }, + { + "epoch": 0.6178072863466122, + "grad_norm": 1.2967028617858887, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3629 + }, + { + "epoch": 0.6179775280898876, + "grad_norm": 0.9618591666221619, + "learning_rate": 1e-06, + "loss": 0.0069, + "step": 3630 + }, + { + "epoch": 0.6181477698331631, + "grad_norm": 0.8689693212509155, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 3631 + }, + { + "epoch": 0.6183180115764385, + "grad_norm": 1.112473487854004, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3632 + }, + { + "epoch": 0.618488253319714, + "grad_norm": 1.1240233182907104, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3633 + }, + { + "epoch": 0.6186584950629894, + "grad_norm": 0.7019220590591431, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 3634 + }, + { + "epoch": 0.6188287368062649, + "grad_norm": 0.8149914145469666, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3635 + }, + { + "epoch": 0.6189989785495403, + "grad_norm": 1.152665376663208, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3636 + }, + { + "epoch": 0.6191692202928158, + "grad_norm": 1.2790850400924683, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 3637 + }, + { + "epoch": 0.6193394620360912, + "grad_norm": 1.127205491065979, + "learning_rate": 1e-06, + "loss": 0.0093, + "step": 3638 + }, + { + "epoch": 0.6195097037793666, + "grad_norm": 1.7708073854446411, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3639 + }, + { + "epoch": 0.6196799455226422, + "grad_norm": 1.142716646194458, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 3640 + }, + { + "epoch": 0.6198501872659176, + "grad_norm": 0.881137490272522, + "learning_rate": 1e-06, + "loss": 0.0069, + "step": 3641 + }, + { + "epoch": 0.6200204290091931, + "grad_norm": 1.1176531314849854, + "learning_rate": 1e-06, + "loss": 0.009, + "step": 3642 + }, + { + "epoch": 0.6201906707524685, + "grad_norm": 1.3249547481536865, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 3643 + }, + { + "epoch": 0.620360912495744, + "grad_norm": 2.152984142303467, + "learning_rate": 1e-06, + "loss": 0.0188, + "step": 3644 + }, + { + "epoch": 0.6205311542390194, + "grad_norm": 1.2621212005615234, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 3645 + }, + { + "epoch": 0.6207013959822949, + "grad_norm": 1.1430959701538086, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 3646 + }, + { + "epoch": 0.6208716377255703, + "grad_norm": 1.2035990953445435, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3647 + }, + { + "epoch": 0.6210418794688458, + "grad_norm": 1.1001369953155518, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 3648 + }, + { + "epoch": 0.6212121212121212, + "grad_norm": 1.1591325998306274, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3649 + }, + { + "epoch": 0.6213823629553966, + "grad_norm": 0.8609371781349182, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3650 + }, + { + "epoch": 0.6215526046986721, + "grad_norm": 1.3064498901367188, + "learning_rate": 1e-06, + "loss": 0.0189, + "step": 3651 + }, + { + "epoch": 0.6217228464419475, + "grad_norm": 1.1729905605316162, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3652 + }, + { + "epoch": 0.621893088185223, + "grad_norm": 1.064233660697937, + "learning_rate": 1e-06, + "loss": 0.0138, + "step": 3653 + }, + { + "epoch": 0.6220633299284984, + "grad_norm": 1.0935028791427612, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 3654 + }, + { + "epoch": 0.622233571671774, + "grad_norm": 0.9840522408485413, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3655 + }, + { + "epoch": 0.6224038134150494, + "grad_norm": 1.152642846107483, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3656 + }, + { + "epoch": 0.6225740551583249, + "grad_norm": 1.3375604152679443, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 3657 + }, + { + "epoch": 0.6227442969016003, + "grad_norm": 1.8616725206375122, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 3658 + }, + { + "epoch": 0.6229145386448757, + "grad_norm": 1.0592864751815796, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3659 + }, + { + "epoch": 0.6230847803881512, + "grad_norm": 1.135156512260437, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3660 + }, + { + "epoch": 0.6232550221314266, + "grad_norm": 1.1434532403945923, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 3661 + }, + { + "epoch": 0.6234252638747021, + "grad_norm": 1.0572514533996582, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3662 + }, + { + "epoch": 0.6235955056179775, + "grad_norm": 1.5850212574005127, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 3663 + }, + { + "epoch": 0.623765747361253, + "grad_norm": 1.4172906875610352, + "learning_rate": 1e-06, + "loss": 0.0128, + "step": 3664 + }, + { + "epoch": 0.6239359891045284, + "grad_norm": 1.1192266941070557, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 3665 + }, + { + "epoch": 0.6241062308478039, + "grad_norm": 1.0807698965072632, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3666 + }, + { + "epoch": 0.6242764725910793, + "grad_norm": 1.0555567741394043, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3667 + }, + { + "epoch": 0.6244467143343548, + "grad_norm": 1.2566262483596802, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3668 + }, + { + "epoch": 0.6246169560776302, + "grad_norm": 0.9095777273178101, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3669 + }, + { + "epoch": 0.6247871978209056, + "grad_norm": 0.8502013683319092, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 3670 + }, + { + "epoch": 0.6249574395641811, + "grad_norm": 1.1455646753311157, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 3671 + }, + { + "epoch": 0.6251276813074566, + "grad_norm": 1.1766411066055298, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3672 + }, + { + "epoch": 0.6252979230507321, + "grad_norm": 1.0214406251907349, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3673 + }, + { + "epoch": 0.6254681647940075, + "grad_norm": 1.4715005159378052, + "learning_rate": 1e-06, + "loss": 0.0187, + "step": 3674 + }, + { + "epoch": 0.625638406537283, + "grad_norm": 0.9039010405540466, + "learning_rate": 1e-06, + "loss": 0.0059, + "step": 3675 + }, + { + "epoch": 0.6258086482805584, + "grad_norm": 1.0967904329299927, + "learning_rate": 1e-06, + "loss": 0.0069, + "step": 3676 + }, + { + "epoch": 0.6259788900238339, + "grad_norm": 1.013299822807312, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 3677 + }, + { + "epoch": 0.6261491317671093, + "grad_norm": 1.229809045791626, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3678 + }, + { + "epoch": 0.6263193735103847, + "grad_norm": 1.207761287689209, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 3679 + }, + { + "epoch": 0.6264896152536602, + "grad_norm": 1.036696195602417, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3680 + }, + { + "epoch": 0.6266598569969356, + "grad_norm": 1.182930827140808, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 3681 + }, + { + "epoch": 0.6268300987402111, + "grad_norm": 1.3283963203430176, + "learning_rate": 1e-06, + "loss": 0.0164, + "step": 3682 + }, + { + "epoch": 0.6270003404834865, + "grad_norm": 1.021653175354004, + "learning_rate": 1e-06, + "loss": 0.0093, + "step": 3683 + }, + { + "epoch": 0.627170582226762, + "grad_norm": 1.5096207857131958, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 3684 + }, + { + "epoch": 0.6273408239700374, + "grad_norm": 1.1412389278411865, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 3685 + }, + { + "epoch": 0.6275110657133129, + "grad_norm": 1.141716718673706, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3686 + }, + { + "epoch": 0.6276813074565883, + "grad_norm": 1.1129909753799438, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 3687 + }, + { + "epoch": 0.6278515491998639, + "grad_norm": 0.8182403445243835, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 3688 + }, + { + "epoch": 0.6280217909431393, + "grad_norm": 1.2895714044570923, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 3689 + }, + { + "epoch": 0.6281920326864147, + "grad_norm": 0.8942720293998718, + "learning_rate": 1e-06, + "loss": 0.0069, + "step": 3690 + }, + { + "epoch": 0.6283622744296902, + "grad_norm": 1.0091437101364136, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3691 + }, + { + "epoch": 0.6285325161729656, + "grad_norm": 1.1240427494049072, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3692 + }, + { + "epoch": 0.6287027579162411, + "grad_norm": 1.328053593635559, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 3693 + }, + { + "epoch": 0.6288729996595165, + "grad_norm": 0.9419410824775696, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3694 + }, + { + "epoch": 0.629043241402792, + "grad_norm": 0.7652695178985596, + "learning_rate": 1e-06, + "loss": 0.0057, + "step": 3695 + }, + { + "epoch": 0.6292134831460674, + "grad_norm": 1.2899373769760132, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3696 + }, + { + "epoch": 0.6293837248893429, + "grad_norm": 1.0259873867034912, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3697 + }, + { + "epoch": 0.6295539666326183, + "grad_norm": 1.1988884210586548, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 3698 + }, + { + "epoch": 0.6297242083758938, + "grad_norm": 0.9181260466575623, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 3699 + }, + { + "epoch": 0.6298944501191692, + "grad_norm": 0.9410200119018555, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3700 + }, + { + "epoch": 0.6300646918624446, + "grad_norm": 1.4462769031524658, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 3701 + }, + { + "epoch": 0.6302349336057201, + "grad_norm": 1.0774480104446411, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3702 + }, + { + "epoch": 0.6304051753489955, + "grad_norm": 1.004542589187622, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 3703 + }, + { + "epoch": 0.630575417092271, + "grad_norm": 1.9113359451293945, + "learning_rate": 1e-06, + "loss": 0.0168, + "step": 3704 + }, + { + "epoch": 0.6307456588355465, + "grad_norm": 1.03765869140625, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3705 + }, + { + "epoch": 0.630915900578822, + "grad_norm": 1.1582138538360596, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 3706 + }, + { + "epoch": 0.6310861423220974, + "grad_norm": 1.195564866065979, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3707 + }, + { + "epoch": 0.6312563840653729, + "grad_norm": 0.9630500674247742, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 3708 + }, + { + "epoch": 0.6314266258086483, + "grad_norm": 1.4461884498596191, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 3709 + }, + { + "epoch": 0.6315968675519237, + "grad_norm": 0.9124757051467896, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 3710 + }, + { + "epoch": 0.6317671092951992, + "grad_norm": 1.134900450706482, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 3711 + }, + { + "epoch": 0.6319373510384746, + "grad_norm": 1.1023823022842407, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3712 + }, + { + "epoch": 0.6321075927817501, + "grad_norm": 1.0747976303100586, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 3713 + }, + { + "epoch": 0.6322778345250255, + "grad_norm": 0.8265736103057861, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3714 + }, + { + "epoch": 0.632448076268301, + "grad_norm": 1.0795259475708008, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3715 + }, + { + "epoch": 0.6326183180115764, + "grad_norm": 1.0878645181655884, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3716 + }, + { + "epoch": 0.6327885597548519, + "grad_norm": 0.9701595306396484, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 3717 + }, + { + "epoch": 0.6329588014981273, + "grad_norm": 1.2201334238052368, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3718 + }, + { + "epoch": 0.6331290432414028, + "grad_norm": 1.0813226699829102, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3719 + }, + { + "epoch": 0.6332992849846782, + "grad_norm": 0.7975080609321594, + "learning_rate": 1e-06, + "loss": 0.0056, + "step": 3720 + }, + { + "epoch": 0.6334695267279536, + "grad_norm": 1.270108938217163, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 3721 + }, + { + "epoch": 0.6336397684712292, + "grad_norm": 1.3048897981643677, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3722 + }, + { + "epoch": 0.6338100102145046, + "grad_norm": 2.3595523834228516, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 3723 + }, + { + "epoch": 0.6339802519577801, + "grad_norm": 1.565253734588623, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 3724 + }, + { + "epoch": 0.6341504937010555, + "grad_norm": 0.9214725494384766, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 3725 + }, + { + "epoch": 0.634320735444331, + "grad_norm": 0.9782243967056274, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3726 + }, + { + "epoch": 0.6344909771876064, + "grad_norm": 0.9782243967056274, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3727 + }, + { + "epoch": 0.6346612189308819, + "grad_norm": 1.1668390035629272, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3728 + }, + { + "epoch": 0.6348314606741573, + "grad_norm": 1.1563256978988647, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3729 + }, + { + "epoch": 0.6350017024174327, + "grad_norm": 0.9238855242729187, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 3730 + }, + { + "epoch": 0.6351719441607082, + "grad_norm": 0.9837780594825745, + "learning_rate": 1e-06, + "loss": 0.0054, + "step": 3731 + }, + { + "epoch": 0.6353421859039836, + "grad_norm": 1.0795369148254395, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3732 + }, + { + "epoch": 0.6355124276472591, + "grad_norm": 0.8233152627944946, + "learning_rate": 1e-06, + "loss": 0.0056, + "step": 3733 + }, + { + "epoch": 0.6356826693905345, + "grad_norm": 0.9861051440238953, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3734 + }, + { + "epoch": 0.63585291113381, + "grad_norm": 1.1408077478408813, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3735 + }, + { + "epoch": 0.6360231528770854, + "grad_norm": 1.1144981384277344, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 3736 + }, + { + "epoch": 0.636193394620361, + "grad_norm": 1.3315359354019165, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3737 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 0.9346138834953308, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3738 + }, + { + "epoch": 0.6365338781069119, + "grad_norm": 1.3445444107055664, + "learning_rate": 1e-06, + "loss": 0.0218, + "step": 3739 + }, + { + "epoch": 0.6367041198501873, + "grad_norm": 1.3266631364822388, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 3740 + }, + { + "epoch": 0.6368743615934627, + "grad_norm": 0.7501433491706848, + "learning_rate": 1e-06, + "loss": 0.0066, + "step": 3741 + }, + { + "epoch": 0.6370446033367382, + "grad_norm": 0.9168145060539246, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 3742 + }, + { + "epoch": 0.6372148450800136, + "grad_norm": 0.8834933042526245, + "learning_rate": 1e-06, + "loss": 0.006, + "step": 3743 + }, + { + "epoch": 0.6373850868232891, + "grad_norm": 1.115196943283081, + "learning_rate": 1e-06, + "loss": 0.009, + "step": 3744 + }, + { + "epoch": 0.6375553285665645, + "grad_norm": 1.0395985841751099, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3745 + }, + { + "epoch": 0.63772557030984, + "grad_norm": 1.1208879947662354, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 3746 + }, + { + "epoch": 0.6378958120531154, + "grad_norm": 1.482139229774475, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 3747 + }, + { + "epoch": 0.6380660537963909, + "grad_norm": 1.3802002668380737, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3748 + }, + { + "epoch": 0.6382362955396663, + "grad_norm": 1.1360260248184204, + "learning_rate": 1e-06, + "loss": 0.0093, + "step": 3749 + }, + { + "epoch": 0.6384065372829418, + "grad_norm": 1.353955864906311, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 3750 + }, + { + "epoch": 0.6385767790262172, + "grad_norm": 1.0488437414169312, + "learning_rate": 1e-06, + "loss": 0.0069, + "step": 3751 + }, + { + "epoch": 0.6387470207694926, + "grad_norm": 1.547584891319275, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 3752 + }, + { + "epoch": 0.6389172625127681, + "grad_norm": 1.0610159635543823, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3753 + }, + { + "epoch": 0.6390875042560435, + "grad_norm": 1.1896774768829346, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 3754 + }, + { + "epoch": 0.6392577459993191, + "grad_norm": 1.3751659393310547, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 3755 + }, + { + "epoch": 0.6394279877425945, + "grad_norm": 1.0783185958862305, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 3756 + }, + { + "epoch": 0.63959822948587, + "grad_norm": 1.002148985862732, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 3757 + }, + { + "epoch": 0.6397684712291454, + "grad_norm": 0.9721300005912781, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3758 + }, + { + "epoch": 0.6399387129724209, + "grad_norm": 1.0367679595947266, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3759 + }, + { + "epoch": 0.6401089547156963, + "grad_norm": 1.3424270153045654, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 3760 + }, + { + "epoch": 0.6402791964589717, + "grad_norm": 1.1471754312515259, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3761 + }, + { + "epoch": 0.6404494382022472, + "grad_norm": 1.1951358318328857, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 3762 + }, + { + "epoch": 0.6406196799455226, + "grad_norm": 1.077262282371521, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3763 + }, + { + "epoch": 0.6407899216887981, + "grad_norm": 2.4920833110809326, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 3764 + }, + { + "epoch": 0.6409601634320735, + "grad_norm": 1.1345974206924438, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 3765 + }, + { + "epoch": 0.641130405175349, + "grad_norm": 0.8907074928283691, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3766 + }, + { + "epoch": 0.6413006469186244, + "grad_norm": 1.1611642837524414, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 3767 + }, + { + "epoch": 0.6414708886618999, + "grad_norm": 1.0396606922149658, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 3768 + }, + { + "epoch": 0.6416411304051753, + "grad_norm": 0.9746201634407043, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3769 + }, + { + "epoch": 0.6418113721484509, + "grad_norm": 0.9750457406044006, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 3770 + }, + { + "epoch": 0.6419816138917263, + "grad_norm": 1.3506128787994385, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 3771 + }, + { + "epoch": 0.6421518556350017, + "grad_norm": 1.9846067428588867, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3772 + }, + { + "epoch": 0.6423220973782772, + "grad_norm": 0.9080547094345093, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3773 + }, + { + "epoch": 0.6424923391215526, + "grad_norm": 1.4013069868087769, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 3774 + }, + { + "epoch": 0.6426625808648281, + "grad_norm": 1.1114245653152466, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 3775 + }, + { + "epoch": 0.6428328226081035, + "grad_norm": 0.9145612120628357, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3776 + }, + { + "epoch": 0.643003064351379, + "grad_norm": 0.9096139073371887, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 3777 + }, + { + "epoch": 0.6431733060946544, + "grad_norm": 2.0348570346832275, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3778 + }, + { + "epoch": 0.6433435478379299, + "grad_norm": 1.3212345838546753, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 3779 + }, + { + "epoch": 0.6435137895812053, + "grad_norm": 0.8319016098976135, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 3780 + }, + { + "epoch": 0.6436840313244807, + "grad_norm": 0.9304143786430359, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 3781 + }, + { + "epoch": 0.6438542730677562, + "grad_norm": 1.048282504081726, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 3782 + }, + { + "epoch": 0.6440245148110316, + "grad_norm": 1.1110150814056396, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 3783 + }, + { + "epoch": 0.6441947565543071, + "grad_norm": 1.329409122467041, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 3784 + }, + { + "epoch": 0.6443649982975825, + "grad_norm": 0.838369607925415, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 3785 + }, + { + "epoch": 0.644535240040858, + "grad_norm": 1.109646201133728, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 3786 + }, + { + "epoch": 0.6447054817841334, + "grad_norm": 1.0034140348434448, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 3787 + }, + { + "epoch": 0.644875723527409, + "grad_norm": 1.1564267873764038, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3788 + }, + { + "epoch": 0.6450459652706844, + "grad_norm": 1.3622796535491943, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 3789 + }, + { + "epoch": 0.6452162070139599, + "grad_norm": 0.9166209697723389, + "learning_rate": 1e-06, + "loss": 0.0066, + "step": 3790 + }, + { + "epoch": 0.6453864487572353, + "grad_norm": 1.0101425647735596, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 3791 + }, + { + "epoch": 0.6455566905005107, + "grad_norm": 1.1960997581481934, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 3792 + }, + { + "epoch": 0.6457269322437862, + "grad_norm": 1.1836118698120117, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3793 + }, + { + "epoch": 0.6458971739870616, + "grad_norm": 1.131833553314209, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3794 + }, + { + "epoch": 0.6460674157303371, + "grad_norm": 1.1071782112121582, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 3795 + }, + { + "epoch": 0.6462376574736125, + "grad_norm": 1.1466583013534546, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3796 + }, + { + "epoch": 0.646407899216888, + "grad_norm": 1.2023851871490479, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 3797 + }, + { + "epoch": 0.6465781409601634, + "grad_norm": 1.4373587369918823, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 3798 + }, + { + "epoch": 0.6467483827034389, + "grad_norm": 0.9874997138977051, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3799 + }, + { + "epoch": 0.6469186244467143, + "grad_norm": 1.1962223052978516, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 3800 + }, + { + "epoch": 0.6470888661899897, + "grad_norm": 1.095088005065918, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 3801 + }, + { + "epoch": 0.6472591079332652, + "grad_norm": 1.0901947021484375, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3802 + }, + { + "epoch": 0.6474293496765406, + "grad_norm": 1.262510895729065, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 3803 + }, + { + "epoch": 0.6475995914198162, + "grad_norm": 1.2522022724151611, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 3804 + }, + { + "epoch": 0.6477698331630916, + "grad_norm": 1.0772358179092407, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3805 + }, + { + "epoch": 0.6479400749063671, + "grad_norm": 1.433408498764038, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 3806 + }, + { + "epoch": 0.6481103166496425, + "grad_norm": 1.124001383781433, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3807 + }, + { + "epoch": 0.648280558392918, + "grad_norm": 1.0808985233306885, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3808 + }, + { + "epoch": 0.6484508001361934, + "grad_norm": 1.0477313995361328, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3809 + }, + { + "epoch": 0.6486210418794689, + "grad_norm": 2.4625258445739746, + "learning_rate": 1e-06, + "loss": 0.0164, + "step": 3810 + }, + { + "epoch": 0.6487912836227443, + "grad_norm": 0.9473697543144226, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3811 + }, + { + "epoch": 0.6489615253660197, + "grad_norm": 0.9401687383651733, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3812 + }, + { + "epoch": 0.6491317671092952, + "grad_norm": 1.2510778903961182, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3813 + }, + { + "epoch": 0.6493020088525706, + "grad_norm": 0.9812240600585938, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 3814 + }, + { + "epoch": 0.6494722505958461, + "grad_norm": 0.99561607837677, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3815 + }, + { + "epoch": 0.6496424923391215, + "grad_norm": 1.579348087310791, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3816 + }, + { + "epoch": 0.649812734082397, + "grad_norm": 0.7100584506988525, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3817 + }, + { + "epoch": 0.6499829758256724, + "grad_norm": 1.0388001203536987, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 3818 + }, + { + "epoch": 0.650153217568948, + "grad_norm": 1.2186797857284546, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3819 + }, + { + "epoch": 0.6503234593122234, + "grad_norm": 0.8475635647773743, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3820 + }, + { + "epoch": 0.6504937010554989, + "grad_norm": 1.1312545537948608, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 3821 + }, + { + "epoch": 0.6506639427987743, + "grad_norm": 1.2596800327301025, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 3822 + }, + { + "epoch": 0.6508341845420497, + "grad_norm": 0.8742133378982544, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 3823 + }, + { + "epoch": 0.6510044262853252, + "grad_norm": 0.891171395778656, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3824 + }, + { + "epoch": 0.6511746680286006, + "grad_norm": 1.0801339149475098, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 3825 + }, + { + "epoch": 0.6513449097718761, + "grad_norm": 0.9043775200843811, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3826 + }, + { + "epoch": 0.6515151515151515, + "grad_norm": 1.2618519067764282, + "learning_rate": 1e-06, + "loss": 0.009, + "step": 3827 + }, + { + "epoch": 0.651685393258427, + "grad_norm": 0.9808958172798157, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3828 + }, + { + "epoch": 0.6518556350017024, + "grad_norm": 0.9210644364356995, + "learning_rate": 1e-06, + "loss": 0.0059, + "step": 3829 + }, + { + "epoch": 0.6520258767449779, + "grad_norm": 1.0287270545959473, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 3830 + }, + { + "epoch": 0.6521961184882533, + "grad_norm": 0.9316836595535278, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3831 + }, + { + "epoch": 0.6523663602315287, + "grad_norm": 0.9281513094902039, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3832 + }, + { + "epoch": 0.6525366019748042, + "grad_norm": 1.0260971784591675, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3833 + }, + { + "epoch": 0.6527068437180796, + "grad_norm": 1.012954831123352, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 3834 + }, + { + "epoch": 0.6528770854613551, + "grad_norm": 1.0251330137252808, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3835 + }, + { + "epoch": 0.6530473272046305, + "grad_norm": 1.228165864944458, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3836 + }, + { + "epoch": 0.6532175689479061, + "grad_norm": 0.8660666346549988, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3837 + }, + { + "epoch": 0.6533878106911815, + "grad_norm": 0.8670229911804199, + "learning_rate": 1e-06, + "loss": 0.0059, + "step": 3838 + }, + { + "epoch": 0.653558052434457, + "grad_norm": 1.1961077451705933, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 3839 + }, + { + "epoch": 0.6537282941777324, + "grad_norm": 0.9217979311943054, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 3840 + }, + { + "epoch": 0.6538985359210079, + "grad_norm": 0.9868664741516113, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 3841 + }, + { + "epoch": 0.6540687776642833, + "grad_norm": 1.3531166315078735, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 3842 + }, + { + "epoch": 0.6542390194075587, + "grad_norm": 1.6801015138626099, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 3843 + }, + { + "epoch": 0.6544092611508342, + "grad_norm": 1.0095585584640503, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 3844 + }, + { + "epoch": 0.6545795028941096, + "grad_norm": 0.8943735361099243, + "learning_rate": 1e-06, + "loss": 0.006, + "step": 3845 + }, + { + "epoch": 0.6547497446373851, + "grad_norm": 1.3049546480178833, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3846 + }, + { + "epoch": 0.6549199863806605, + "grad_norm": 1.1015139818191528, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3847 + }, + { + "epoch": 0.655090228123936, + "grad_norm": 1.2883460521697998, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3848 + }, + { + "epoch": 0.6552604698672114, + "grad_norm": 0.9433921575546265, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3849 + }, + { + "epoch": 0.6554307116104869, + "grad_norm": 1.1686891317367554, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3850 + }, + { + "epoch": 0.6556009533537623, + "grad_norm": 1.2734119892120361, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 3851 + }, + { + "epoch": 0.6557711950970377, + "grad_norm": 1.1788426637649536, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3852 + }, + { + "epoch": 0.6559414368403133, + "grad_norm": 1.1873812675476074, + "learning_rate": 1e-06, + "loss": 0.0148, + "step": 3853 + }, + { + "epoch": 0.6561116785835887, + "grad_norm": 1.2980464696884155, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 3854 + }, + { + "epoch": 0.6562819203268642, + "grad_norm": 1.1819038391113281, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 3855 + }, + { + "epoch": 0.6564521620701396, + "grad_norm": 1.5700839757919312, + "learning_rate": 1e-06, + "loss": 0.0128, + "step": 3856 + }, + { + "epoch": 0.6566224038134151, + "grad_norm": 0.907390296459198, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 3857 + }, + { + "epoch": 0.6567926455566905, + "grad_norm": 1.2444868087768555, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 3858 + }, + { + "epoch": 0.656962887299966, + "grad_norm": 0.8420805931091309, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3859 + }, + { + "epoch": 0.6571331290432414, + "grad_norm": 0.9762222170829773, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3860 + }, + { + "epoch": 0.6573033707865169, + "grad_norm": 1.3171446323394775, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 3861 + }, + { + "epoch": 0.6574736125297923, + "grad_norm": 0.9845103025436401, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3862 + }, + { + "epoch": 0.6576438542730677, + "grad_norm": 1.1617891788482666, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3863 + }, + { + "epoch": 0.6578140960163432, + "grad_norm": 0.9630821943283081, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 3864 + }, + { + "epoch": 0.6579843377596186, + "grad_norm": 1.274949312210083, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 3865 + }, + { + "epoch": 0.6581545795028941, + "grad_norm": 1.40157151222229, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 3866 + }, + { + "epoch": 0.6583248212461695, + "grad_norm": 1.0424250364303589, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3867 + }, + { + "epoch": 0.658495062989445, + "grad_norm": 1.3154240846633911, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 3868 + }, + { + "epoch": 0.6586653047327204, + "grad_norm": 1.1080321073532104, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3869 + }, + { + "epoch": 0.658835546475996, + "grad_norm": 1.0510450601577759, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3870 + }, + { + "epoch": 0.6590057882192714, + "grad_norm": 1.3665117025375366, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 3871 + }, + { + "epoch": 0.6591760299625468, + "grad_norm": 0.914477527141571, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3872 + }, + { + "epoch": 0.6593462717058223, + "grad_norm": 1.2469263076782227, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 3873 + }, + { + "epoch": 0.6595165134490977, + "grad_norm": 1.1164896488189697, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3874 + }, + { + "epoch": 0.6596867551923732, + "grad_norm": 1.092367172241211, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3875 + }, + { + "epoch": 0.6598569969356486, + "grad_norm": 1.0838600397109985, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 3876 + }, + { + "epoch": 0.6600272386789241, + "grad_norm": 1.1364141702651978, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 3877 + }, + { + "epoch": 0.6601974804221995, + "grad_norm": 1.0023877620697021, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 3878 + }, + { + "epoch": 0.660367722165475, + "grad_norm": 0.9350517392158508, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3879 + }, + { + "epoch": 0.6605379639087504, + "grad_norm": 0.807680606842041, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3880 + }, + { + "epoch": 0.6607082056520259, + "grad_norm": 0.7163336277008057, + "learning_rate": 1e-06, + "loss": 0.0057, + "step": 3881 + }, + { + "epoch": 0.6608784473953013, + "grad_norm": 1.0191099643707275, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3882 + }, + { + "epoch": 0.6610486891385767, + "grad_norm": 0.8287161588668823, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3883 + }, + { + "epoch": 0.6612189308818522, + "grad_norm": 1.2603466510772705, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3884 + }, + { + "epoch": 0.6613891726251276, + "grad_norm": 1.042250633239746, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 3885 + }, + { + "epoch": 0.6615594143684032, + "grad_norm": 1.064152479171753, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 3886 + }, + { + "epoch": 0.6617296561116786, + "grad_norm": 1.2731701135635376, + "learning_rate": 1e-06, + "loss": 0.0217, + "step": 3887 + }, + { + "epoch": 0.6618998978549541, + "grad_norm": 1.1830432415008545, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 3888 + }, + { + "epoch": 0.6620701395982295, + "grad_norm": 1.008324384689331, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 3889 + }, + { + "epoch": 0.662240381341505, + "grad_norm": 0.9876424074172974, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 3890 + }, + { + "epoch": 0.6624106230847804, + "grad_norm": 1.2349107265472412, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 3891 + }, + { + "epoch": 0.6625808648280559, + "grad_norm": 1.1617462635040283, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 3892 + }, + { + "epoch": 0.6627511065713313, + "grad_norm": 0.8870407938957214, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 3893 + }, + { + "epoch": 0.6629213483146067, + "grad_norm": 1.0737135410308838, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3894 + }, + { + "epoch": 0.6630915900578822, + "grad_norm": 1.0294846296310425, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 3895 + }, + { + "epoch": 0.6632618318011576, + "grad_norm": 0.8994992971420288, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3896 + }, + { + "epoch": 0.6634320735444331, + "grad_norm": 1.0250681638717651, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 3897 + }, + { + "epoch": 0.6636023152877085, + "grad_norm": 1.2135937213897705, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3898 + }, + { + "epoch": 0.663772557030984, + "grad_norm": 0.907864511013031, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 3899 + }, + { + "epoch": 0.6639427987742594, + "grad_norm": 1.0360740423202515, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 3900 + }, + { + "epoch": 0.664113040517535, + "grad_norm": 1.1412363052368164, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 3901 + }, + { + "epoch": 0.6642832822608103, + "grad_norm": 1.2569540739059448, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 3902 + }, + { + "epoch": 0.6644535240040857, + "grad_norm": 1.2806369066238403, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3903 + }, + { + "epoch": 0.6646237657473613, + "grad_norm": 1.199313759803772, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 3904 + }, + { + "epoch": 0.6647940074906367, + "grad_norm": 1.0673506259918213, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 3905 + }, + { + "epoch": 0.6649642492339122, + "grad_norm": 1.1850353479385376, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 3906 + }, + { + "epoch": 0.6651344909771876, + "grad_norm": 1.9815788269042969, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 3907 + }, + { + "epoch": 0.6653047327204631, + "grad_norm": 1.181032657623291, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3908 + }, + { + "epoch": 0.6654749744637385, + "grad_norm": 1.2337028980255127, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3909 + }, + { + "epoch": 0.665645216207014, + "grad_norm": 1.0751506090164185, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 3910 + }, + { + "epoch": 0.6658154579502894, + "grad_norm": 1.4897148609161377, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3911 + }, + { + "epoch": 0.6659856996935649, + "grad_norm": 1.2287198305130005, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3912 + }, + { + "epoch": 0.6661559414368403, + "grad_norm": 1.3234024047851562, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3913 + }, + { + "epoch": 0.6663261831801157, + "grad_norm": 1.0900822877883911, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 3914 + }, + { + "epoch": 0.6664964249233912, + "grad_norm": 0.9296737313270569, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 3915 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.1464951038360596, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3916 + }, + { + "epoch": 0.6668369084099421, + "grad_norm": 1.5186680555343628, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 3917 + }, + { + "epoch": 0.6670071501532175, + "grad_norm": 1.0801197290420532, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 3918 + }, + { + "epoch": 0.667177391896493, + "grad_norm": 1.058134913444519, + "learning_rate": 1e-06, + "loss": 0.0066, + "step": 3919 + }, + { + "epoch": 0.6673476336397685, + "grad_norm": 0.9633292555809021, + "learning_rate": 1e-06, + "loss": 0.0066, + "step": 3920 + }, + { + "epoch": 0.667517875383044, + "grad_norm": 1.1550352573394775, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 3921 + }, + { + "epoch": 0.6676881171263194, + "grad_norm": 1.437263011932373, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 3922 + }, + { + "epoch": 0.6678583588695948, + "grad_norm": 0.9223193526268005, + "learning_rate": 1e-06, + "loss": 0.0059, + "step": 3923 + }, + { + "epoch": 0.6680286006128703, + "grad_norm": 0.9309805631637573, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3924 + }, + { + "epoch": 0.6681988423561457, + "grad_norm": 1.1240042448043823, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 3925 + }, + { + "epoch": 0.6683690840994212, + "grad_norm": 1.074847936630249, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 3926 + }, + { + "epoch": 0.6685393258426966, + "grad_norm": 0.9684469103813171, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 3927 + }, + { + "epoch": 0.6687095675859721, + "grad_norm": 1.2416397333145142, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 3928 + }, + { + "epoch": 0.6688798093292475, + "grad_norm": 0.9240944981575012, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 3929 + }, + { + "epoch": 0.669050051072523, + "grad_norm": 0.9790809154510498, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3930 + }, + { + "epoch": 0.6692202928157984, + "grad_norm": 0.9496670961380005, + "learning_rate": 1e-06, + "loss": 0.0069, + "step": 3931 + }, + { + "epoch": 0.6693905345590739, + "grad_norm": 0.8220587968826294, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3932 + }, + { + "epoch": 0.6695607763023493, + "grad_norm": 1.191679835319519, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 3933 + }, + { + "epoch": 0.6697310180456247, + "grad_norm": 1.0436556339263916, + "learning_rate": 1e-06, + "loss": 0.0058, + "step": 3934 + }, + { + "epoch": 0.6699012597889002, + "grad_norm": 1.3039947748184204, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3935 + }, + { + "epoch": 0.6700715015321757, + "grad_norm": 1.2773553133010864, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3936 + }, + { + "epoch": 0.6702417432754512, + "grad_norm": 1.6559447050094604, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 3937 + }, + { + "epoch": 0.6704119850187266, + "grad_norm": 1.3342622518539429, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3938 + }, + { + "epoch": 0.6705822267620021, + "grad_norm": 1.0363655090332031, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 3939 + }, + { + "epoch": 0.6707524685052775, + "grad_norm": 1.0175873041152954, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 3940 + }, + { + "epoch": 0.670922710248553, + "grad_norm": 0.8500238656997681, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 3941 + }, + { + "epoch": 0.6710929519918284, + "grad_norm": 4.019010543823242, + "learning_rate": 1e-06, + "loss": 0.0548, + "step": 3942 + }, + { + "epoch": 0.6712631937351039, + "grad_norm": 0.9771483540534973, + "learning_rate": 1e-06, + "loss": 0.006, + "step": 3943 + }, + { + "epoch": 0.6714334354783793, + "grad_norm": 1.0605002641677856, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 3944 + }, + { + "epoch": 0.6716036772216547, + "grad_norm": 1.0014209747314453, + "learning_rate": 1e-06, + "loss": 0.0066, + "step": 3945 + }, + { + "epoch": 0.6717739189649302, + "grad_norm": 1.0529136657714844, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 3946 + }, + { + "epoch": 0.6719441607082056, + "grad_norm": 1.1662652492523193, + "learning_rate": 1e-06, + "loss": 0.0168, + "step": 3947 + }, + { + "epoch": 0.6721144024514811, + "grad_norm": 1.425837755203247, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 3948 + }, + { + "epoch": 0.6722846441947565, + "grad_norm": 0.8996866345405579, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 3949 + }, + { + "epoch": 0.672454885938032, + "grad_norm": 0.9349283576011658, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3950 + }, + { + "epoch": 0.6726251276813074, + "grad_norm": 0.9103063344955444, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 3951 + }, + { + "epoch": 0.672795369424583, + "grad_norm": 1.297703504562378, + "learning_rate": 1e-06, + "loss": 0.016, + "step": 3952 + }, + { + "epoch": 0.6729656111678584, + "grad_norm": 1.0639750957489014, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 3953 + }, + { + "epoch": 0.6731358529111338, + "grad_norm": 1.0544763803482056, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 3954 + }, + { + "epoch": 0.6733060946544093, + "grad_norm": 1.192960500717163, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 3955 + }, + { + "epoch": 0.6734763363976847, + "grad_norm": 0.8247693777084351, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 3956 + }, + { + "epoch": 0.6736465781409602, + "grad_norm": 0.7883464694023132, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 3957 + }, + { + "epoch": 0.6738168198842356, + "grad_norm": 0.9810028672218323, + "learning_rate": 1e-06, + "loss": 0.0069, + "step": 3958 + }, + { + "epoch": 0.6739870616275111, + "grad_norm": 0.9418751001358032, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 3959 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 1.339374303817749, + "learning_rate": 1e-06, + "loss": 0.0092, + "step": 3960 + }, + { + "epoch": 0.674327545114062, + "grad_norm": 1.082369089126587, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 3961 + }, + { + "epoch": 0.6744977868573374, + "grad_norm": 1.2776871919631958, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3962 + }, + { + "epoch": 0.6746680286006129, + "grad_norm": 1.1005768775939941, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 3963 + }, + { + "epoch": 0.6748382703438883, + "grad_norm": 1.0241467952728271, + "learning_rate": 1e-06, + "loss": 0.0056, + "step": 3964 + }, + { + "epoch": 0.6750085120871637, + "grad_norm": 0.992150068283081, + "learning_rate": 1e-06, + "loss": 0.0069, + "step": 3965 + }, + { + "epoch": 0.6751787538304392, + "grad_norm": 0.8683270215988159, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 3966 + }, + { + "epoch": 0.6753489955737146, + "grad_norm": 1.0942341089248657, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 3967 + }, + { + "epoch": 0.6755192373169902, + "grad_norm": 1.0581514835357666, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 3968 + }, + { + "epoch": 0.6756894790602656, + "grad_norm": 1.0217764377593994, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 3969 + }, + { + "epoch": 0.6758597208035411, + "grad_norm": 1.3109872341156006, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3970 + }, + { + "epoch": 0.6760299625468165, + "grad_norm": 0.9485663771629333, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3971 + }, + { + "epoch": 0.676200204290092, + "grad_norm": 0.8911463618278503, + "learning_rate": 1e-06, + "loss": 0.0069, + "step": 3972 + }, + { + "epoch": 0.6763704460333674, + "grad_norm": 0.908300518989563, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 3973 + }, + { + "epoch": 0.6765406877766428, + "grad_norm": 0.9515292644500732, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 3974 + }, + { + "epoch": 0.6767109295199183, + "grad_norm": 1.1426764726638794, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 3975 + }, + { + "epoch": 0.6768811712631937, + "grad_norm": 1.1612918376922607, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 3976 + }, + { + "epoch": 0.6770514130064692, + "grad_norm": 0.757738471031189, + "learning_rate": 1e-06, + "loss": 0.0056, + "step": 3977 + }, + { + "epoch": 0.6772216547497446, + "grad_norm": 1.1565226316452026, + "learning_rate": 1e-06, + "loss": 0.0062, + "step": 3978 + }, + { + "epoch": 0.6773918964930201, + "grad_norm": 1.3450309038162231, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 3979 + }, + { + "epoch": 0.6775621382362955, + "grad_norm": 1.1107460260391235, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 3980 + }, + { + "epoch": 0.677732379979571, + "grad_norm": 1.5839945077896118, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 3981 + }, + { + "epoch": 0.6779026217228464, + "grad_norm": 1.143868088722229, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 3982 + }, + { + "epoch": 0.678072863466122, + "grad_norm": 1.0564833879470825, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 3983 + }, + { + "epoch": 0.6782431052093973, + "grad_norm": 1.1876205205917358, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 3984 + }, + { + "epoch": 0.6784133469526727, + "grad_norm": 1.2426191568374634, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 3985 + }, + { + "epoch": 0.6785835886959483, + "grad_norm": 1.0779424905776978, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 3986 + }, + { + "epoch": 0.6787538304392237, + "grad_norm": 1.3028185367584229, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3987 + }, + { + "epoch": 0.6789240721824992, + "grad_norm": 1.04373300075531, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 3988 + }, + { + "epoch": 0.6790943139257746, + "grad_norm": 0.9343636631965637, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 3989 + }, + { + "epoch": 0.6792645556690501, + "grad_norm": 1.0329524278640747, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 3990 + }, + { + "epoch": 0.6794347974123255, + "grad_norm": 0.9664596319198608, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 3991 + }, + { + "epoch": 0.679605039155601, + "grad_norm": 0.9661188721656799, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 3992 + }, + { + "epoch": 0.6797752808988764, + "grad_norm": 1.0348271131515503, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 3993 + }, + { + "epoch": 0.6799455226421518, + "grad_norm": 0.9838294386863708, + "learning_rate": 1e-06, + "loss": 0.0056, + "step": 3994 + }, + { + "epoch": 0.6801157643854273, + "grad_norm": 1.4004563093185425, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 3995 + }, + { + "epoch": 0.6802860061287027, + "grad_norm": 0.8205869793891907, + "learning_rate": 1e-06, + "loss": 0.0053, + "step": 3996 + }, + { + "epoch": 0.6804562478719782, + "grad_norm": 0.8853178024291992, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 3997 + }, + { + "epoch": 0.6806264896152536, + "grad_norm": 0.8219318985939026, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 3998 + }, + { + "epoch": 0.6807967313585291, + "grad_norm": 1.133307695388794, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 3999 + }, + { + "epoch": 0.6809669731018045, + "grad_norm": 0.9907300472259521, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 4000 + }, + { + "epoch": 0.6809669731018045, + "eval_loss": 0.34189823269844055, + "eval_runtime": 23.8668, + "eval_samples_per_second": 12.57, + "eval_steps_per_second": 0.335, + "step": 4000 + } + ], + "logging_steps": 1.0, + "max_steps": 17622, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "total_flos": 2.3577906965334983e+19, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}