{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999617985254231, "eval_steps": 2182, "global_step": 6544, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015280589830767466, "grad_norm": 0.3537859320640564, "learning_rate": 8.000000000000001e-07, "loss": 0.941, "step": 1 }, { "epoch": 0.00015280589830767466, "eval_loss": 0.8535330295562744, "eval_runtime": 1566.4875, "eval_samples_per_second": 7.119, "eval_steps_per_second": 3.56, "step": 1 }, { "epoch": 0.0003056117966153493, "grad_norm": 0.3782769739627838, "learning_rate": 1.6000000000000001e-06, "loss": 0.7386, "step": 2 }, { "epoch": 0.00045841769492302404, "grad_norm": 0.475812703371048, "learning_rate": 2.4000000000000003e-06, "loss": 1.059, "step": 3 }, { "epoch": 0.0006112235932306987, "grad_norm": 0.4409228265285492, "learning_rate": 3.2000000000000003e-06, "loss": 0.825, "step": 4 }, { "epoch": 0.0007640294915383734, "grad_norm": 0.39886385202407837, "learning_rate": 4.000000000000001e-06, "loss": 0.7462, "step": 5 }, { "epoch": 0.0009168353898460481, "grad_norm": 0.32536882162094116, "learning_rate": 4.800000000000001e-06, "loss": 0.9611, "step": 6 }, { "epoch": 0.0010696412881537228, "grad_norm": 0.41293051838874817, "learning_rate": 5.600000000000001e-06, "loss": 0.8808, "step": 7 }, { "epoch": 0.0012224471864613973, "grad_norm": 0.3373503088951111, "learning_rate": 6.4000000000000006e-06, "loss": 0.9459, "step": 8 }, { "epoch": 0.001375253084769072, "grad_norm": 0.3855852484703064, "learning_rate": 7.2e-06, "loss": 0.9431, "step": 9 }, { "epoch": 0.0015280589830767469, "grad_norm": 0.3200698792934418, "learning_rate": 8.000000000000001e-06, "loss": 0.9505, "step": 10 }, { "epoch": 0.0016808648813844214, "grad_norm": 0.3219761550426483, "learning_rate": 8.8e-06, "loss": 0.8947, "step": 11 }, { "epoch": 0.0018336707796920962, "grad_norm": 0.28418421745300293, "learning_rate": 9.600000000000001e-06, "loss": 0.7695, "step": 12 }, { "epoch": 0.001986476677999771, "grad_norm": 0.3636055290699005, "learning_rate": 1.04e-05, "loss": 0.7646, "step": 13 }, { "epoch": 0.0021392825763074455, "grad_norm": 0.26769423484802246, "learning_rate": 1.1200000000000001e-05, "loss": 0.6816, "step": 14 }, { "epoch": 0.00229208847461512, "grad_norm": 0.28433066606521606, "learning_rate": 1.2e-05, "loss": 0.7629, "step": 15 }, { "epoch": 0.0024448943729227946, "grad_norm": 0.33892345428466797, "learning_rate": 1.2800000000000001e-05, "loss": 0.9303, "step": 16 }, { "epoch": 0.0025977002712304696, "grad_norm": 0.31409522891044617, "learning_rate": 1.3600000000000002e-05, "loss": 0.805, "step": 17 }, { "epoch": 0.002750506169538144, "grad_norm": 0.34053927659988403, "learning_rate": 1.44e-05, "loss": 0.7556, "step": 18 }, { "epoch": 0.0029033120678458187, "grad_norm": 0.3334382176399231, "learning_rate": 1.52e-05, "loss": 0.7494, "step": 19 }, { "epoch": 0.0030561179661534937, "grad_norm": 0.3846895694732666, "learning_rate": 1.6000000000000003e-05, "loss": 0.6788, "step": 20 }, { "epoch": 0.0032089238644611683, "grad_norm": 0.46727222204208374, "learning_rate": 1.6800000000000002e-05, "loss": 0.8987, "step": 21 }, { "epoch": 0.003361729762768843, "grad_norm": 0.4377021789550781, "learning_rate": 1.76e-05, "loss": 0.7235, "step": 22 }, { "epoch": 0.0035145356610765174, "grad_norm": 0.4573345482349396, "learning_rate": 1.84e-05, "loss": 0.5838, "step": 23 }, { "epoch": 0.0036673415593841924, "grad_norm": 0.3256567716598511, "learning_rate": 1.9200000000000003e-05, "loss": 0.5452, "step": 24 }, { "epoch": 0.003820147457691867, "grad_norm": 0.2252429723739624, "learning_rate": 2e-05, "loss": 0.7252, "step": 25 }, { "epoch": 0.003972953355999542, "grad_norm": 0.22351256012916565, "learning_rate": 2.08e-05, "loss": 0.7701, "step": 26 }, { "epoch": 0.0041257592543072165, "grad_norm": 0.24318568408489227, "learning_rate": 2.16e-05, "loss": 0.6135, "step": 27 }, { "epoch": 0.004278565152614891, "grad_norm": 0.3728923201560974, "learning_rate": 2.2400000000000002e-05, "loss": 0.7592, "step": 28 }, { "epoch": 0.0044313710509225656, "grad_norm": 0.2996881306171417, "learning_rate": 2.32e-05, "loss": 0.6952, "step": 29 }, { "epoch": 0.00458417694923024, "grad_norm": 0.23991453647613525, "learning_rate": 2.4e-05, "loss": 0.6947, "step": 30 }, { "epoch": 0.004736982847537915, "grad_norm": 0.2515174150466919, "learning_rate": 2.48e-05, "loss": 1.1102, "step": 31 }, { "epoch": 0.004889788745845589, "grad_norm": 0.220277339220047, "learning_rate": 2.5600000000000002e-05, "loss": 0.654, "step": 32 }, { "epoch": 0.005042594644153265, "grad_norm": 0.24221166968345642, "learning_rate": 2.64e-05, "loss": 0.7946, "step": 33 }, { "epoch": 0.005195400542460939, "grad_norm": 0.22481025755405426, "learning_rate": 2.7200000000000004e-05, "loss": 0.646, "step": 34 }, { "epoch": 0.005348206440768614, "grad_norm": 0.200043722987175, "learning_rate": 2.8000000000000003e-05, "loss": 0.8636, "step": 35 }, { "epoch": 0.005501012339076288, "grad_norm": 0.3696175217628479, "learning_rate": 2.88e-05, "loss": 0.8136, "step": 36 }, { "epoch": 0.005653818237383963, "grad_norm": 0.2078743427991867, "learning_rate": 2.96e-05, "loss": 0.76, "step": 37 }, { "epoch": 0.005806624135691637, "grad_norm": 0.18780824542045593, "learning_rate": 3.04e-05, "loss": 0.6602, "step": 38 }, { "epoch": 0.005959430033999312, "grad_norm": 0.3369501829147339, "learning_rate": 3.12e-05, "loss": 0.782, "step": 39 }, { "epoch": 0.006112235932306987, "grad_norm": 0.19364964962005615, "learning_rate": 3.2000000000000005e-05, "loss": 0.617, "step": 40 }, { "epoch": 0.006265041830614662, "grad_norm": 0.24052347242832184, "learning_rate": 3.2800000000000004e-05, "loss": 0.763, "step": 41 }, { "epoch": 0.0064178477289223365, "grad_norm": 0.3821535110473633, "learning_rate": 3.3600000000000004e-05, "loss": 0.7338, "step": 42 }, { "epoch": 0.006570653627230011, "grad_norm": 0.25892436504364014, "learning_rate": 3.4399999999999996e-05, "loss": 0.8406, "step": 43 }, { "epoch": 0.006723459525537686, "grad_norm": 0.21732579171657562, "learning_rate": 3.52e-05, "loss": 0.6437, "step": 44 }, { "epoch": 0.00687626542384536, "grad_norm": 0.21630685031414032, "learning_rate": 3.6e-05, "loss": 0.8539, "step": 45 }, { "epoch": 0.007029071322153035, "grad_norm": 0.2213805615901947, "learning_rate": 3.68e-05, "loss": 0.6046, "step": 46 }, { "epoch": 0.00718187722046071, "grad_norm": 0.29060035943984985, "learning_rate": 3.76e-05, "loss": 0.874, "step": 47 }, { "epoch": 0.007334683118768385, "grad_norm": 0.32261431217193604, "learning_rate": 3.8400000000000005e-05, "loss": 0.6077, "step": 48 }, { "epoch": 0.007487489017076059, "grad_norm": 0.3036012053489685, "learning_rate": 3.9200000000000004e-05, "loss": 0.7008, "step": 49 }, { "epoch": 0.007640294915383734, "grad_norm": 0.8190217018127441, "learning_rate": 4e-05, "loss": 0.9121, "step": 50 }, { "epoch": 0.007793100813691408, "grad_norm": 0.19872790575027466, "learning_rate": 4.08e-05, "loss": 0.6881, "step": 51 }, { "epoch": 0.007945906711999084, "grad_norm": 0.2575259506702423, "learning_rate": 4.16e-05, "loss": 0.6109, "step": 52 }, { "epoch": 0.008098712610306757, "grad_norm": 0.19350558519363403, "learning_rate": 4.24e-05, "loss": 0.7016, "step": 53 }, { "epoch": 0.008251518508614433, "grad_norm": 0.23708432912826538, "learning_rate": 4.32e-05, "loss": 0.7601, "step": 54 }, { "epoch": 0.008404324406922107, "grad_norm": 0.42168232798576355, "learning_rate": 4.4000000000000006e-05, "loss": 0.7687, "step": 55 }, { "epoch": 0.008557130305229782, "grad_norm": 0.2412991225719452, "learning_rate": 4.4800000000000005e-05, "loss": 0.7205, "step": 56 }, { "epoch": 0.008709936203537456, "grad_norm": 0.2611636519432068, "learning_rate": 4.5600000000000004e-05, "loss": 0.9159, "step": 57 }, { "epoch": 0.008862742101845131, "grad_norm": 0.4061261713504791, "learning_rate": 4.64e-05, "loss": 0.9651, "step": 58 }, { "epoch": 0.009015548000152807, "grad_norm": 0.2744627892971039, "learning_rate": 4.72e-05, "loss": 0.8742, "step": 59 }, { "epoch": 0.00916835389846048, "grad_norm": 0.19657334685325623, "learning_rate": 4.8e-05, "loss": 0.5806, "step": 60 }, { "epoch": 0.009321159796768156, "grad_norm": 0.24348127841949463, "learning_rate": 4.88e-05, "loss": 0.6486, "step": 61 }, { "epoch": 0.00947396569507583, "grad_norm": 0.21159450709819794, "learning_rate": 4.96e-05, "loss": 0.7542, "step": 62 }, { "epoch": 0.009626771593383505, "grad_norm": 0.23291338980197906, "learning_rate": 5.0400000000000005e-05, "loss": 0.5697, "step": 63 }, { "epoch": 0.009779577491691178, "grad_norm": 0.2656891644001007, "learning_rate": 5.1200000000000004e-05, "loss": 0.5819, "step": 64 }, { "epoch": 0.009932383389998854, "grad_norm": 0.21467728912830353, "learning_rate": 5.2000000000000004e-05, "loss": 0.7506, "step": 65 }, { "epoch": 0.01008518928830653, "grad_norm": 0.25314462184906006, "learning_rate": 5.28e-05, "loss": 0.4424, "step": 66 }, { "epoch": 0.010237995186614203, "grad_norm": 0.2386377453804016, "learning_rate": 5.360000000000001e-05, "loss": 0.77, "step": 67 }, { "epoch": 0.010390801084921878, "grad_norm": 0.24037358164787292, "learning_rate": 5.440000000000001e-05, "loss": 0.6511, "step": 68 }, { "epoch": 0.010543606983229552, "grad_norm": 0.2473539263010025, "learning_rate": 5.520000000000001e-05, "loss": 0.7086, "step": 69 }, { "epoch": 0.010696412881537228, "grad_norm": 0.23620954155921936, "learning_rate": 5.6000000000000006e-05, "loss": 0.82, "step": 70 }, { "epoch": 0.010849218779844901, "grad_norm": 0.20047105848789215, "learning_rate": 5.68e-05, "loss": 0.6568, "step": 71 }, { "epoch": 0.011002024678152577, "grad_norm": 0.21529246866703033, "learning_rate": 5.76e-05, "loss": 0.5856, "step": 72 }, { "epoch": 0.011154830576460252, "grad_norm": 0.2424297332763672, "learning_rate": 5.8399999999999997e-05, "loss": 0.6073, "step": 73 }, { "epoch": 0.011307636474767926, "grad_norm": 0.2489442229270935, "learning_rate": 5.92e-05, "loss": 0.6807, "step": 74 }, { "epoch": 0.011460442373075601, "grad_norm": 0.35431531071662903, "learning_rate": 6e-05, "loss": 0.6741, "step": 75 }, { "epoch": 0.011613248271383275, "grad_norm": 0.24680747091770172, "learning_rate": 6.08e-05, "loss": 0.7881, "step": 76 }, { "epoch": 0.01176605416969095, "grad_norm": 0.2189926654100418, "learning_rate": 6.16e-05, "loss": 0.4981, "step": 77 }, { "epoch": 0.011918860067998624, "grad_norm": 0.29724177718162537, "learning_rate": 6.24e-05, "loss": 0.8307, "step": 78 }, { "epoch": 0.0120716659663063, "grad_norm": 0.2065054178237915, "learning_rate": 6.32e-05, "loss": 0.8858, "step": 79 }, { "epoch": 0.012224471864613975, "grad_norm": 0.21010780334472656, "learning_rate": 6.400000000000001e-05, "loss": 0.5629, "step": 80 }, { "epoch": 0.012377277762921648, "grad_norm": 0.26801830530166626, "learning_rate": 6.48e-05, "loss": 0.7186, "step": 81 }, { "epoch": 0.012530083661229324, "grad_norm": 0.3203904628753662, "learning_rate": 6.560000000000001e-05, "loss": 0.8373, "step": 82 }, { "epoch": 0.012682889559536998, "grad_norm": 0.2379075288772583, "learning_rate": 6.64e-05, "loss": 0.6567, "step": 83 }, { "epoch": 0.012835695457844673, "grad_norm": 0.2070106416940689, "learning_rate": 6.720000000000001e-05, "loss": 0.5546, "step": 84 }, { "epoch": 0.012988501356152347, "grad_norm": 0.27992406487464905, "learning_rate": 6.800000000000001e-05, "loss": 0.7352, "step": 85 }, { "epoch": 0.013141307254460022, "grad_norm": 0.21248190104961395, "learning_rate": 6.879999999999999e-05, "loss": 0.6147, "step": 86 }, { "epoch": 0.013294113152767698, "grad_norm": 0.23391371965408325, "learning_rate": 6.96e-05, "loss": 0.7289, "step": 87 }, { "epoch": 0.013446919051075371, "grad_norm": 0.2129083275794983, "learning_rate": 7.04e-05, "loss": 0.8087, "step": 88 }, { "epoch": 0.013599724949383047, "grad_norm": 0.20840856432914734, "learning_rate": 7.12e-05, "loss": 0.6256, "step": 89 }, { "epoch": 0.01375253084769072, "grad_norm": 0.2114286720752716, "learning_rate": 7.2e-05, "loss": 0.5729, "step": 90 }, { "epoch": 0.013905336745998396, "grad_norm": 0.36645349860191345, "learning_rate": 7.280000000000001e-05, "loss": 0.7079, "step": 91 }, { "epoch": 0.01405814264430607, "grad_norm": 0.25490131974220276, "learning_rate": 7.36e-05, "loss": 0.7698, "step": 92 }, { "epoch": 0.014210948542613745, "grad_norm": 0.3339272141456604, "learning_rate": 7.44e-05, "loss": 0.6126, "step": 93 }, { "epoch": 0.01436375444092142, "grad_norm": 0.23325824737548828, "learning_rate": 7.52e-05, "loss": 0.8602, "step": 94 }, { "epoch": 0.014516560339229094, "grad_norm": 0.2818077504634857, "learning_rate": 7.6e-05, "loss": 0.7726, "step": 95 }, { "epoch": 0.01466936623753677, "grad_norm": 0.23820696771144867, "learning_rate": 7.680000000000001e-05, "loss": 0.7344, "step": 96 }, { "epoch": 0.014822172135844443, "grad_norm": 0.25046974420547485, "learning_rate": 7.76e-05, "loss": 0.5652, "step": 97 }, { "epoch": 0.014974978034152119, "grad_norm": 0.23637717962265015, "learning_rate": 7.840000000000001e-05, "loss": 0.9834, "step": 98 }, { "epoch": 0.015127783932459792, "grad_norm": 0.20385268330574036, "learning_rate": 7.920000000000001e-05, "loss": 0.685, "step": 99 }, { "epoch": 0.015280589830767468, "grad_norm": 0.22909928858280182, "learning_rate": 8e-05, "loss": 0.8559, "step": 100 }, { "epoch": 0.015433395729075143, "grad_norm": 0.22465063631534576, "learning_rate": 8.080000000000001e-05, "loss": 0.7211, "step": 101 }, { "epoch": 0.015586201627382817, "grad_norm": 0.24429404735565186, "learning_rate": 8.16e-05, "loss": 0.8082, "step": 102 }, { "epoch": 0.01573900752569049, "grad_norm": 0.23806914687156677, "learning_rate": 8.24e-05, "loss": 0.8902, "step": 103 }, { "epoch": 0.015891813423998168, "grad_norm": 0.6740613579750061, "learning_rate": 8.32e-05, "loss": 0.6925, "step": 104 }, { "epoch": 0.01604461932230584, "grad_norm": 0.21556046605110168, "learning_rate": 8.4e-05, "loss": 0.4671, "step": 105 }, { "epoch": 0.016197425220613515, "grad_norm": 0.23331165313720703, "learning_rate": 8.48e-05, "loss": 0.8063, "step": 106 }, { "epoch": 0.01635023111892119, "grad_norm": 0.2387675642967224, "learning_rate": 8.560000000000001e-05, "loss": 0.7208, "step": 107 }, { "epoch": 0.016503037017228866, "grad_norm": 0.24151624739170074, "learning_rate": 8.64e-05, "loss": 0.6599, "step": 108 }, { "epoch": 0.01665584291553654, "grad_norm": 0.24208898842334747, "learning_rate": 8.72e-05, "loss": 0.8813, "step": 109 }, { "epoch": 0.016808648813844213, "grad_norm": 0.2825816571712494, "learning_rate": 8.800000000000001e-05, "loss": 0.6972, "step": 110 }, { "epoch": 0.01696145471215189, "grad_norm": 0.20937465131282806, "learning_rate": 8.88e-05, "loss": 0.6302, "step": 111 }, { "epoch": 0.017114260610459564, "grad_norm": 0.5450260043144226, "learning_rate": 8.960000000000001e-05, "loss": 0.7495, "step": 112 }, { "epoch": 0.017267066508767238, "grad_norm": 0.23792274296283722, "learning_rate": 9.04e-05, "loss": 0.8158, "step": 113 }, { "epoch": 0.01741987240707491, "grad_norm": 0.2838549315929413, "learning_rate": 9.120000000000001e-05, "loss": 0.9494, "step": 114 }, { "epoch": 0.01757267830538259, "grad_norm": 0.19924430549144745, "learning_rate": 9.200000000000001e-05, "loss": 0.7721, "step": 115 }, { "epoch": 0.017725484203690262, "grad_norm": 0.18079274892807007, "learning_rate": 9.28e-05, "loss": 0.5387, "step": 116 }, { "epoch": 0.017878290101997936, "grad_norm": 0.20002222061157227, "learning_rate": 9.360000000000001e-05, "loss": 0.717, "step": 117 }, { "epoch": 0.018031096000305613, "grad_norm": 0.193673238158226, "learning_rate": 9.44e-05, "loss": 0.6842, "step": 118 }, { "epoch": 0.018183901898613287, "grad_norm": 0.21627160906791687, "learning_rate": 9.52e-05, "loss": 0.6531, "step": 119 }, { "epoch": 0.01833670779692096, "grad_norm": 0.2337784618139267, "learning_rate": 9.6e-05, "loss": 0.7066, "step": 120 }, { "epoch": 0.018489513695228634, "grad_norm": 0.21653355658054352, "learning_rate": 9.680000000000001e-05, "loss": 0.7411, "step": 121 }, { "epoch": 0.01864231959353631, "grad_norm": 0.26810961961746216, "learning_rate": 9.76e-05, "loss": 0.6168, "step": 122 }, { "epoch": 0.018795125491843985, "grad_norm": 0.21840594708919525, "learning_rate": 9.84e-05, "loss": 0.6318, "step": 123 }, { "epoch": 0.01894793139015166, "grad_norm": 0.26883718371391296, "learning_rate": 9.92e-05, "loss": 0.7906, "step": 124 }, { "epoch": 0.019100737288459336, "grad_norm": 0.40301695466041565, "learning_rate": 0.0001, "loss": 0.6718, "step": 125 }, { "epoch": 0.01925354318676701, "grad_norm": 0.36299192905426025, "learning_rate": 0.00010080000000000001, "loss": 0.9555, "step": 126 }, { "epoch": 0.019406349085074683, "grad_norm": 0.40861931443214417, "learning_rate": 0.0001016, "loss": 0.7246, "step": 127 }, { "epoch": 0.019559154983382357, "grad_norm": 0.2326318919658661, "learning_rate": 0.00010240000000000001, "loss": 0.7032, "step": 128 }, { "epoch": 0.019711960881690034, "grad_norm": 0.22199535369873047, "learning_rate": 0.0001032, "loss": 0.6007, "step": 129 }, { "epoch": 0.019864766779997708, "grad_norm": 0.2680632174015045, "learning_rate": 0.00010400000000000001, "loss": 0.6804, "step": 130 }, { "epoch": 0.02001757267830538, "grad_norm": 0.21533040702342987, "learning_rate": 0.00010480000000000001, "loss": 0.8305, "step": 131 }, { "epoch": 0.02017037857661306, "grad_norm": 0.22990071773529053, "learning_rate": 0.0001056, "loss": 0.7334, "step": 132 }, { "epoch": 0.020323184474920732, "grad_norm": 0.2372717261314392, "learning_rate": 0.00010640000000000001, "loss": 0.5291, "step": 133 }, { "epoch": 0.020475990373228406, "grad_norm": 0.19138963520526886, "learning_rate": 0.00010720000000000002, "loss": 0.6131, "step": 134 }, { "epoch": 0.02062879627153608, "grad_norm": 0.2097582370042801, "learning_rate": 0.00010800000000000001, "loss": 0.6131, "step": 135 }, { "epoch": 0.020781602169843757, "grad_norm": 0.19639591872692108, "learning_rate": 0.00010880000000000002, "loss": 0.5467, "step": 136 }, { "epoch": 0.02093440806815143, "grad_norm": 0.5305817723274231, "learning_rate": 0.00010960000000000001, "loss": 0.6327, "step": 137 }, { "epoch": 0.021087213966459104, "grad_norm": 0.2177964448928833, "learning_rate": 0.00011040000000000001, "loss": 0.6252, "step": 138 }, { "epoch": 0.02124001986476678, "grad_norm": 0.18753781914710999, "learning_rate": 0.00011120000000000002, "loss": 0.8267, "step": 139 }, { "epoch": 0.021392825763074455, "grad_norm": 0.26264771819114685, "learning_rate": 0.00011200000000000001, "loss": 0.737, "step": 140 }, { "epoch": 0.02154563166138213, "grad_norm": 0.2190270870923996, "learning_rate": 0.00011279999999999999, "loss": 0.6809, "step": 141 }, { "epoch": 0.021698437559689802, "grad_norm": 0.21061022579669952, "learning_rate": 0.0001136, "loss": 0.7108, "step": 142 }, { "epoch": 0.02185124345799748, "grad_norm": 0.23190730810165405, "learning_rate": 0.0001144, "loss": 0.625, "step": 143 }, { "epoch": 0.022004049356305153, "grad_norm": 0.21410205960273743, "learning_rate": 0.0001152, "loss": 0.7908, "step": 144 }, { "epoch": 0.022156855254612827, "grad_norm": 0.19211190938949585, "learning_rate": 0.000116, "loss": 0.6662, "step": 145 }, { "epoch": 0.022309661152920504, "grad_norm": 0.43506669998168945, "learning_rate": 0.00011679999999999999, "loss": 0.7876, "step": 146 }, { "epoch": 0.022462467051228178, "grad_norm": 0.24997620284557343, "learning_rate": 0.0001176, "loss": 0.5589, "step": 147 }, { "epoch": 0.02261527294953585, "grad_norm": 0.22067512571811676, "learning_rate": 0.0001184, "loss": 0.5908, "step": 148 }, { "epoch": 0.022768078847843525, "grad_norm": 0.5890689492225647, "learning_rate": 0.0001192, "loss": 0.7447, "step": 149 }, { "epoch": 0.022920884746151202, "grad_norm": 0.2859780192375183, "learning_rate": 0.00012, "loss": 0.7093, "step": 150 }, { "epoch": 0.023073690644458876, "grad_norm": 0.20324255526065826, "learning_rate": 0.0001208, "loss": 0.6664, "step": 151 }, { "epoch": 0.02322649654276655, "grad_norm": 0.2541416883468628, "learning_rate": 0.0001216, "loss": 0.861, "step": 152 }, { "epoch": 0.023379302441074227, "grad_norm": 0.22396203875541687, "learning_rate": 0.0001224, "loss": 0.6853, "step": 153 }, { "epoch": 0.0235321083393819, "grad_norm": 0.3173479735851288, "learning_rate": 0.0001232, "loss": 0.7057, "step": 154 }, { "epoch": 0.023684914237689574, "grad_norm": 0.2800653576850891, "learning_rate": 0.000124, "loss": 0.9188, "step": 155 }, { "epoch": 0.023837720135997248, "grad_norm": 0.18186135590076447, "learning_rate": 0.0001248, "loss": 0.7335, "step": 156 }, { "epoch": 0.023990526034304925, "grad_norm": 0.25458452105522156, "learning_rate": 0.00012560000000000002, "loss": 0.7153, "step": 157 }, { "epoch": 0.0241433319326126, "grad_norm": 0.21995219588279724, "learning_rate": 0.0001264, "loss": 0.43, "step": 158 }, { "epoch": 0.024296137830920272, "grad_norm": 1.0608121156692505, "learning_rate": 0.0001272, "loss": 0.6193, "step": 159 }, { "epoch": 0.02444894372922795, "grad_norm": 0.2779378592967987, "learning_rate": 0.00012800000000000002, "loss": 0.9149, "step": 160 }, { "epoch": 0.024601749627535623, "grad_norm": 0.1996106058359146, "learning_rate": 0.00012880000000000001, "loss": 0.6259, "step": 161 }, { "epoch": 0.024754555525843297, "grad_norm": 0.2813643515110016, "learning_rate": 0.0001296, "loss": 0.6519, "step": 162 }, { "epoch": 0.02490736142415097, "grad_norm": 0.16814516484737396, "learning_rate": 0.0001304, "loss": 0.736, "step": 163 }, { "epoch": 0.025060167322458648, "grad_norm": 0.2353413999080658, "learning_rate": 0.00013120000000000002, "loss": 0.6421, "step": 164 }, { "epoch": 0.02521297322076632, "grad_norm": 0.1907549351453781, "learning_rate": 0.000132, "loss": 0.6655, "step": 165 }, { "epoch": 0.025365779119073995, "grad_norm": 0.20261786878108978, "learning_rate": 0.0001328, "loss": 0.5768, "step": 166 }, { "epoch": 0.025518585017381672, "grad_norm": 0.19534656405448914, "learning_rate": 0.00013360000000000002, "loss": 0.6831, "step": 167 }, { "epoch": 0.025671390915689346, "grad_norm": 0.18376581370830536, "learning_rate": 0.00013440000000000001, "loss": 0.8075, "step": 168 }, { "epoch": 0.02582419681399702, "grad_norm": 0.23888923227787018, "learning_rate": 0.0001352, "loss": 0.6131, "step": 169 }, { "epoch": 0.025977002712304693, "grad_norm": 0.23357047140598297, "learning_rate": 0.00013600000000000003, "loss": 0.6604, "step": 170 }, { "epoch": 0.02612980861061237, "grad_norm": 0.3035596013069153, "learning_rate": 0.00013680000000000002, "loss": 0.6949, "step": 171 }, { "epoch": 0.026282614508920044, "grad_norm": 0.22164690494537354, "learning_rate": 0.00013759999999999998, "loss": 0.8732, "step": 172 }, { "epoch": 0.026435420407227718, "grad_norm": 0.21173541247844696, "learning_rate": 0.0001384, "loss": 0.7322, "step": 173 }, { "epoch": 0.026588226305535395, "grad_norm": 0.20340844988822937, "learning_rate": 0.0001392, "loss": 0.684, "step": 174 }, { "epoch": 0.02674103220384307, "grad_norm": 0.21223647892475128, "learning_rate": 0.00014, "loss": 0.7615, "step": 175 }, { "epoch": 0.026893838102150742, "grad_norm": 0.25785163044929504, "learning_rate": 0.0001408, "loss": 0.665, "step": 176 }, { "epoch": 0.027046644000458416, "grad_norm": 0.2169693559408188, "learning_rate": 0.0001416, "loss": 0.553, "step": 177 }, { "epoch": 0.027199449898766093, "grad_norm": 0.22600002586841583, "learning_rate": 0.0001424, "loss": 0.5225, "step": 178 }, { "epoch": 0.027352255797073767, "grad_norm": 0.21666403114795685, "learning_rate": 0.0001432, "loss": 0.5592, "step": 179 }, { "epoch": 0.02750506169538144, "grad_norm": 0.19408009946346283, "learning_rate": 0.000144, "loss": 0.6251, "step": 180 }, { "epoch": 0.027657867593689118, "grad_norm": 0.22444888949394226, "learning_rate": 0.0001448, "loss": 0.6119, "step": 181 }, { "epoch": 0.02781067349199679, "grad_norm": 0.1960359364748001, "learning_rate": 0.00014560000000000002, "loss": 0.8866, "step": 182 }, { "epoch": 0.027963479390304465, "grad_norm": 0.298685759305954, "learning_rate": 0.0001464, "loss": 0.5952, "step": 183 }, { "epoch": 0.02811628528861214, "grad_norm": 0.21745067834854126, "learning_rate": 0.0001472, "loss": 1.0509, "step": 184 }, { "epoch": 0.028269091186919816, "grad_norm": 0.44042158126831055, "learning_rate": 0.000148, "loss": 0.8793, "step": 185 }, { "epoch": 0.02842189708522749, "grad_norm": 0.22677303850650787, "learning_rate": 0.0001488, "loss": 0.6196, "step": 186 }, { "epoch": 0.028574702983535163, "grad_norm": 0.2111995816230774, "learning_rate": 0.0001496, "loss": 0.7332, "step": 187 }, { "epoch": 0.02872750888184284, "grad_norm": 0.19154132902622223, "learning_rate": 0.0001504, "loss": 0.766, "step": 188 }, { "epoch": 0.028880314780150514, "grad_norm": 0.24843589961528778, "learning_rate": 0.00015120000000000002, "loss": 0.929, "step": 189 }, { "epoch": 0.029033120678458188, "grad_norm": 0.22019292414188385, "learning_rate": 0.000152, "loss": 0.5629, "step": 190 }, { "epoch": 0.02918592657676586, "grad_norm": 0.23560045659542084, "learning_rate": 0.0001528, "loss": 0.6681, "step": 191 }, { "epoch": 0.02933873247507354, "grad_norm": 0.19246064126491547, "learning_rate": 0.00015360000000000002, "loss": 0.6445, "step": 192 }, { "epoch": 0.029491538373381213, "grad_norm": 0.21508120000362396, "learning_rate": 0.0001544, "loss": 0.6329, "step": 193 }, { "epoch": 0.029644344271688886, "grad_norm": 0.2356320321559906, "learning_rate": 0.0001552, "loss": 0.6302, "step": 194 }, { "epoch": 0.029797150169996563, "grad_norm": 0.22980546951293945, "learning_rate": 0.00015600000000000002, "loss": 0.5325, "step": 195 }, { "epoch": 0.029949956068304237, "grad_norm": 0.2617977559566498, "learning_rate": 0.00015680000000000002, "loss": 0.8915, "step": 196 }, { "epoch": 0.03010276196661191, "grad_norm": 0.19717738032341003, "learning_rate": 0.0001576, "loss": 0.7757, "step": 197 }, { "epoch": 0.030255567864919584, "grad_norm": 0.20106296241283417, "learning_rate": 0.00015840000000000003, "loss": 0.7742, "step": 198 }, { "epoch": 0.03040837376322726, "grad_norm": 0.226706400513649, "learning_rate": 0.00015920000000000002, "loss": 0.5814, "step": 199 }, { "epoch": 0.030561179661534935, "grad_norm": 0.18126145005226135, "learning_rate": 0.00016, "loss": 0.6697, "step": 200 }, { "epoch": 0.03071398555984261, "grad_norm": 1.2944668531417847, "learning_rate": 0.0001608, "loss": 0.6111, "step": 201 }, { "epoch": 0.030866791458150286, "grad_norm": 0.19455716013908386, "learning_rate": 0.00016160000000000002, "loss": 0.6571, "step": 202 }, { "epoch": 0.03101959735645796, "grad_norm": 0.23030945658683777, "learning_rate": 0.00016240000000000002, "loss": 0.8378, "step": 203 }, { "epoch": 0.031172403254765634, "grad_norm": 0.22586551308631897, "learning_rate": 0.0001632, "loss": 0.8245, "step": 204 }, { "epoch": 0.03132520915307331, "grad_norm": 0.2673279643058777, "learning_rate": 0.000164, "loss": 0.701, "step": 205 }, { "epoch": 0.03147801505138098, "grad_norm": 0.22940319776535034, "learning_rate": 0.0001648, "loss": 1.0499, "step": 206 }, { "epoch": 0.03163082094968866, "grad_norm": 0.33147504925727844, "learning_rate": 0.0001656, "loss": 0.6698, "step": 207 }, { "epoch": 0.031783626847996335, "grad_norm": 0.22897526621818542, "learning_rate": 0.0001664, "loss": 0.7872, "step": 208 }, { "epoch": 0.031936432746304005, "grad_norm": 0.23269681632518768, "learning_rate": 0.0001672, "loss": 0.6758, "step": 209 }, { "epoch": 0.03208923864461168, "grad_norm": 0.25892311334609985, "learning_rate": 0.000168, "loss": 0.6459, "step": 210 }, { "epoch": 0.03224204454291936, "grad_norm": 0.2470550239086151, "learning_rate": 0.0001688, "loss": 0.6778, "step": 211 }, { "epoch": 0.03239485044122703, "grad_norm": 0.23179148137569427, "learning_rate": 0.0001696, "loss": 0.6098, "step": 212 }, { "epoch": 0.03254765633953471, "grad_norm": 0.23430663347244263, "learning_rate": 0.0001704, "loss": 0.7551, "step": 213 }, { "epoch": 0.03270046223784238, "grad_norm": 0.18951766192913055, "learning_rate": 0.00017120000000000001, "loss": 0.7198, "step": 214 }, { "epoch": 0.032853268136150054, "grad_norm": 0.2654738128185272, "learning_rate": 0.000172, "loss": 0.6006, "step": 215 }, { "epoch": 0.03300607403445773, "grad_norm": 0.22690650820732117, "learning_rate": 0.0001728, "loss": 0.7018, "step": 216 }, { "epoch": 0.0331588799327654, "grad_norm": 0.22692647576332092, "learning_rate": 0.00017360000000000002, "loss": 0.7319, "step": 217 }, { "epoch": 0.03331168583107308, "grad_norm": 0.20025219023227692, "learning_rate": 0.0001744, "loss": 0.6302, "step": 218 }, { "epoch": 0.033464491729380756, "grad_norm": 0.19332559406757355, "learning_rate": 0.0001752, "loss": 0.6756, "step": 219 }, { "epoch": 0.033617297627688426, "grad_norm": 0.25213485956192017, "learning_rate": 0.00017600000000000002, "loss": 0.865, "step": 220 }, { "epoch": 0.033770103525996104, "grad_norm": 0.2248384654521942, "learning_rate": 0.00017680000000000001, "loss": 0.6936, "step": 221 }, { "epoch": 0.03392290942430378, "grad_norm": 0.23252415657043457, "learning_rate": 0.0001776, "loss": 0.8629, "step": 222 }, { "epoch": 0.03407571532261145, "grad_norm": 0.2784040570259094, "learning_rate": 0.0001784, "loss": 0.9894, "step": 223 }, { "epoch": 0.03422852122091913, "grad_norm": 0.23547817766666412, "learning_rate": 0.00017920000000000002, "loss": 0.6912, "step": 224 }, { "epoch": 0.034381327119226805, "grad_norm": 0.22327569127082825, "learning_rate": 0.00018, "loss": 0.7035, "step": 225 }, { "epoch": 0.034534133017534475, "grad_norm": 0.22189348936080933, "learning_rate": 0.0001808, "loss": 0.5271, "step": 226 }, { "epoch": 0.03468693891584215, "grad_norm": 0.19266308844089508, "learning_rate": 0.00018160000000000002, "loss": 0.4573, "step": 227 }, { "epoch": 0.03483974481414982, "grad_norm": 0.23664893209934235, "learning_rate": 0.00018240000000000002, "loss": 0.6511, "step": 228 }, { "epoch": 0.0349925507124575, "grad_norm": 0.20202231407165527, "learning_rate": 0.0001832, "loss": 0.6569, "step": 229 }, { "epoch": 0.03514535661076518, "grad_norm": 0.23481759428977966, "learning_rate": 0.00018400000000000003, "loss": 0.6054, "step": 230 }, { "epoch": 0.03529816250907285, "grad_norm": 0.2738634943962097, "learning_rate": 0.00018480000000000002, "loss": 0.8319, "step": 231 }, { "epoch": 0.035450968407380525, "grad_norm": 0.24060329794883728, "learning_rate": 0.0001856, "loss": 0.6179, "step": 232 }, { "epoch": 0.0356037743056882, "grad_norm": 0.2128535658121109, "learning_rate": 0.00018640000000000003, "loss": 0.7123, "step": 233 }, { "epoch": 0.03575658020399587, "grad_norm": 0.1951960027217865, "learning_rate": 0.00018720000000000002, "loss": 0.5302, "step": 234 }, { "epoch": 0.03590938610230355, "grad_norm": 0.3803926110267639, "learning_rate": 0.000188, "loss": 0.6614, "step": 235 }, { "epoch": 0.036062192000611226, "grad_norm": 0.19294387102127075, "learning_rate": 0.0001888, "loss": 0.6031, "step": 236 }, { "epoch": 0.036214997898918896, "grad_norm": 0.24113322794437408, "learning_rate": 0.0001896, "loss": 0.8938, "step": 237 }, { "epoch": 0.036367803797226574, "grad_norm": 0.19767731428146362, "learning_rate": 0.0001904, "loss": 0.5464, "step": 238 }, { "epoch": 0.03652060969553425, "grad_norm": 0.2186284363269806, "learning_rate": 0.0001912, "loss": 0.7288, "step": 239 }, { "epoch": 0.03667341559384192, "grad_norm": 0.5541898608207703, "learning_rate": 0.000192, "loss": 0.6484, "step": 240 }, { "epoch": 0.0368262214921496, "grad_norm": 0.22552861273288727, "learning_rate": 0.0001928, "loss": 0.834, "step": 241 }, { "epoch": 0.03697902739045727, "grad_norm": 0.3038541078567505, "learning_rate": 0.00019360000000000002, "loss": 0.6914, "step": 242 }, { "epoch": 0.037131833288764945, "grad_norm": 0.27954229712486267, "learning_rate": 0.0001944, "loss": 0.6588, "step": 243 }, { "epoch": 0.03728463918707262, "grad_norm": 0.5024107098579407, "learning_rate": 0.0001952, "loss": 0.688, "step": 244 }, { "epoch": 0.03743744508538029, "grad_norm": 0.23389217257499695, "learning_rate": 0.000196, "loss": 0.7403, "step": 245 }, { "epoch": 0.03759025098368797, "grad_norm": 0.22935818135738373, "learning_rate": 0.0001968, "loss": 0.6697, "step": 246 }, { "epoch": 0.03774305688199565, "grad_norm": 0.2132337987422943, "learning_rate": 0.0001976, "loss": 0.7081, "step": 247 }, { "epoch": 0.03789586278030332, "grad_norm": 0.22637519240379333, "learning_rate": 0.0001984, "loss": 0.5676, "step": 248 }, { "epoch": 0.038048668678610995, "grad_norm": 0.2421012669801712, "learning_rate": 0.00019920000000000002, "loss": 0.6939, "step": 249 }, { "epoch": 0.03820147457691867, "grad_norm": 0.36056315898895264, "learning_rate": 0.0002, "loss": 0.7907, "step": 250 }, { "epoch": 0.03835428047522634, "grad_norm": 0.2190164178609848, "learning_rate": 0.00019999998754291972, "loss": 0.6726, "step": 251 }, { "epoch": 0.03850708637353402, "grad_norm": 0.2309923619031906, "learning_rate": 0.00019999995017168197, "loss": 0.6444, "step": 252 }, { "epoch": 0.038659892271841696, "grad_norm": 0.32520991563796997, "learning_rate": 0.00019999988788629606, "loss": 0.748, "step": 253 }, { "epoch": 0.038812698170149366, "grad_norm": 0.2230103313922882, "learning_rate": 0.00019999980068677745, "loss": 0.5999, "step": 254 }, { "epoch": 0.038965504068457044, "grad_norm": 0.21019278466701508, "learning_rate": 0.00019999968857314798, "loss": 0.6995, "step": 255 }, { "epoch": 0.039118309966764714, "grad_norm": 1.701196312904358, "learning_rate": 0.00019999955154543554, "loss": 0.7642, "step": 256 }, { "epoch": 0.03927111586507239, "grad_norm": 0.3295258581638336, "learning_rate": 0.0001999993896036742, "loss": 0.6656, "step": 257 }, { "epoch": 0.03942392176338007, "grad_norm": 0.2401845008134842, "learning_rate": 0.00019999920274790437, "loss": 0.6979, "step": 258 }, { "epoch": 0.03957672766168774, "grad_norm": 0.2586315870285034, "learning_rate": 0.00019999899097817263, "loss": 0.6154, "step": 259 }, { "epoch": 0.039729533559995416, "grad_norm": 0.2295006513595581, "learning_rate": 0.00019999875429453168, "loss": 0.7028, "step": 260 }, { "epoch": 0.03988233945830309, "grad_norm": 0.23985904455184937, "learning_rate": 0.0001999984926970405, "loss": 0.553, "step": 261 }, { "epoch": 0.04003514535661076, "grad_norm": 0.20894868671894073, "learning_rate": 0.00019999820618576427, "loss": 0.5702, "step": 262 }, { "epoch": 0.04018795125491844, "grad_norm": 0.257010817527771, "learning_rate": 0.00019999789476077441, "loss": 0.9264, "step": 263 }, { "epoch": 0.04034075715322612, "grad_norm": 0.28151246905326843, "learning_rate": 0.00019999755842214846, "loss": 0.6658, "step": 264 }, { "epoch": 0.04049356305153379, "grad_norm": 0.22812116146087646, "learning_rate": 0.00019999719716997025, "loss": 0.736, "step": 265 }, { "epoch": 0.040646368949841465, "grad_norm": 0.20041359961032867, "learning_rate": 0.00019999681100432977, "loss": 0.7334, "step": 266 }, { "epoch": 0.04079917484814914, "grad_norm": 0.2387220561504364, "learning_rate": 0.0001999963999253232, "loss": 0.5816, "step": 267 }, { "epoch": 0.04095198074645681, "grad_norm": 0.23148131370544434, "learning_rate": 0.00019999596393305296, "loss": 0.7564, "step": 268 }, { "epoch": 0.04110478664476449, "grad_norm": 0.30160292983055115, "learning_rate": 0.00019999550302762776, "loss": 0.805, "step": 269 }, { "epoch": 0.04125759254307216, "grad_norm": 0.25093773007392883, "learning_rate": 0.0001999950172091623, "loss": 0.7729, "step": 270 }, { "epoch": 0.041410398441379836, "grad_norm": 0.18739305436611176, "learning_rate": 0.00019999450647777774, "loss": 0.6284, "step": 271 }, { "epoch": 0.041563204339687514, "grad_norm": 0.2319766879081726, "learning_rate": 0.00019999397083360126, "loss": 0.4766, "step": 272 }, { "epoch": 0.041716010237995184, "grad_norm": 0.24282965064048767, "learning_rate": 0.0001999934102767663, "loss": 0.8018, "step": 273 }, { "epoch": 0.04186881613630286, "grad_norm": 0.21952565014362335, "learning_rate": 0.00019999282480741255, "loss": 0.6215, "step": 274 }, { "epoch": 0.04202162203461054, "grad_norm": 0.5087156295776367, "learning_rate": 0.00019999221442568586, "loss": 0.7481, "step": 275 }, { "epoch": 0.04217442793291821, "grad_norm": 0.2524573504924774, "learning_rate": 0.00019999157913173828, "loss": 0.7204, "step": 276 }, { "epoch": 0.042327233831225886, "grad_norm": 0.2968989312648773, "learning_rate": 0.00019999091892572817, "loss": 0.5803, "step": 277 }, { "epoch": 0.04248003972953356, "grad_norm": 0.24576199054718018, "learning_rate": 0.0001999902338078199, "loss": 0.8048, "step": 278 }, { "epoch": 0.04263284562784123, "grad_norm": 0.2983776032924652, "learning_rate": 0.00019998952377818426, "loss": 0.6864, "step": 279 }, { "epoch": 0.04278565152614891, "grad_norm": 0.21671080589294434, "learning_rate": 0.0001999887888369981, "loss": 0.7487, "step": 280 }, { "epoch": 0.04293845742445659, "grad_norm": 0.24726702272891998, "learning_rate": 0.00019998802898444452, "loss": 0.6923, "step": 281 }, { "epoch": 0.04309126332276426, "grad_norm": 0.20502431690692902, "learning_rate": 0.00019998724422071282, "loss": 0.8878, "step": 282 }, { "epoch": 0.043244069221071935, "grad_norm": 0.20872731506824493, "learning_rate": 0.00019998643454599856, "loss": 0.7725, "step": 283 }, { "epoch": 0.043396875119379605, "grad_norm": 0.3191676139831543, "learning_rate": 0.00019998559996050347, "loss": 0.8263, "step": 284 }, { "epoch": 0.04354968101768728, "grad_norm": 0.39908912777900696, "learning_rate": 0.00019998474046443546, "loss": 0.6558, "step": 285 }, { "epoch": 0.04370248691599496, "grad_norm": 0.1785343736410141, "learning_rate": 0.0001999838560580086, "loss": 0.8449, "step": 286 }, { "epoch": 0.04385529281430263, "grad_norm": 0.2524084448814392, "learning_rate": 0.00019998294674144332, "loss": 0.7286, "step": 287 }, { "epoch": 0.04400809871261031, "grad_norm": 0.22183741629123688, "learning_rate": 0.00019998201251496617, "loss": 0.6983, "step": 288 }, { "epoch": 0.044160904610917984, "grad_norm": 0.351361483335495, "learning_rate": 0.00019998105337880984, "loss": 0.8788, "step": 289 }, { "epoch": 0.044313710509225654, "grad_norm": 0.28324541449546814, "learning_rate": 0.00019998006933321332, "loss": 0.6135, "step": 290 }, { "epoch": 0.04446651640753333, "grad_norm": 0.2555903494358063, "learning_rate": 0.00019997906037842183, "loss": 0.5704, "step": 291 }, { "epoch": 0.04461932230584101, "grad_norm": 0.2161989063024521, "learning_rate": 0.00019997802651468665, "loss": 0.6411, "step": 292 }, { "epoch": 0.04477212820414868, "grad_norm": 0.2448689043521881, "learning_rate": 0.00019997696774226543, "loss": 0.6679, "step": 293 }, { "epoch": 0.044924934102456356, "grad_norm": 0.2556678056716919, "learning_rate": 0.00019997588406142188, "loss": 0.8401, "step": 294 }, { "epoch": 0.04507774000076403, "grad_norm": 0.24582459032535553, "learning_rate": 0.00019997477547242608, "loss": 0.8461, "step": 295 }, { "epoch": 0.0452305458990717, "grad_norm": 0.19258172810077667, "learning_rate": 0.0001999736419755542, "loss": 0.749, "step": 296 }, { "epoch": 0.04538335179737938, "grad_norm": 0.2357243299484253, "learning_rate": 0.0001999724835710886, "loss": 0.6866, "step": 297 }, { "epoch": 0.04553615769568705, "grad_norm": 0.2215932160615921, "learning_rate": 0.00019997130025931788, "loss": 0.7202, "step": 298 }, { "epoch": 0.04568896359399473, "grad_norm": 0.22760243713855743, "learning_rate": 0.00019997009204053695, "loss": 0.7233, "step": 299 }, { "epoch": 0.045841769492302405, "grad_norm": 0.23306317627429962, "learning_rate": 0.00019996885891504672, "loss": 0.6106, "step": 300 }, { "epoch": 0.045994575390610075, "grad_norm": 0.2906085252761841, "learning_rate": 0.00019996760088315444, "loss": 0.8809, "step": 301 }, { "epoch": 0.04614738128891775, "grad_norm": 0.21287627518177032, "learning_rate": 0.0001999663179451736, "loss": 0.7354, "step": 302 }, { "epoch": 0.04630018718722543, "grad_norm": 0.25051966309547424, "learning_rate": 0.00019996501010142377, "loss": 0.5903, "step": 303 }, { "epoch": 0.0464529930855331, "grad_norm": 0.2597728371620178, "learning_rate": 0.00019996367735223078, "loss": 0.9319, "step": 304 }, { "epoch": 0.04660579898384078, "grad_norm": 0.26951470971107483, "learning_rate": 0.00019996231969792672, "loss": 0.7461, "step": 305 }, { "epoch": 0.046758604882148454, "grad_norm": 1.7744990587234497, "learning_rate": 0.00019996093713884981, "loss": 0.768, "step": 306 }, { "epoch": 0.046911410780456124, "grad_norm": 0.3497793674468994, "learning_rate": 0.0001999595296753445, "loss": 0.9868, "step": 307 }, { "epoch": 0.0470642166787638, "grad_norm": 0.25556042790412903, "learning_rate": 0.00019995809730776146, "loss": 0.7797, "step": 308 }, { "epoch": 0.04721702257707148, "grad_norm": 0.21160712838172913, "learning_rate": 0.00019995664003645756, "loss": 0.5969, "step": 309 }, { "epoch": 0.04736982847537915, "grad_norm": 0.20472657680511475, "learning_rate": 0.00019995515786179583, "loss": 0.4852, "step": 310 }, { "epoch": 0.047522634373686826, "grad_norm": 0.2371402531862259, "learning_rate": 0.0001999536507841456, "loss": 0.6591, "step": 311 }, { "epoch": 0.047675440271994496, "grad_norm": 0.2097351998090744, "learning_rate": 0.0001999521188038823, "loss": 0.6849, "step": 312 }, { "epoch": 0.04782824617030217, "grad_norm": 0.27859818935394287, "learning_rate": 0.0001999505619213876, "loss": 0.6925, "step": 313 }, { "epoch": 0.04798105206860985, "grad_norm": 0.22203388810157776, "learning_rate": 0.0001999489801370494, "loss": 0.8685, "step": 314 }, { "epoch": 0.04813385796691752, "grad_norm": 0.23791301250457764, "learning_rate": 0.00019994737345126185, "loss": 0.7374, "step": 315 }, { "epoch": 0.0482866638652252, "grad_norm": 0.21271444857120514, "learning_rate": 0.00019994574186442513, "loss": 0.6281, "step": 316 }, { "epoch": 0.048439469763532875, "grad_norm": 0.24780192971229553, "learning_rate": 0.00019994408537694585, "loss": 0.6525, "step": 317 }, { "epoch": 0.048592275661840545, "grad_norm": 0.2564080059528351, "learning_rate": 0.0001999424039892366, "loss": 0.8889, "step": 318 }, { "epoch": 0.04874508156014822, "grad_norm": 0.22218337655067444, "learning_rate": 0.00019994069770171637, "loss": 0.7197, "step": 319 }, { "epoch": 0.0488978874584559, "grad_norm": 0.2387639880180359, "learning_rate": 0.00019993896651481022, "loss": 0.7767, "step": 320 }, { "epoch": 0.04905069335676357, "grad_norm": 0.2468961626291275, "learning_rate": 0.0001999372104289495, "loss": 0.568, "step": 321 }, { "epoch": 0.04920349925507125, "grad_norm": 0.23209026455879211, "learning_rate": 0.00019993542944457166, "loss": 0.7791, "step": 322 }, { "epoch": 0.049356305153378924, "grad_norm": 0.21054136753082275, "learning_rate": 0.0001999336235621205, "loss": 0.5405, "step": 323 }, { "epoch": 0.049509111051686594, "grad_norm": 0.36745285987854004, "learning_rate": 0.00019993179278204583, "loss": 0.604, "step": 324 }, { "epoch": 0.04966191694999427, "grad_norm": 0.25285887718200684, "learning_rate": 0.0001999299371048039, "loss": 0.6, "step": 325 }, { "epoch": 0.04981472284830194, "grad_norm": 0.25242236256599426, "learning_rate": 0.00019992805653085697, "loss": 0.5935, "step": 326 }, { "epoch": 0.04996752874660962, "grad_norm": 0.2363995909690857, "learning_rate": 0.00019992615106067353, "loss": 0.9162, "step": 327 }, { "epoch": 0.050120334644917296, "grad_norm": 0.24165673553943634, "learning_rate": 0.0001999242206947284, "loss": 0.7828, "step": 328 }, { "epoch": 0.050273140543224966, "grad_norm": 0.23123116791248322, "learning_rate": 0.00019992226543350246, "loss": 0.7983, "step": 329 }, { "epoch": 0.05042594644153264, "grad_norm": 0.26019179821014404, "learning_rate": 0.00019992028527748287, "loss": 0.7911, "step": 330 }, { "epoch": 0.05057875233984032, "grad_norm": 0.22852295637130737, "learning_rate": 0.00019991828022716295, "loss": 0.6836, "step": 331 }, { "epoch": 0.05073155823814799, "grad_norm": 0.313212513923645, "learning_rate": 0.00019991625028304224, "loss": 0.7387, "step": 332 }, { "epoch": 0.05088436413645567, "grad_norm": 0.22907061874866486, "learning_rate": 0.00019991419544562652, "loss": 0.7111, "step": 333 }, { "epoch": 0.051037170034763345, "grad_norm": 0.24576780200004578, "learning_rate": 0.0001999121157154277, "loss": 0.6846, "step": 334 }, { "epoch": 0.051189975933071015, "grad_norm": 0.27017709612846375, "learning_rate": 0.00019991001109296392, "loss": 0.6491, "step": 335 }, { "epoch": 0.05134278183137869, "grad_norm": 0.30673229694366455, "learning_rate": 0.00019990788157875955, "loss": 0.7643, "step": 336 }, { "epoch": 0.05149558772968637, "grad_norm": 0.26924848556518555, "learning_rate": 0.00019990572717334514, "loss": 0.6674, "step": 337 }, { "epoch": 0.05164839362799404, "grad_norm": 0.23859171569347382, "learning_rate": 0.00019990354787725742, "loss": 0.6755, "step": 338 }, { "epoch": 0.05180119952630172, "grad_norm": 0.3418155312538147, "learning_rate": 0.00019990134369103938, "loss": 0.9036, "step": 339 }, { "epoch": 0.05195400542460939, "grad_norm": 0.22035688161849976, "learning_rate": 0.00019989911461524017, "loss": 0.7557, "step": 340 }, { "epoch": 0.052106811322917064, "grad_norm": 0.21298931539058685, "learning_rate": 0.0001998968606504151, "loss": 0.7085, "step": 341 }, { "epoch": 0.05225961722122474, "grad_norm": 0.30325108766555786, "learning_rate": 0.0001998945817971258, "loss": 0.6753, "step": 342 }, { "epoch": 0.05241242311953241, "grad_norm": 0.20659081637859344, "learning_rate": 0.00019989227805593994, "loss": 0.7346, "step": 343 }, { "epoch": 0.05256522901784009, "grad_norm": 0.18806900084018707, "learning_rate": 0.00019988994942743153, "loss": 0.6469, "step": 344 }, { "epoch": 0.052718034916147766, "grad_norm": 0.23977094888687134, "learning_rate": 0.00019988759591218073, "loss": 0.6006, "step": 345 }, { "epoch": 0.052870840814455436, "grad_norm": 0.19300661981105804, "learning_rate": 0.0001998852175107739, "loss": 0.6165, "step": 346 }, { "epoch": 0.05302364671276311, "grad_norm": 0.2542365491390228, "learning_rate": 0.00019988281422380358, "loss": 0.615, "step": 347 }, { "epoch": 0.05317645261107079, "grad_norm": 0.33191439509391785, "learning_rate": 0.00019988038605186855, "loss": 0.5821, "step": 348 }, { "epoch": 0.05332925850937846, "grad_norm": 0.2477361559867859, "learning_rate": 0.0001998779329955737, "loss": 0.7881, "step": 349 }, { "epoch": 0.05348206440768614, "grad_norm": 0.45226341485977173, "learning_rate": 0.00019987545505553028, "loss": 0.8674, "step": 350 }, { "epoch": 0.053634870305993815, "grad_norm": 0.2678709030151367, "learning_rate": 0.00019987295223235566, "loss": 0.9906, "step": 351 }, { "epoch": 0.053787676204301485, "grad_norm": 0.2115461379289627, "learning_rate": 0.00019987042452667328, "loss": 0.5388, "step": 352 }, { "epoch": 0.05394048210260916, "grad_norm": 0.24016588926315308, "learning_rate": 0.00019986787193911298, "loss": 0.6162, "step": 353 }, { "epoch": 0.05409328800091683, "grad_norm": 0.20813335478305817, "learning_rate": 0.00019986529447031074, "loss": 0.7018, "step": 354 }, { "epoch": 0.05424609389922451, "grad_norm": 0.22935904562473297, "learning_rate": 0.00019986269212090863, "loss": 0.6674, "step": 355 }, { "epoch": 0.05439889979753219, "grad_norm": 0.210664764046669, "learning_rate": 0.00019986006489155508, "loss": 0.8623, "step": 356 }, { "epoch": 0.05455170569583986, "grad_norm": 0.25411364436149597, "learning_rate": 0.00019985741278290457, "loss": 0.6944, "step": 357 }, { "epoch": 0.054704511594147534, "grad_norm": 0.21661530435085297, "learning_rate": 0.00019985473579561794, "loss": 0.7631, "step": 358 }, { "epoch": 0.05485731749245521, "grad_norm": 0.21907834708690643, "learning_rate": 0.00019985203393036206, "loss": 0.751, "step": 359 }, { "epoch": 0.05501012339076288, "grad_norm": 0.24429883062839508, "learning_rate": 0.00019984930718781012, "loss": 0.4515, "step": 360 }, { "epoch": 0.05516292928907056, "grad_norm": 0.23436371982097626, "learning_rate": 0.00019984655556864146, "loss": 0.6239, "step": 361 }, { "epoch": 0.055315735187378236, "grad_norm": 0.2612748146057129, "learning_rate": 0.0001998437790735416, "loss": 0.6628, "step": 362 }, { "epoch": 0.055468541085685906, "grad_norm": 0.2066948562860489, "learning_rate": 0.00019984097770320235, "loss": 0.6496, "step": 363 }, { "epoch": 0.05562134698399358, "grad_norm": 0.2153586447238922, "learning_rate": 0.00019983815145832153, "loss": 0.5914, "step": 364 }, { "epoch": 0.05577415288230126, "grad_norm": 0.2186291664838791, "learning_rate": 0.00019983530033960335, "loss": 0.7676, "step": 365 }, { "epoch": 0.05592695878060893, "grad_norm": 0.26197004318237305, "learning_rate": 0.00019983242434775815, "loss": 0.637, "step": 366 }, { "epoch": 0.05607976467891661, "grad_norm": 0.2043347805738449, "learning_rate": 0.00019982952348350245, "loss": 0.6166, "step": 367 }, { "epoch": 0.05623257057722428, "grad_norm": 0.29757070541381836, "learning_rate": 0.00019982659774755895, "loss": 0.7035, "step": 368 }, { "epoch": 0.056385376475531955, "grad_norm": 0.20683075487613678, "learning_rate": 0.0001998236471406566, "loss": 0.7503, "step": 369 }, { "epoch": 0.05653818237383963, "grad_norm": 0.25130218267440796, "learning_rate": 0.0001998206716635305, "loss": 0.6997, "step": 370 }, { "epoch": 0.0566909882721473, "grad_norm": 0.23295602202415466, "learning_rate": 0.00019981767131692198, "loss": 0.6887, "step": 371 }, { "epoch": 0.05684379417045498, "grad_norm": 0.3534759283065796, "learning_rate": 0.00019981464610157855, "loss": 0.8564, "step": 372 }, { "epoch": 0.05699660006876266, "grad_norm": 0.25935059785842896, "learning_rate": 0.0001998115960182539, "loss": 0.7129, "step": 373 }, { "epoch": 0.05714940596707033, "grad_norm": 0.2791917026042938, "learning_rate": 0.00019980852106770797, "loss": 0.6854, "step": 374 }, { "epoch": 0.057302211865378004, "grad_norm": 0.31893056631088257, "learning_rate": 0.0001998054212507068, "loss": 0.8234, "step": 375 }, { "epoch": 0.05745501776368568, "grad_norm": 0.24410194158554077, "learning_rate": 0.00019980229656802273, "loss": 0.5903, "step": 376 }, { "epoch": 0.05760782366199335, "grad_norm": 0.21686480939388275, "learning_rate": 0.00019979914702043423, "loss": 0.7364, "step": 377 }, { "epoch": 0.05776062956030103, "grad_norm": 0.25539955496788025, "learning_rate": 0.00019979597260872601, "loss": 0.8077, "step": 378 }, { "epoch": 0.0579134354586087, "grad_norm": 0.26865750551223755, "learning_rate": 0.00019979277333368888, "loss": 0.742, "step": 379 }, { "epoch": 0.058066241356916376, "grad_norm": 0.20411862432956696, "learning_rate": 0.00019978954919612, "loss": 0.6866, "step": 380 }, { "epoch": 0.05821904725522405, "grad_norm": 0.21796946227550507, "learning_rate": 0.0001997863001968226, "loss": 0.7437, "step": 381 }, { "epoch": 0.05837185315353172, "grad_norm": 0.27662667632102966, "learning_rate": 0.0001997830263366061, "loss": 0.6777, "step": 382 }, { "epoch": 0.0585246590518394, "grad_norm": 0.283934623003006, "learning_rate": 0.0001997797276162862, "loss": 0.6572, "step": 383 }, { "epoch": 0.05867746495014708, "grad_norm": 0.22082751989364624, "learning_rate": 0.00019977640403668476, "loss": 0.6067, "step": 384 }, { "epoch": 0.05883027084845475, "grad_norm": 0.2193826287984848, "learning_rate": 0.00019977305559862977, "loss": 0.7021, "step": 385 }, { "epoch": 0.058983076746762425, "grad_norm": 0.24583138525485992, "learning_rate": 0.00019976968230295554, "loss": 0.8803, "step": 386 }, { "epoch": 0.0591358826450701, "grad_norm": 0.30317580699920654, "learning_rate": 0.00019976628415050246, "loss": 0.8169, "step": 387 }, { "epoch": 0.05928868854337777, "grad_norm": 0.2399805188179016, "learning_rate": 0.0001997628611421171, "loss": 0.8351, "step": 388 }, { "epoch": 0.05944149444168545, "grad_norm": 0.2840050458908081, "learning_rate": 0.00019975941327865233, "loss": 0.738, "step": 389 }, { "epoch": 0.05959430033999313, "grad_norm": 0.3253823518753052, "learning_rate": 0.00019975594056096717, "loss": 0.5278, "step": 390 }, { "epoch": 0.0597471062383008, "grad_norm": 0.24650508165359497, "learning_rate": 0.00019975244298992676, "loss": 0.7123, "step": 391 }, { "epoch": 0.059899912136608474, "grad_norm": 0.3255913555622101, "learning_rate": 0.00019974892056640257, "loss": 0.6411, "step": 392 }, { "epoch": 0.060052718034916144, "grad_norm": 0.21249303221702576, "learning_rate": 0.00019974537329127209, "loss": 0.9045, "step": 393 }, { "epoch": 0.06020552393322382, "grad_norm": 0.23240245878696442, "learning_rate": 0.0001997418011654192, "loss": 0.7853, "step": 394 }, { "epoch": 0.0603583298315315, "grad_norm": 0.42459583282470703, "learning_rate": 0.00019973820418973376, "loss": 0.7974, "step": 395 }, { "epoch": 0.06051113572983917, "grad_norm": 0.34886565804481506, "learning_rate": 0.000199734582365112, "loss": 0.7454, "step": 396 }, { "epoch": 0.060663941628146846, "grad_norm": 0.23069462180137634, "learning_rate": 0.0001997309356924562, "loss": 0.7182, "step": 397 }, { "epoch": 0.06081674752645452, "grad_norm": 0.25458237528800964, "learning_rate": 0.00019972726417267497, "loss": 0.6431, "step": 398 }, { "epoch": 0.06096955342476219, "grad_norm": 0.21412460505962372, "learning_rate": 0.000199723567806683, "loss": 0.6802, "step": 399 }, { "epoch": 0.06112235932306987, "grad_norm": 0.21457818150520325, "learning_rate": 0.0001997198465954012, "loss": 0.6221, "step": 400 }, { "epoch": 0.06127516522137755, "grad_norm": 0.2284182459115982, "learning_rate": 0.0001997161005397567, "loss": 0.6552, "step": 401 }, { "epoch": 0.06142797111968522, "grad_norm": 0.2774156630039215, "learning_rate": 0.00019971232964068283, "loss": 0.647, "step": 402 }, { "epoch": 0.061580777017992895, "grad_norm": 0.2522546350955963, "learning_rate": 0.000199708533899119, "loss": 0.5137, "step": 403 }, { "epoch": 0.06173358291630057, "grad_norm": 0.28763559460639954, "learning_rate": 0.00019970471331601095, "loss": 0.6016, "step": 404 }, { "epoch": 0.06188638881460824, "grad_norm": 0.7510436177253723, "learning_rate": 0.0001997008678923105, "loss": 0.533, "step": 405 }, { "epoch": 0.06203919471291592, "grad_norm": 0.25240078568458557, "learning_rate": 0.00019969699762897576, "loss": 0.6597, "step": 406 }, { "epoch": 0.06219200061122359, "grad_norm": 0.2635802924633026, "learning_rate": 0.0001996931025269709, "loss": 0.7477, "step": 407 }, { "epoch": 0.06234480650953127, "grad_norm": 0.3156009614467621, "learning_rate": 0.00019968918258726642, "loss": 0.8181, "step": 408 }, { "epoch": 0.062497612407838944, "grad_norm": 0.24074885249137878, "learning_rate": 0.0001996852378108389, "loss": 0.6273, "step": 409 }, { "epoch": 0.06265041830614662, "grad_norm": 0.336436003446579, "learning_rate": 0.00019968126819867117, "loss": 0.7783, "step": 410 }, { "epoch": 0.06280322420445429, "grad_norm": 0.24637262523174286, "learning_rate": 0.00019967727375175222, "loss": 0.68, "step": 411 }, { "epoch": 0.06295603010276196, "grad_norm": 0.3599497377872467, "learning_rate": 0.00019967325447107722, "loss": 0.6744, "step": 412 }, { "epoch": 0.06310883600106965, "grad_norm": 0.29385289549827576, "learning_rate": 0.00019966921035764756, "loss": 0.9023, "step": 413 }, { "epoch": 0.06326164189937732, "grad_norm": 0.2313985675573349, "learning_rate": 0.00019966514141247078, "loss": 0.6057, "step": 414 }, { "epoch": 0.06341444779768499, "grad_norm": 0.27997085452079773, "learning_rate": 0.00019966104763656064, "loss": 0.6791, "step": 415 }, { "epoch": 0.06356725369599267, "grad_norm": 0.2614806890487671, "learning_rate": 0.00019965692903093705, "loss": 0.6162, "step": 416 }, { "epoch": 0.06372005959430034, "grad_norm": 0.2659102976322174, "learning_rate": 0.00019965278559662614, "loss": 0.5931, "step": 417 }, { "epoch": 0.06387286549260801, "grad_norm": 0.22393980622291565, "learning_rate": 0.0001996486173346602, "loss": 0.6176, "step": 418 }, { "epoch": 0.0640256713909157, "grad_norm": 0.25976166129112244, "learning_rate": 0.00019964442424607774, "loss": 0.6612, "step": 419 }, { "epoch": 0.06417847728922337, "grad_norm": 0.3219575881958008, "learning_rate": 0.00019964020633192342, "loss": 0.8215, "step": 420 }, { "epoch": 0.06433128318753104, "grad_norm": 0.24153344333171844, "learning_rate": 0.0001996359635932481, "loss": 0.8909, "step": 421 }, { "epoch": 0.06448408908583872, "grad_norm": 0.7330458760261536, "learning_rate": 0.00019963169603110878, "loss": 0.743, "step": 422 }, { "epoch": 0.06463689498414639, "grad_norm": 0.278689980506897, "learning_rate": 0.00019962740364656874, "loss": 0.7773, "step": 423 }, { "epoch": 0.06478970088245406, "grad_norm": 0.23073042929172516, "learning_rate": 0.00019962308644069744, "loss": 0.6776, "step": 424 }, { "epoch": 0.06494250678076174, "grad_norm": 0.20433606207370758, "learning_rate": 0.00019961874441457034, "loss": 0.6218, "step": 425 }, { "epoch": 0.06509531267906941, "grad_norm": 0.25579705834388733, "learning_rate": 0.00019961437756926934, "loss": 0.7303, "step": 426 }, { "epoch": 0.06524811857737708, "grad_norm": 0.2474275529384613, "learning_rate": 0.00019960998590588233, "loss": 0.6678, "step": 427 }, { "epoch": 0.06540092447568475, "grad_norm": 0.29059740900993347, "learning_rate": 0.0001996055694255035, "loss": 0.738, "step": 428 }, { "epoch": 0.06555373037399244, "grad_norm": 0.21308888494968414, "learning_rate": 0.00019960112812923312, "loss": 0.5843, "step": 429 }, { "epoch": 0.06570653627230011, "grad_norm": 0.27173516154289246, "learning_rate": 0.00019959666201817776, "loss": 0.6929, "step": 430 }, { "epoch": 0.06585934217060778, "grad_norm": 0.29579100012779236, "learning_rate": 0.00019959217109345013, "loss": 0.7057, "step": 431 }, { "epoch": 0.06601214806891546, "grad_norm": 0.22613683342933655, "learning_rate": 0.00019958765535616906, "loss": 0.7614, "step": 432 }, { "epoch": 0.06616495396722313, "grad_norm": 0.24712997674942017, "learning_rate": 0.0001995831148074596, "loss": 0.6486, "step": 433 }, { "epoch": 0.0663177598655308, "grad_norm": 0.26547789573669434, "learning_rate": 0.00019957854944845305, "loss": 0.8202, "step": 434 }, { "epoch": 0.06647056576383849, "grad_norm": 0.20722678303718567, "learning_rate": 0.00019957395928028675, "loss": 0.6485, "step": 435 }, { "epoch": 0.06662337166214616, "grad_norm": 0.2522684335708618, "learning_rate": 0.00019956934430410438, "loss": 0.7336, "step": 436 }, { "epoch": 0.06677617756045383, "grad_norm": 0.22733008861541748, "learning_rate": 0.00019956470452105562, "loss": 0.7137, "step": 437 }, { "epoch": 0.06692898345876151, "grad_norm": 0.29240962862968445, "learning_rate": 0.00019956003993229656, "loss": 0.733, "step": 438 }, { "epoch": 0.06708178935706918, "grad_norm": 0.3016742467880249, "learning_rate": 0.00019955535053898927, "loss": 0.7911, "step": 439 }, { "epoch": 0.06723459525537685, "grad_norm": 0.28375834226608276, "learning_rate": 0.0001995506363423021, "loss": 0.6417, "step": 440 }, { "epoch": 0.06738740115368454, "grad_norm": 0.2522740066051483, "learning_rate": 0.00019954589734340949, "loss": 0.9409, "step": 441 }, { "epoch": 0.06754020705199221, "grad_norm": 0.23570802807807922, "learning_rate": 0.0001995411335434922, "loss": 0.6449, "step": 442 }, { "epoch": 0.06769301295029988, "grad_norm": 0.4781047999858856, "learning_rate": 0.00019953634494373706, "loss": 0.8234, "step": 443 }, { "epoch": 0.06784581884860756, "grad_norm": 0.21055488288402557, "learning_rate": 0.0001995315315453371, "loss": 0.5919, "step": 444 }, { "epoch": 0.06799862474691523, "grad_norm": 0.24888089299201965, "learning_rate": 0.00019952669334949156, "loss": 0.823, "step": 445 }, { "epoch": 0.0681514306452229, "grad_norm": 0.4187317192554474, "learning_rate": 0.0001995218303574058, "loss": 0.691, "step": 446 }, { "epoch": 0.06830423654353059, "grad_norm": 0.23317286372184753, "learning_rate": 0.00019951694257029146, "loss": 0.6538, "step": 447 }, { "epoch": 0.06845704244183826, "grad_norm": 0.2808625400066376, "learning_rate": 0.0001995120299893662, "loss": 0.5262, "step": 448 }, { "epoch": 0.06860984834014593, "grad_norm": 0.2186977118253708, "learning_rate": 0.00019950709261585403, "loss": 0.7305, "step": 449 }, { "epoch": 0.06876265423845361, "grad_norm": 0.25195857882499695, "learning_rate": 0.00019950213045098503, "loss": 0.7507, "step": 450 }, { "epoch": 0.06891546013676128, "grad_norm": 0.2823186218738556, "learning_rate": 0.00019949714349599545, "loss": 0.6588, "step": 451 }, { "epoch": 0.06906826603506895, "grad_norm": 0.27081775665283203, "learning_rate": 0.00019949213175212774, "loss": 0.5578, "step": 452 }, { "epoch": 0.06922107193337663, "grad_norm": 0.2283964604139328, "learning_rate": 0.00019948709522063063, "loss": 0.575, "step": 453 }, { "epoch": 0.0693738778316843, "grad_norm": 0.24064375460147858, "learning_rate": 0.00019948203390275884, "loss": 0.7462, "step": 454 }, { "epoch": 0.06952668372999198, "grad_norm": 0.4738442599773407, "learning_rate": 0.00019947694779977337, "loss": 0.6016, "step": 455 }, { "epoch": 0.06967948962829965, "grad_norm": 0.2238418608903885, "learning_rate": 0.0001994718369129414, "loss": 0.7509, "step": 456 }, { "epoch": 0.06983229552660733, "grad_norm": 0.2132556140422821, "learning_rate": 0.00019946670124353622, "loss": 0.7021, "step": 457 }, { "epoch": 0.069985101424915, "grad_norm": 0.249766007065773, "learning_rate": 0.00019946154079283744, "loss": 0.7264, "step": 458 }, { "epoch": 0.07013790732322267, "grad_norm": 0.23593513667583466, "learning_rate": 0.00019945635556213064, "loss": 0.559, "step": 459 }, { "epoch": 0.07029071322153035, "grad_norm": 0.21727308630943298, "learning_rate": 0.00019945114555270768, "loss": 0.6118, "step": 460 }, { "epoch": 0.07044351911983802, "grad_norm": 0.5952973365783691, "learning_rate": 0.00019944591076586664, "loss": 0.7533, "step": 461 }, { "epoch": 0.0705963250181457, "grad_norm": 0.43861424922943115, "learning_rate": 0.00019944065120291175, "loss": 0.6487, "step": 462 }, { "epoch": 0.07074913091645338, "grad_norm": 0.2683066725730896, "learning_rate": 0.0001994353668651533, "loss": 0.7177, "step": 463 }, { "epoch": 0.07090193681476105, "grad_norm": 0.23011751472949982, "learning_rate": 0.0001994300577539079, "loss": 0.5861, "step": 464 }, { "epoch": 0.07105474271306872, "grad_norm": 0.20545627176761627, "learning_rate": 0.00019942472387049823, "loss": 0.5209, "step": 465 }, { "epoch": 0.0712075486113764, "grad_norm": 0.28548967838287354, "learning_rate": 0.0001994193652162532, "loss": 0.6958, "step": 466 }, { "epoch": 0.07136035450968407, "grad_norm": 0.26860255002975464, "learning_rate": 0.0001994139817925079, "loss": 0.5867, "step": 467 }, { "epoch": 0.07151316040799174, "grad_norm": 0.2905493676662445, "learning_rate": 0.00019940857360060355, "loss": 0.6942, "step": 468 }, { "epoch": 0.07166596630629943, "grad_norm": 0.31361424922943115, "learning_rate": 0.00019940314064188753, "loss": 0.6028, "step": 469 }, { "epoch": 0.0718187722046071, "grad_norm": 0.2557202875614166, "learning_rate": 0.0001993976829177134, "loss": 0.7129, "step": 470 }, { "epoch": 0.07197157810291477, "grad_norm": 0.22137515246868134, "learning_rate": 0.00019939220042944098, "loss": 0.8681, "step": 471 }, { "epoch": 0.07212438400122245, "grad_norm": 0.18441233038902283, "learning_rate": 0.00019938669317843614, "loss": 0.5655, "step": 472 }, { "epoch": 0.07227718989953012, "grad_norm": 0.22239898145198822, "learning_rate": 0.00019938116116607096, "loss": 0.7098, "step": 473 }, { "epoch": 0.07242999579783779, "grad_norm": 0.25017887353897095, "learning_rate": 0.00019937560439372372, "loss": 0.8911, "step": 474 }, { "epoch": 0.07258280169614548, "grad_norm": 0.25843703746795654, "learning_rate": 0.00019937002286277882, "loss": 0.7774, "step": 475 }, { "epoch": 0.07273560759445315, "grad_norm": 0.24830183386802673, "learning_rate": 0.00019936441657462687, "loss": 0.5783, "step": 476 }, { "epoch": 0.07288841349276082, "grad_norm": 0.6441646814346313, "learning_rate": 0.00019935878553066462, "loss": 0.7754, "step": 477 }, { "epoch": 0.0730412193910685, "grad_norm": 0.2400471419095993, "learning_rate": 0.000199353129732295, "loss": 0.7317, "step": 478 }, { "epoch": 0.07319402528937617, "grad_norm": 0.3065354824066162, "learning_rate": 0.00019934744918092707, "loss": 0.6605, "step": 479 }, { "epoch": 0.07334683118768384, "grad_norm": 0.2226812243461609, "learning_rate": 0.00019934174387797613, "loss": 0.6817, "step": 480 }, { "epoch": 0.07349963708599153, "grad_norm": 0.224257230758667, "learning_rate": 0.00019933601382486363, "loss": 0.7232, "step": 481 }, { "epoch": 0.0736524429842992, "grad_norm": 0.2415555864572525, "learning_rate": 0.0001993302590230171, "loss": 0.8262, "step": 482 }, { "epoch": 0.07380524888260687, "grad_norm": 0.20679202675819397, "learning_rate": 0.00019932447947387037, "loss": 0.5378, "step": 483 }, { "epoch": 0.07395805478091454, "grad_norm": 0.1950317621231079, "learning_rate": 0.00019931867517886332, "loss": 0.6533, "step": 484 }, { "epoch": 0.07411086067922222, "grad_norm": 0.2754247486591339, "learning_rate": 0.00019931284613944206, "loss": 0.8435, "step": 485 }, { "epoch": 0.07426366657752989, "grad_norm": 0.23111988604068756, "learning_rate": 0.00019930699235705884, "loss": 0.6632, "step": 486 }, { "epoch": 0.07441647247583756, "grad_norm": 0.2312602549791336, "learning_rate": 0.00019930111383317204, "loss": 0.6834, "step": 487 }, { "epoch": 0.07456927837414525, "grad_norm": 0.24196754395961761, "learning_rate": 0.00019929521056924633, "loss": 0.7273, "step": 488 }, { "epoch": 0.07472208427245292, "grad_norm": 0.26272863149642944, "learning_rate": 0.00019928928256675242, "loss": 0.7997, "step": 489 }, { "epoch": 0.07487489017076059, "grad_norm": 0.22681844234466553, "learning_rate": 0.0001992833298271672, "loss": 0.6919, "step": 490 }, { "epoch": 0.07502769606906827, "grad_norm": 0.22710320353507996, "learning_rate": 0.00019927735235197375, "loss": 0.7005, "step": 491 }, { "epoch": 0.07518050196737594, "grad_norm": 0.27059561014175415, "learning_rate": 0.00019927135014266134, "loss": 0.8002, "step": 492 }, { "epoch": 0.07533330786568361, "grad_norm": 0.2716640532016754, "learning_rate": 0.00019926532320072536, "loss": 0.7222, "step": 493 }, { "epoch": 0.0754861137639913, "grad_norm": 0.2126639038324356, "learning_rate": 0.00019925927152766735, "loss": 0.5937, "step": 494 }, { "epoch": 0.07563891966229896, "grad_norm": 0.24724045395851135, "learning_rate": 0.00019925319512499506, "loss": 0.552, "step": 495 }, { "epoch": 0.07579172556060663, "grad_norm": 0.2463061362504959, "learning_rate": 0.00019924709399422232, "loss": 0.7636, "step": 496 }, { "epoch": 0.07594453145891432, "grad_norm": 0.24597403407096863, "learning_rate": 0.00019924096813686923, "loss": 0.8017, "step": 497 }, { "epoch": 0.07609733735722199, "grad_norm": 0.37607234716415405, "learning_rate": 0.000199234817554462, "loss": 0.7762, "step": 498 }, { "epoch": 0.07625014325552966, "grad_norm": 0.2166277915239334, "learning_rate": 0.00019922864224853297, "loss": 0.8255, "step": 499 }, { "epoch": 0.07640294915383734, "grad_norm": 0.3068873882293701, "learning_rate": 0.00019922244222062067, "loss": 0.8931, "step": 500 }, { "epoch": 0.07655575505214501, "grad_norm": 0.2293839454650879, "learning_rate": 0.00019921621747226976, "loss": 0.5859, "step": 501 }, { "epoch": 0.07670856095045268, "grad_norm": 0.20842444896697998, "learning_rate": 0.0001992099680050312, "loss": 0.6295, "step": 502 }, { "epoch": 0.07686136684876037, "grad_norm": 0.24702678620815277, "learning_rate": 0.00019920369382046181, "loss": 0.8282, "step": 503 }, { "epoch": 0.07701417274706804, "grad_norm": 0.3347665071487427, "learning_rate": 0.0001991973949201249, "loss": 0.696, "step": 504 }, { "epoch": 0.07716697864537571, "grad_norm": 0.21844464540481567, "learning_rate": 0.0001991910713055897, "loss": 0.6584, "step": 505 }, { "epoch": 0.07731978454368339, "grad_norm": 0.2989930808544159, "learning_rate": 0.00019918472297843174, "loss": 0.7263, "step": 506 }, { "epoch": 0.07747259044199106, "grad_norm": 0.22986134886741638, "learning_rate": 0.0001991783499402326, "loss": 0.6598, "step": 507 }, { "epoch": 0.07762539634029873, "grad_norm": 0.2555464506149292, "learning_rate": 0.00019917195219258012, "loss": 0.6093, "step": 508 }, { "epoch": 0.07777820223860642, "grad_norm": 0.22957204282283783, "learning_rate": 0.00019916552973706824, "loss": 0.872, "step": 509 }, { "epoch": 0.07793100813691409, "grad_norm": 0.2668517529964447, "learning_rate": 0.00019915908257529702, "loss": 0.907, "step": 510 }, { "epoch": 0.07808381403522176, "grad_norm": 0.29907530546188354, "learning_rate": 0.00019915261070887276, "loss": 0.7975, "step": 511 }, { "epoch": 0.07823661993352943, "grad_norm": 0.2448134869337082, "learning_rate": 0.00019914611413940784, "loss": 0.9981, "step": 512 }, { "epoch": 0.07838942583183711, "grad_norm": 0.5516696572303772, "learning_rate": 0.00019913959286852083, "loss": 0.6658, "step": 513 }, { "epoch": 0.07854223173014478, "grad_norm": 0.30436620116233826, "learning_rate": 0.00019913304689783646, "loss": 0.6571, "step": 514 }, { "epoch": 0.07869503762845245, "grad_norm": 0.24897192418575287, "learning_rate": 0.00019912647622898563, "loss": 0.7244, "step": 515 }, { "epoch": 0.07884784352676014, "grad_norm": 0.25573232769966125, "learning_rate": 0.00019911988086360533, "loss": 0.6093, "step": 516 }, { "epoch": 0.0790006494250678, "grad_norm": 0.21649105846881866, "learning_rate": 0.00019911326080333875, "loss": 0.6002, "step": 517 }, { "epoch": 0.07915345532337548, "grad_norm": 0.3718641996383667, "learning_rate": 0.0001991066160498352, "loss": 0.6874, "step": 518 }, { "epoch": 0.07930626122168316, "grad_norm": 0.23083071410655975, "learning_rate": 0.00019909994660475023, "loss": 0.7339, "step": 519 }, { "epoch": 0.07945906711999083, "grad_norm": 0.2922573983669281, "learning_rate": 0.0001990932524697454, "loss": 0.7994, "step": 520 }, { "epoch": 0.0796118730182985, "grad_norm": 0.26380589604377747, "learning_rate": 0.00019908653364648853, "loss": 0.7204, "step": 521 }, { "epoch": 0.07976467891660619, "grad_norm": 0.28353989124298096, "learning_rate": 0.00019907979013665357, "loss": 0.5284, "step": 522 }, { "epoch": 0.07991748481491386, "grad_norm": 0.2280929833650589, "learning_rate": 0.00019907302194192058, "loss": 0.7311, "step": 523 }, { "epoch": 0.08007029071322153, "grad_norm": 0.24350902438163757, "learning_rate": 0.00019906622906397582, "loss": 0.6621, "step": 524 }, { "epoch": 0.08022309661152921, "grad_norm": 0.22378286719322205, "learning_rate": 0.0001990594115045117, "loss": 0.6478, "step": 525 }, { "epoch": 0.08037590250983688, "grad_norm": 0.2641814053058624, "learning_rate": 0.00019905256926522672, "loss": 0.6855, "step": 526 }, { "epoch": 0.08052870840814455, "grad_norm": 0.2580053210258484, "learning_rate": 0.00019904570234782556, "loss": 0.6963, "step": 527 }, { "epoch": 0.08068151430645223, "grad_norm": 0.37223583459854126, "learning_rate": 0.00019903881075401908, "loss": 0.8338, "step": 528 }, { "epoch": 0.0808343202047599, "grad_norm": 0.3101045489311218, "learning_rate": 0.0001990318944855243, "loss": 0.6928, "step": 529 }, { "epoch": 0.08098712610306757, "grad_norm": 0.22513873875141144, "learning_rate": 0.00019902495354406425, "loss": 0.7592, "step": 530 }, { "epoch": 0.08113993200137526, "grad_norm": 0.2204548716545105, "learning_rate": 0.00019901798793136829, "loss": 0.6437, "step": 531 }, { "epoch": 0.08129273789968293, "grad_norm": 0.30862662196159363, "learning_rate": 0.0001990109976491718, "loss": 0.7737, "step": 532 }, { "epoch": 0.0814455437979906, "grad_norm": 0.2376687079668045, "learning_rate": 0.00019900398269921636, "loss": 0.7087, "step": 533 }, { "epoch": 0.08159834969629828, "grad_norm": 0.3251570463180542, "learning_rate": 0.0001989969430832497, "loss": 0.688, "step": 534 }, { "epoch": 0.08175115559460595, "grad_norm": 0.2662348449230194, "learning_rate": 0.00019898987880302574, "loss": 0.8595, "step": 535 }, { "epoch": 0.08190396149291362, "grad_norm": 0.21960824728012085, "learning_rate": 0.00019898278986030436, "loss": 0.6531, "step": 536 }, { "epoch": 0.08205676739122131, "grad_norm": 0.34016793966293335, "learning_rate": 0.00019897567625685176, "loss": 0.6846, "step": 537 }, { "epoch": 0.08220957328952898, "grad_norm": 0.23817062377929688, "learning_rate": 0.00019896853799444028, "loss": 0.7138, "step": 538 }, { "epoch": 0.08236237918783665, "grad_norm": 0.31925493478775024, "learning_rate": 0.00019896137507484834, "loss": 0.76, "step": 539 }, { "epoch": 0.08251518508614432, "grad_norm": 0.2391405999660492, "learning_rate": 0.00019895418749986047, "loss": 0.6027, "step": 540 }, { "epoch": 0.082667990984452, "grad_norm": 0.21949400007724762, "learning_rate": 0.00019894697527126742, "loss": 0.6997, "step": 541 }, { "epoch": 0.08282079688275967, "grad_norm": 0.23678694665431976, "learning_rate": 0.00019893973839086608, "loss": 0.5587, "step": 542 }, { "epoch": 0.08297360278106734, "grad_norm": 0.2640646994113922, "learning_rate": 0.00019893247686045946, "loss": 0.6838, "step": 543 }, { "epoch": 0.08312640867937503, "grad_norm": 0.2536254823207855, "learning_rate": 0.0001989251906818567, "loss": 0.5734, "step": 544 }, { "epoch": 0.0832792145776827, "grad_norm": 0.20557624101638794, "learning_rate": 0.00019891787985687308, "loss": 0.7211, "step": 545 }, { "epoch": 0.08343202047599037, "grad_norm": 0.26330065727233887, "learning_rate": 0.00019891054438732998, "loss": 0.704, "step": 546 }, { "epoch": 0.08358482637429805, "grad_norm": 0.2792690396308899, "learning_rate": 0.0001989031842750551, "loss": 0.5176, "step": 547 }, { "epoch": 0.08373763227260572, "grad_norm": 0.3036953806877136, "learning_rate": 0.00019889579952188204, "loss": 0.858, "step": 548 }, { "epoch": 0.08389043817091339, "grad_norm": 0.2737971842288971, "learning_rate": 0.00019888839012965068, "loss": 0.6229, "step": 549 }, { "epoch": 0.08404324406922108, "grad_norm": 0.2955757975578308, "learning_rate": 0.000198880956100207, "loss": 0.7844, "step": 550 }, { "epoch": 0.08419604996752875, "grad_norm": 0.31220605969429016, "learning_rate": 0.0001988734974354032, "loss": 0.9625, "step": 551 }, { "epoch": 0.08434885586583642, "grad_norm": 0.23816774785518646, "learning_rate": 0.0001988660141370974, "loss": 0.6341, "step": 552 }, { "epoch": 0.0845016617641441, "grad_norm": 0.26111069321632385, "learning_rate": 0.00019885850620715413, "loss": 0.7624, "step": 553 }, { "epoch": 0.08465446766245177, "grad_norm": 0.2268974632024765, "learning_rate": 0.0001988509736474439, "loss": 0.68, "step": 554 }, { "epoch": 0.08480727356075944, "grad_norm": 0.24625982344150543, "learning_rate": 0.00019884341645984332, "loss": 0.7494, "step": 555 }, { "epoch": 0.08496007945906713, "grad_norm": 0.2776056230068207, "learning_rate": 0.00019883583464623525, "loss": 0.6182, "step": 556 }, { "epoch": 0.0851128853573748, "grad_norm": 0.3328106105327606, "learning_rate": 0.00019882822820850866, "loss": 0.663, "step": 557 }, { "epoch": 0.08526569125568247, "grad_norm": 0.2359543889760971, "learning_rate": 0.00019882059714855857, "loss": 0.9259, "step": 558 }, { "epoch": 0.08541849715399015, "grad_norm": 0.2494177222251892, "learning_rate": 0.00019881294146828626, "loss": 0.6741, "step": 559 }, { "epoch": 0.08557130305229782, "grad_norm": 0.25279515981674194, "learning_rate": 0.000198805261169599, "loss": 0.824, "step": 560 }, { "epoch": 0.08572410895060549, "grad_norm": 0.2714499235153198, "learning_rate": 0.00019879755625441033, "loss": 0.6873, "step": 561 }, { "epoch": 0.08587691484891317, "grad_norm": 0.26827186346054077, "learning_rate": 0.0001987898267246399, "loss": 0.7667, "step": 562 }, { "epoch": 0.08602972074722084, "grad_norm": 0.2813642621040344, "learning_rate": 0.00019878207258221332, "loss": 0.7336, "step": 563 }, { "epoch": 0.08618252664552851, "grad_norm": 0.25137224793434143, "learning_rate": 0.00019877429382906262, "loss": 0.838, "step": 564 }, { "epoch": 0.08633533254383619, "grad_norm": 0.23676906526088715, "learning_rate": 0.00019876649046712572, "loss": 0.8007, "step": 565 }, { "epoch": 0.08648813844214387, "grad_norm": 0.25752153992652893, "learning_rate": 0.00019875866249834681, "loss": 0.8034, "step": 566 }, { "epoch": 0.08664094434045154, "grad_norm": 0.4326179027557373, "learning_rate": 0.0001987508099246761, "loss": 0.6266, "step": 567 }, { "epoch": 0.08679375023875921, "grad_norm": 0.23379918932914734, "learning_rate": 0.0001987429327480701, "loss": 0.7669, "step": 568 }, { "epoch": 0.0869465561370669, "grad_norm": 1.726702094078064, "learning_rate": 0.00019873503097049124, "loss": 0.6843, "step": 569 }, { "epoch": 0.08709936203537456, "grad_norm": 0.39877480268478394, "learning_rate": 0.0001987271045939082, "loss": 0.7714, "step": 570 }, { "epoch": 0.08725216793368223, "grad_norm": 0.46413347125053406, "learning_rate": 0.00019871915362029583, "loss": 0.8398, "step": 571 }, { "epoch": 0.08740497383198992, "grad_norm": 0.2789576053619385, "learning_rate": 0.000198711178051635, "loss": 0.677, "step": 572 }, { "epoch": 0.08755777973029759, "grad_norm": 0.47314074635505676, "learning_rate": 0.00019870317788991276, "loss": 0.6724, "step": 573 }, { "epoch": 0.08771058562860526, "grad_norm": 0.3048081398010254, "learning_rate": 0.0001986951531371223, "loss": 0.5743, "step": 574 }, { "epoch": 0.08786339152691294, "grad_norm": 0.2774280309677124, "learning_rate": 0.00019868710379526287, "loss": 0.6805, "step": 575 }, { "epoch": 0.08801619742522061, "grad_norm": 0.24514140188694, "learning_rate": 0.00019867902986633995, "loss": 0.5526, "step": 576 }, { "epoch": 0.08816900332352828, "grad_norm": 0.279148131608963, "learning_rate": 0.0001986709313523651, "loss": 0.7963, "step": 577 }, { "epoch": 0.08832180922183597, "grad_norm": 0.23727378249168396, "learning_rate": 0.00019866280825535593, "loss": 0.6729, "step": 578 }, { "epoch": 0.08847461512014364, "grad_norm": 0.3538941442966461, "learning_rate": 0.0001986546605773363, "loss": 0.7902, "step": 579 }, { "epoch": 0.08862742101845131, "grad_norm": 0.521626353263855, "learning_rate": 0.00019864648832033612, "loss": 0.7231, "step": 580 }, { "epoch": 0.08878022691675899, "grad_norm": 0.23101353645324707, "learning_rate": 0.00019863829148639142, "loss": 0.6654, "step": 581 }, { "epoch": 0.08893303281506666, "grad_norm": 0.23713093996047974, "learning_rate": 0.00019863007007754445, "loss": 0.7468, "step": 582 }, { "epoch": 0.08908583871337433, "grad_norm": 0.2619504928588867, "learning_rate": 0.00019862182409584339, "loss": 0.676, "step": 583 }, { "epoch": 0.08923864461168202, "grad_norm": 0.24824418127536774, "learning_rate": 0.00019861355354334272, "loss": 0.7717, "step": 584 }, { "epoch": 0.08939145050998969, "grad_norm": 0.35055017471313477, "learning_rate": 0.00019860525842210297, "loss": 0.7893, "step": 585 }, { "epoch": 0.08954425640829736, "grad_norm": 0.3420320153236389, "learning_rate": 0.00019859693873419082, "loss": 0.8808, "step": 586 }, { "epoch": 0.08969706230660504, "grad_norm": 0.2500932812690735, "learning_rate": 0.000198588594481679, "loss": 0.6469, "step": 587 }, { "epoch": 0.08984986820491271, "grad_norm": 0.24625301361083984, "learning_rate": 0.00019858022566664646, "loss": 0.7172, "step": 588 }, { "epoch": 0.09000267410322038, "grad_norm": 0.2936956286430359, "learning_rate": 0.0001985718322911782, "loss": 0.7343, "step": 589 }, { "epoch": 0.09015548000152807, "grad_norm": 0.2684360146522522, "learning_rate": 0.00019856341435736538, "loss": 0.8843, "step": 590 }, { "epoch": 0.09030828589983574, "grad_norm": 0.39383620023727417, "learning_rate": 0.0001985549718673052, "loss": 0.6679, "step": 591 }, { "epoch": 0.0904610917981434, "grad_norm": 0.26644083857536316, "learning_rate": 0.00019854650482310112, "loss": 0.8205, "step": 592 }, { "epoch": 0.09061389769645108, "grad_norm": 0.2587982416152954, "learning_rate": 0.00019853801322686256, "loss": 0.6291, "step": 593 }, { "epoch": 0.09076670359475876, "grad_norm": 0.24516814947128296, "learning_rate": 0.00019852949708070515, "loss": 0.8208, "step": 594 }, { "epoch": 0.09091950949306643, "grad_norm": 0.2279921919107437, "learning_rate": 0.00019852095638675063, "loss": 0.5292, "step": 595 }, { "epoch": 0.0910723153913741, "grad_norm": 0.2680317461490631, "learning_rate": 0.00019851239114712684, "loss": 0.6363, "step": 596 }, { "epoch": 0.09122512128968178, "grad_norm": 0.2625105679035187, "learning_rate": 0.00019850380136396774, "loss": 0.7019, "step": 597 }, { "epoch": 0.09137792718798946, "grad_norm": 0.2557803690433502, "learning_rate": 0.00019849518703941337, "loss": 0.6597, "step": 598 }, { "epoch": 0.09153073308629713, "grad_norm": 0.2681661546230316, "learning_rate": 0.00019848654817560996, "loss": 0.5386, "step": 599 }, { "epoch": 0.09168353898460481, "grad_norm": 0.2524600028991699, "learning_rate": 0.0001984778847747098, "loss": 0.8093, "step": 600 }, { "epoch": 0.09183634488291248, "grad_norm": 0.23507723212242126, "learning_rate": 0.00019846919683887127, "loss": 0.7312, "step": 601 }, { "epoch": 0.09198915078122015, "grad_norm": 0.2504274845123291, "learning_rate": 0.00019846048437025893, "loss": 0.5854, "step": 602 }, { "epoch": 0.09214195667952783, "grad_norm": 0.26254919171333313, "learning_rate": 0.0001984517473710434, "loss": 0.5426, "step": 603 }, { "epoch": 0.0922947625778355, "grad_norm": 0.26088839769363403, "learning_rate": 0.00019844298584340147, "loss": 0.8402, "step": 604 }, { "epoch": 0.09244756847614317, "grad_norm": 0.22979454696178436, "learning_rate": 0.00019843419978951595, "loss": 0.8721, "step": 605 }, { "epoch": 0.09260037437445086, "grad_norm": 0.2631072700023651, "learning_rate": 0.00019842538921157585, "loss": 0.7218, "step": 606 }, { "epoch": 0.09275318027275853, "grad_norm": 0.2664624750614166, "learning_rate": 0.00019841655411177622, "loss": 0.704, "step": 607 }, { "epoch": 0.0929059861710662, "grad_norm": 0.28286877274513245, "learning_rate": 0.00019840769449231828, "loss": 0.6592, "step": 608 }, { "epoch": 0.09305879206937388, "grad_norm": 0.24730631709098816, "learning_rate": 0.00019839881035540931, "loss": 0.8396, "step": 609 }, { "epoch": 0.09321159796768155, "grad_norm": 0.30036839842796326, "learning_rate": 0.00019838990170326272, "loss": 0.6895, "step": 610 }, { "epoch": 0.09336440386598922, "grad_norm": 0.394290030002594, "learning_rate": 0.000198380968538098, "loss": 0.8033, "step": 611 }, { "epoch": 0.09351720976429691, "grad_norm": 0.22317548096179962, "learning_rate": 0.00019837201086214085, "loss": 0.5986, "step": 612 }, { "epoch": 0.09367001566260458, "grad_norm": 0.3567953109741211, "learning_rate": 0.00019836302867762292, "loss": 0.5609, "step": 613 }, { "epoch": 0.09382282156091225, "grad_norm": 0.26048195362091064, "learning_rate": 0.0001983540219867821, "loss": 0.74, "step": 614 }, { "epoch": 0.09397562745921993, "grad_norm": 7.787338733673096, "learning_rate": 0.0001983449907918623, "loss": 0.6336, "step": 615 }, { "epoch": 0.0941284333575276, "grad_norm": 0.41731297969818115, "learning_rate": 0.0001983359350951136, "loss": 0.8691, "step": 616 }, { "epoch": 0.09428123925583527, "grad_norm": 0.30972158908843994, "learning_rate": 0.00019832685489879208, "loss": 0.8158, "step": 617 }, { "epoch": 0.09443404515414296, "grad_norm": 0.2101728767156601, "learning_rate": 0.00019831775020516008, "loss": 0.6214, "step": 618 }, { "epoch": 0.09458685105245063, "grad_norm": 0.24316735565662384, "learning_rate": 0.00019830862101648592, "loss": 0.4683, "step": 619 }, { "epoch": 0.0947396569507583, "grad_norm": 0.6694328784942627, "learning_rate": 0.00019829946733504402, "loss": 0.7519, "step": 620 }, { "epoch": 0.09489246284906597, "grad_norm": 0.22268711030483246, "learning_rate": 0.000198290289163115, "loss": 0.7029, "step": 621 }, { "epoch": 0.09504526874737365, "grad_norm": 0.30094367265701294, "learning_rate": 0.00019828108650298554, "loss": 0.9032, "step": 622 }, { "epoch": 0.09519807464568132, "grad_norm": 0.24041011929512024, "learning_rate": 0.00019827185935694834, "loss": 0.7073, "step": 623 }, { "epoch": 0.09535088054398899, "grad_norm": 0.261481910943985, "learning_rate": 0.00019826260772730229, "loss": 0.6153, "step": 624 }, { "epoch": 0.09550368644229668, "grad_norm": 0.249210923910141, "learning_rate": 0.0001982533316163524, "loss": 0.7851, "step": 625 }, { "epoch": 0.09565649234060435, "grad_norm": 0.31153637170791626, "learning_rate": 0.00019824403102640967, "loss": 0.7585, "step": 626 }, { "epoch": 0.09580929823891202, "grad_norm": 0.2985265851020813, "learning_rate": 0.00019823470595979132, "loss": 0.679, "step": 627 }, { "epoch": 0.0959621041372197, "grad_norm": 0.275879830121994, "learning_rate": 0.00019822535641882057, "loss": 0.6026, "step": 628 }, { "epoch": 0.09611491003552737, "grad_norm": 0.2656649053096771, "learning_rate": 0.0001982159824058268, "loss": 1.0268, "step": 629 }, { "epoch": 0.09626771593383504, "grad_norm": 0.3071416914463043, "learning_rate": 0.00019820658392314547, "loss": 0.9391, "step": 630 }, { "epoch": 0.09642052183214272, "grad_norm": 0.22628253698349, "learning_rate": 0.0001981971609731181, "loss": 0.6201, "step": 631 }, { "epoch": 0.0965733277304504, "grad_norm": 0.33724674582481384, "learning_rate": 0.0001981877135580924, "loss": 0.6772, "step": 632 }, { "epoch": 0.09672613362875807, "grad_norm": 0.2758637070655823, "learning_rate": 0.00019817824168042204, "loss": 0.6861, "step": 633 }, { "epoch": 0.09687893952706575, "grad_norm": 0.25251737236976624, "learning_rate": 0.00019816874534246695, "loss": 0.6669, "step": 634 }, { "epoch": 0.09703174542537342, "grad_norm": 0.3473283350467682, "learning_rate": 0.00019815922454659296, "loss": 0.8036, "step": 635 }, { "epoch": 0.09718455132368109, "grad_norm": 0.2671525776386261, "learning_rate": 0.00019814967929517217, "loss": 0.7605, "step": 636 }, { "epoch": 0.09733735722198877, "grad_norm": 0.3378095328807831, "learning_rate": 0.0001981401095905827, "loss": 0.6923, "step": 637 }, { "epoch": 0.09749016312029644, "grad_norm": 0.3124406635761261, "learning_rate": 0.00019813051543520868, "loss": 0.8457, "step": 638 }, { "epoch": 0.09764296901860411, "grad_norm": 0.26661619544029236, "learning_rate": 0.0001981208968314405, "loss": 0.638, "step": 639 }, { "epoch": 0.0977957749169118, "grad_norm": 0.23887763917446136, "learning_rate": 0.00019811125378167452, "loss": 0.7673, "step": 640 }, { "epoch": 0.09794858081521947, "grad_norm": 0.24191714823246002, "learning_rate": 0.00019810158628831323, "loss": 0.6071, "step": 641 }, { "epoch": 0.09810138671352714, "grad_norm": 0.2351231426000595, "learning_rate": 0.0001980918943537652, "loss": 0.8946, "step": 642 }, { "epoch": 0.09825419261183482, "grad_norm": 0.2848316729068756, "learning_rate": 0.00019808217798044514, "loss": 0.5735, "step": 643 }, { "epoch": 0.0984069985101425, "grad_norm": 0.256180077791214, "learning_rate": 0.0001980724371707737, "loss": 0.7506, "step": 644 }, { "epoch": 0.09855980440845016, "grad_norm": 0.22801616787910461, "learning_rate": 0.0001980626719271778, "loss": 0.6963, "step": 645 }, { "epoch": 0.09871261030675785, "grad_norm": 0.2420923262834549, "learning_rate": 0.0001980528822520904, "loss": 0.6607, "step": 646 }, { "epoch": 0.09886541620506552, "grad_norm": 0.2945977747440338, "learning_rate": 0.0001980430681479504, "loss": 0.7108, "step": 647 }, { "epoch": 0.09901822210337319, "grad_norm": 0.26587414741516113, "learning_rate": 0.00019803322961720304, "loss": 0.7858, "step": 648 }, { "epoch": 0.09917102800168086, "grad_norm": 0.2787562608718872, "learning_rate": 0.0001980233666622994, "loss": 0.5684, "step": 649 }, { "epoch": 0.09932383389998854, "grad_norm": 0.22903244197368622, "learning_rate": 0.00019801347928569677, "loss": 0.6586, "step": 650 }, { "epoch": 0.09947663979829621, "grad_norm": 0.22474107146263123, "learning_rate": 0.00019800356748985853, "loss": 0.5589, "step": 651 }, { "epoch": 0.09962944569660388, "grad_norm": 0.26125723123550415, "learning_rate": 0.00019799363127725412, "loss": 0.7504, "step": 652 }, { "epoch": 0.09978225159491157, "grad_norm": 0.263163685798645, "learning_rate": 0.00019798367065035905, "loss": 0.6425, "step": 653 }, { "epoch": 0.09993505749321924, "grad_norm": 0.3001612424850464, "learning_rate": 0.00019797368561165496, "loss": 0.64, "step": 654 }, { "epoch": 0.10008786339152691, "grad_norm": 0.425373911857605, "learning_rate": 0.0001979636761636295, "loss": 0.8331, "step": 655 }, { "epoch": 0.10024066928983459, "grad_norm": 0.358103483915329, "learning_rate": 0.00019795364230877649, "loss": 0.7991, "step": 656 }, { "epoch": 0.10039347518814226, "grad_norm": 0.26949623227119446, "learning_rate": 0.0001979435840495957, "loss": 0.7071, "step": 657 }, { "epoch": 0.10054628108644993, "grad_norm": 0.41269823908805847, "learning_rate": 0.00019793350138859312, "loss": 0.6083, "step": 658 }, { "epoch": 0.10069908698475762, "grad_norm": 0.25923144817352295, "learning_rate": 0.00019792339432828074, "loss": 0.6838, "step": 659 }, { "epoch": 0.10085189288306529, "grad_norm": 0.2757934331893921, "learning_rate": 0.00019791326287117668, "loss": 0.7489, "step": 660 }, { "epoch": 0.10100469878137296, "grad_norm": 0.2592853605747223, "learning_rate": 0.00019790310701980505, "loss": 0.6127, "step": 661 }, { "epoch": 0.10115750467968064, "grad_norm": 0.23879987001419067, "learning_rate": 0.00019789292677669615, "loss": 0.6025, "step": 662 }, { "epoch": 0.10131031057798831, "grad_norm": 0.23186875879764557, "learning_rate": 0.00019788272214438628, "loss": 0.7529, "step": 663 }, { "epoch": 0.10146311647629598, "grad_norm": 0.2402157485485077, "learning_rate": 0.00019787249312541784, "loss": 0.9017, "step": 664 }, { "epoch": 0.10161592237460366, "grad_norm": 0.24203136563301086, "learning_rate": 0.0001978622397223393, "loss": 0.764, "step": 665 }, { "epoch": 0.10176872827291134, "grad_norm": 0.3500341773033142, "learning_rate": 0.00019785196193770522, "loss": 0.7858, "step": 666 }, { "epoch": 0.101921534171219, "grad_norm": 0.357442170381546, "learning_rate": 0.0001978416597740762, "loss": 0.6913, "step": 667 }, { "epoch": 0.10207434006952669, "grad_norm": 0.33510905504226685, "learning_rate": 0.00019783133323401898, "loss": 0.6076, "step": 668 }, { "epoch": 0.10222714596783436, "grad_norm": 0.2523046135902405, "learning_rate": 0.00019782098232010625, "loss": 0.6613, "step": 669 }, { "epoch": 0.10237995186614203, "grad_norm": 1.28267240524292, "learning_rate": 0.00019781060703491697, "loss": 0.8129, "step": 670 }, { "epoch": 0.10253275776444971, "grad_norm": 0.31581321358680725, "learning_rate": 0.00019780020738103594, "loss": 0.7189, "step": 671 }, { "epoch": 0.10268556366275738, "grad_norm": 0.22247004508972168, "learning_rate": 0.00019778978336105425, "loss": 0.6889, "step": 672 }, { "epoch": 0.10283836956106505, "grad_norm": 0.28829923272132874, "learning_rate": 0.00019777933497756885, "loss": 0.7773, "step": 673 }, { "epoch": 0.10299117545937274, "grad_norm": 0.4107334017753601, "learning_rate": 0.00019776886223318299, "loss": 0.7051, "step": 674 }, { "epoch": 0.10314398135768041, "grad_norm": 0.2262149304151535, "learning_rate": 0.00019775836513050577, "loss": 0.6931, "step": 675 }, { "epoch": 0.10329678725598808, "grad_norm": 0.24012112617492676, "learning_rate": 0.0001977478436721525, "loss": 0.6046, "step": 676 }, { "epoch": 0.10344959315429575, "grad_norm": 0.2686203122138977, "learning_rate": 0.00019773729786074447, "loss": 0.6899, "step": 677 }, { "epoch": 0.10360239905260343, "grad_norm": 0.25490519404411316, "learning_rate": 0.00019772672769890912, "loss": 0.8237, "step": 678 }, { "epoch": 0.1037552049509111, "grad_norm": 0.2543981671333313, "learning_rate": 0.00019771613318927988, "loss": 0.669, "step": 679 }, { "epoch": 0.10390801084921877, "grad_norm": 0.2917165756225586, "learning_rate": 0.00019770551433449636, "loss": 0.8482, "step": 680 }, { "epoch": 0.10406081674752646, "grad_norm": 0.25636452436447144, "learning_rate": 0.00019769487113720406, "loss": 0.8233, "step": 681 }, { "epoch": 0.10421362264583413, "grad_norm": 0.3934386670589447, "learning_rate": 0.00019768420360005473, "loss": 0.6585, "step": 682 }, { "epoch": 0.1043664285441418, "grad_norm": 0.2667856514453888, "learning_rate": 0.00019767351172570602, "loss": 0.6018, "step": 683 }, { "epoch": 0.10451923444244948, "grad_norm": 0.5042889714241028, "learning_rate": 0.0001976627955168218, "loss": 0.7258, "step": 684 }, { "epoch": 0.10467204034075715, "grad_norm": 0.274236261844635, "learning_rate": 0.00019765205497607186, "loss": 0.7307, "step": 685 }, { "epoch": 0.10482484623906482, "grad_norm": 0.24832475185394287, "learning_rate": 0.00019764129010613215, "loss": 0.8898, "step": 686 }, { "epoch": 0.1049776521373725, "grad_norm": 0.3612132668495178, "learning_rate": 0.00019763050090968462, "loss": 0.7601, "step": 687 }, { "epoch": 0.10513045803568018, "grad_norm": 0.22813887894153595, "learning_rate": 0.00019761968738941734, "loss": 0.6691, "step": 688 }, { "epoch": 0.10528326393398785, "grad_norm": 0.23167642951011658, "learning_rate": 0.00019760884954802437, "loss": 0.8389, "step": 689 }, { "epoch": 0.10543606983229553, "grad_norm": 0.2619309723377228, "learning_rate": 0.0001975979873882059, "loss": 0.8322, "step": 690 }, { "epoch": 0.1055888757306032, "grad_norm": 0.30721771717071533, "learning_rate": 0.00019758710091266813, "loss": 0.7664, "step": 691 }, { "epoch": 0.10574168162891087, "grad_norm": 0.2530481517314911, "learning_rate": 0.00019757619012412332, "loss": 0.5927, "step": 692 }, { "epoch": 0.10589448752721856, "grad_norm": 0.29600846767425537, "learning_rate": 0.00019756525502528986, "loss": 0.6524, "step": 693 }, { "epoch": 0.10604729342552623, "grad_norm": 0.25071632862091064, "learning_rate": 0.00019755429561889204, "loss": 0.7333, "step": 694 }, { "epoch": 0.1062000993238339, "grad_norm": 0.27111098170280457, "learning_rate": 0.0001975433119076604, "loss": 0.8976, "step": 695 }, { "epoch": 0.10635290522214158, "grad_norm": 0.2631009519100189, "learning_rate": 0.0001975323038943314, "loss": 0.6236, "step": 696 }, { "epoch": 0.10650571112044925, "grad_norm": 0.25254061818122864, "learning_rate": 0.0001975212715816476, "loss": 0.6441, "step": 697 }, { "epoch": 0.10665851701875692, "grad_norm": 0.3293875753879547, "learning_rate": 0.0001975102149723576, "loss": 0.7531, "step": 698 }, { "epoch": 0.1068113229170646, "grad_norm": 0.2423682063817978, "learning_rate": 0.00019749913406921606, "loss": 0.8024, "step": 699 }, { "epoch": 0.10696412881537228, "grad_norm": 0.2802618145942688, "learning_rate": 0.00019748802887498368, "loss": 0.6301, "step": 700 }, { "epoch": 0.10711693471367995, "grad_norm": 0.21464811265468597, "learning_rate": 0.00019747689939242726, "loss": 0.5926, "step": 701 }, { "epoch": 0.10726974061198763, "grad_norm": 0.2736561894416809, "learning_rate": 0.00019746574562431958, "loss": 0.6572, "step": 702 }, { "epoch": 0.1074225465102953, "grad_norm": 0.3253500759601593, "learning_rate": 0.00019745456757343957, "loss": 0.7262, "step": 703 }, { "epoch": 0.10757535240860297, "grad_norm": 0.39992064237594604, "learning_rate": 0.00019744336524257208, "loss": 0.9614, "step": 704 }, { "epoch": 0.10772815830691064, "grad_norm": 0.5507543683052063, "learning_rate": 0.0001974321386345081, "loss": 0.7267, "step": 705 }, { "epoch": 0.10788096420521832, "grad_norm": 0.2982296049594879, "learning_rate": 0.00019742088775204466, "loss": 0.7433, "step": 706 }, { "epoch": 0.108033770103526, "grad_norm": 0.32143905758857727, "learning_rate": 0.0001974096125979848, "loss": 0.5736, "step": 707 }, { "epoch": 0.10818657600183366, "grad_norm": 0.26693427562713623, "learning_rate": 0.00019739831317513767, "loss": 0.6675, "step": 708 }, { "epoch": 0.10833938190014135, "grad_norm": 0.2991078794002533, "learning_rate": 0.00019738698948631837, "loss": 0.7309, "step": 709 }, { "epoch": 0.10849218779844902, "grad_norm": 0.3002963066101074, "learning_rate": 0.00019737564153434812, "loss": 0.6062, "step": 710 }, { "epoch": 0.10864499369675669, "grad_norm": 0.254621684551239, "learning_rate": 0.00019736426932205422, "loss": 0.6951, "step": 711 }, { "epoch": 0.10879779959506437, "grad_norm": 0.25402164459228516, "learning_rate": 0.00019735287285226988, "loss": 0.6384, "step": 712 }, { "epoch": 0.10895060549337204, "grad_norm": 0.31595587730407715, "learning_rate": 0.0001973414521278345, "loss": 0.6293, "step": 713 }, { "epoch": 0.10910341139167971, "grad_norm": 0.3181349039077759, "learning_rate": 0.00019733000715159337, "loss": 0.7432, "step": 714 }, { "epoch": 0.1092562172899874, "grad_norm": 0.24884875118732452, "learning_rate": 0.00019731853792639802, "loss": 0.633, "step": 715 }, { "epoch": 0.10940902318829507, "grad_norm": 0.23580528795719147, "learning_rate": 0.00019730704445510586, "loss": 0.7396, "step": 716 }, { "epoch": 0.10956182908660274, "grad_norm": 0.33131417632102966, "learning_rate": 0.0001972955267405804, "loss": 0.6598, "step": 717 }, { "epoch": 0.10971463498491042, "grad_norm": 0.21372541785240173, "learning_rate": 0.00019728398478569115, "loss": 0.5871, "step": 718 }, { "epoch": 0.10986744088321809, "grad_norm": 0.34117481112480164, "learning_rate": 0.00019727241859331373, "loss": 0.903, "step": 719 }, { "epoch": 0.11002024678152576, "grad_norm": 0.4706093668937683, "learning_rate": 0.00019726082816632975, "loss": 0.605, "step": 720 }, { "epoch": 0.11017305267983345, "grad_norm": 0.23070864379405975, "learning_rate": 0.00019724921350762684, "loss": 0.7316, "step": 721 }, { "epoch": 0.11032585857814112, "grad_norm": 0.4054511487483978, "learning_rate": 0.00019723757462009875, "loss": 0.6363, "step": 722 }, { "epoch": 0.11047866447644879, "grad_norm": 0.23478427529335022, "learning_rate": 0.00019722591150664518, "loss": 0.755, "step": 723 }, { "epoch": 0.11063147037475647, "grad_norm": 0.5467060804367065, "learning_rate": 0.00019721422417017185, "loss": 0.6103, "step": 724 }, { "epoch": 0.11078427627306414, "grad_norm": 0.32651928067207336, "learning_rate": 0.00019720251261359065, "loss": 0.9015, "step": 725 }, { "epoch": 0.11093708217137181, "grad_norm": 0.33620190620422363, "learning_rate": 0.00019719077683981936, "loss": 0.766, "step": 726 }, { "epoch": 0.1110898880696795, "grad_norm": 0.22980651259422302, "learning_rate": 0.0001971790168517819, "loss": 0.551, "step": 727 }, { "epoch": 0.11124269396798717, "grad_norm": 0.2606430649757385, "learning_rate": 0.00019716723265240807, "loss": 0.5819, "step": 728 }, { "epoch": 0.11139549986629484, "grad_norm": 0.25085484981536865, "learning_rate": 0.00019715542424463388, "loss": 0.734, "step": 729 }, { "epoch": 0.11154830576460252, "grad_norm": 0.29355061054229736, "learning_rate": 0.00019714359163140133, "loss": 0.7688, "step": 730 }, { "epoch": 0.11170111166291019, "grad_norm": 0.34205570816993713, "learning_rate": 0.00019713173481565837, "loss": 0.976, "step": 731 }, { "epoch": 0.11185391756121786, "grad_norm": 0.29738330841064453, "learning_rate": 0.000197119853800359, "loss": 0.7573, "step": 732 }, { "epoch": 0.11200672345952553, "grad_norm": 0.26155439019203186, "learning_rate": 0.0001971079485884633, "loss": 0.7691, "step": 733 }, { "epoch": 0.11215952935783322, "grad_norm": 0.3583777844905853, "learning_rate": 0.00019709601918293737, "loss": 0.7932, "step": 734 }, { "epoch": 0.11231233525614089, "grad_norm": 0.285895437002182, "learning_rate": 0.00019708406558675333, "loss": 0.7157, "step": 735 }, { "epoch": 0.11246514115444856, "grad_norm": 0.26534533500671387, "learning_rate": 0.00019707208780288924, "loss": 0.6047, "step": 736 }, { "epoch": 0.11261794705275624, "grad_norm": 0.3675645887851715, "learning_rate": 0.00019706008583432935, "loss": 0.6816, "step": 737 }, { "epoch": 0.11277075295106391, "grad_norm": 0.2926788926124573, "learning_rate": 0.00019704805968406383, "loss": 0.794, "step": 738 }, { "epoch": 0.11292355884937158, "grad_norm": 0.31891047954559326, "learning_rate": 0.00019703600935508888, "loss": 0.856, "step": 739 }, { "epoch": 0.11307636474767926, "grad_norm": 0.32140710949897766, "learning_rate": 0.00019702393485040672, "loss": 0.6825, "step": 740 }, { "epoch": 0.11322917064598693, "grad_norm": 0.2733408212661743, "learning_rate": 0.00019701183617302568, "loss": 0.7611, "step": 741 }, { "epoch": 0.1133819765442946, "grad_norm": 0.22607572376728058, "learning_rate": 0.00019699971332595996, "loss": 0.5884, "step": 742 }, { "epoch": 0.11353478244260229, "grad_norm": 0.29300516843795776, "learning_rate": 0.00019698756631222994, "loss": 0.6787, "step": 743 }, { "epoch": 0.11368758834090996, "grad_norm": 0.39608168601989746, "learning_rate": 0.0001969753951348619, "loss": 0.6543, "step": 744 }, { "epoch": 0.11384039423921763, "grad_norm": 0.2555294632911682, "learning_rate": 0.00019696319979688816, "loss": 0.5899, "step": 745 }, { "epoch": 0.11399320013752531, "grad_norm": 0.2862085700035095, "learning_rate": 0.00019695098030134717, "loss": 0.7661, "step": 746 }, { "epoch": 0.11414600603583298, "grad_norm": 0.2918783128261566, "learning_rate": 0.00019693873665128323, "loss": 0.6101, "step": 747 }, { "epoch": 0.11429881193414065, "grad_norm": 0.22642338275909424, "learning_rate": 0.0001969264688497468, "loss": 0.7746, "step": 748 }, { "epoch": 0.11445161783244834, "grad_norm": 0.3243122398853302, "learning_rate": 0.00019691417689979428, "loss": 0.8686, "step": 749 }, { "epoch": 0.11460442373075601, "grad_norm": 0.45320257544517517, "learning_rate": 0.0001969018608044881, "loss": 0.8523, "step": 750 }, { "epoch": 0.11475722962906368, "grad_norm": 0.27918341755867004, "learning_rate": 0.00019688952056689672, "loss": 0.6287, "step": 751 }, { "epoch": 0.11491003552737136, "grad_norm": 0.24446265399456024, "learning_rate": 0.0001968771561900946, "loss": 0.7383, "step": 752 }, { "epoch": 0.11506284142567903, "grad_norm": 0.24698710441589355, "learning_rate": 0.00019686476767716225, "loss": 0.5625, "step": 753 }, { "epoch": 0.1152156473239867, "grad_norm": 0.2587762773036957, "learning_rate": 0.00019685235503118614, "loss": 0.6205, "step": 754 }, { "epoch": 0.11536845322229439, "grad_norm": 0.2515849769115448, "learning_rate": 0.00019683991825525875, "loss": 0.5296, "step": 755 }, { "epoch": 0.11552125912060206, "grad_norm": 0.2782059907913208, "learning_rate": 0.00019682745735247862, "loss": 0.7873, "step": 756 }, { "epoch": 0.11567406501890973, "grad_norm": 0.2650497257709503, "learning_rate": 0.0001968149723259503, "loss": 0.5342, "step": 757 }, { "epoch": 0.1158268709172174, "grad_norm": 0.2502165138721466, "learning_rate": 0.00019680246317878433, "loss": 0.6457, "step": 758 }, { "epoch": 0.11597967681552508, "grad_norm": 0.2777862250804901, "learning_rate": 0.00019678992991409723, "loss": 0.6767, "step": 759 }, { "epoch": 0.11613248271383275, "grad_norm": 0.213461235165596, "learning_rate": 0.00019677737253501155, "loss": 0.6959, "step": 760 }, { "epoch": 0.11628528861214042, "grad_norm": 0.28070124983787537, "learning_rate": 0.0001967647910446559, "loss": 0.6332, "step": 761 }, { "epoch": 0.1164380945104481, "grad_norm": 0.38696399331092834, "learning_rate": 0.00019675218544616482, "loss": 0.7009, "step": 762 }, { "epoch": 0.11659090040875578, "grad_norm": 0.24727767705917358, "learning_rate": 0.00019673955574267887, "loss": 0.6498, "step": 763 }, { "epoch": 0.11674370630706345, "grad_norm": 0.3144250810146332, "learning_rate": 0.00019672690193734468, "loss": 0.6396, "step": 764 }, { "epoch": 0.11689651220537113, "grad_norm": 0.30413126945495605, "learning_rate": 0.00019671422403331486, "loss": 0.6576, "step": 765 }, { "epoch": 0.1170493181036788, "grad_norm": 0.2549903392791748, "learning_rate": 0.00019670152203374793, "loss": 0.6948, "step": 766 }, { "epoch": 0.11720212400198647, "grad_norm": 0.2680594325065613, "learning_rate": 0.00019668879594180854, "loss": 0.9079, "step": 767 }, { "epoch": 0.11735492990029416, "grad_norm": 0.3748367726802826, "learning_rate": 0.00019667604576066724, "loss": 0.6821, "step": 768 }, { "epoch": 0.11750773579860183, "grad_norm": 0.2943898141384125, "learning_rate": 0.00019666327149350067, "loss": 0.6278, "step": 769 }, { "epoch": 0.1176605416969095, "grad_norm": 0.30158859491348267, "learning_rate": 0.00019665047314349146, "loss": 0.7498, "step": 770 }, { "epoch": 0.11781334759521718, "grad_norm": 0.2680291533470154, "learning_rate": 0.0001966376507138281, "loss": 0.6299, "step": 771 }, { "epoch": 0.11796615349352485, "grad_norm": 0.24324612319469452, "learning_rate": 0.00019662480420770532, "loss": 0.6987, "step": 772 }, { "epoch": 0.11811895939183252, "grad_norm": 0.2948191463947296, "learning_rate": 0.00019661193362832365, "loss": 0.6614, "step": 773 }, { "epoch": 0.1182717652901402, "grad_norm": 0.3052494525909424, "learning_rate": 0.00019659903897888972, "loss": 0.7924, "step": 774 }, { "epoch": 0.11842457118844787, "grad_norm": 0.2476397603750229, "learning_rate": 0.00019658612026261606, "loss": 0.7096, "step": 775 }, { "epoch": 0.11857737708675554, "grad_norm": 0.21247366070747375, "learning_rate": 0.00019657317748272135, "loss": 0.8716, "step": 776 }, { "epoch": 0.11873018298506323, "grad_norm": 0.32150718569755554, "learning_rate": 0.00019656021064243012, "loss": 0.647, "step": 777 }, { "epoch": 0.1188829888833709, "grad_norm": 0.3018513321876526, "learning_rate": 0.00019654721974497294, "loss": 0.6629, "step": 778 }, { "epoch": 0.11903579478167857, "grad_norm": 0.27877163887023926, "learning_rate": 0.00019653420479358639, "loss": 0.6738, "step": 779 }, { "epoch": 0.11918860067998625, "grad_norm": 0.2589527368545532, "learning_rate": 0.0001965211657915131, "loss": 0.6804, "step": 780 }, { "epoch": 0.11934140657829392, "grad_norm": 0.4898923337459564, "learning_rate": 0.00019650810274200153, "loss": 0.6081, "step": 781 }, { "epoch": 0.1194942124766016, "grad_norm": 0.30923375487327576, "learning_rate": 0.0001964950156483063, "loss": 0.7302, "step": 782 }, { "epoch": 0.11964701837490928, "grad_norm": 0.26589056849479675, "learning_rate": 0.0001964819045136879, "loss": 0.797, "step": 783 }, { "epoch": 0.11979982427321695, "grad_norm": 0.24651648104190826, "learning_rate": 0.00019646876934141289, "loss": 0.6002, "step": 784 }, { "epoch": 0.11995263017152462, "grad_norm": 0.2416309118270874, "learning_rate": 0.0001964556101347538, "loss": 0.68, "step": 785 }, { "epoch": 0.12010543606983229, "grad_norm": 0.23243074119091034, "learning_rate": 0.00019644242689698907, "loss": 0.9595, "step": 786 }, { "epoch": 0.12025824196813997, "grad_norm": 0.3017045557498932, "learning_rate": 0.00019642921963140331, "loss": 0.7222, "step": 787 }, { "epoch": 0.12041104786644764, "grad_norm": 0.2747794985771179, "learning_rate": 0.00019641598834128687, "loss": 0.7744, "step": 788 }, { "epoch": 0.12056385376475531, "grad_norm": 0.24212798476219177, "learning_rate": 0.0001964027330299363, "loss": 0.7895, "step": 789 }, { "epoch": 0.120716659663063, "grad_norm": 0.2827460467815399, "learning_rate": 0.000196389453700654, "loss": 0.551, "step": 790 }, { "epoch": 0.12086946556137067, "grad_norm": 0.25816842913627625, "learning_rate": 0.00019637615035674846, "loss": 0.7097, "step": 791 }, { "epoch": 0.12102227145967834, "grad_norm": 0.2768022119998932, "learning_rate": 0.00019636282300153406, "loss": 0.4357, "step": 792 }, { "epoch": 0.12117507735798602, "grad_norm": 0.31443700194358826, "learning_rate": 0.00019634947163833116, "loss": 0.587, "step": 793 }, { "epoch": 0.12132788325629369, "grad_norm": 0.3125135898590088, "learning_rate": 0.00019633609627046623, "loss": 0.707, "step": 794 }, { "epoch": 0.12148068915460136, "grad_norm": 0.2981247007846832, "learning_rate": 0.00019632269690127158, "loss": 0.792, "step": 795 }, { "epoch": 0.12163349505290905, "grad_norm": 0.23891063034534454, "learning_rate": 0.00019630927353408553, "loss": 0.6062, "step": 796 }, { "epoch": 0.12178630095121672, "grad_norm": 0.2705599069595337, "learning_rate": 0.0001962958261722524, "loss": 0.6864, "step": 797 }, { "epoch": 0.12193910684952439, "grad_norm": 0.25540342926979065, "learning_rate": 0.00019628235481912256, "loss": 0.6466, "step": 798 }, { "epoch": 0.12209191274783207, "grad_norm": 0.2492659091949463, "learning_rate": 0.00019626885947805222, "loss": 0.8629, "step": 799 }, { "epoch": 0.12224471864613974, "grad_norm": 0.23218658566474915, "learning_rate": 0.00019625534015240366, "loss": 0.5242, "step": 800 }, { "epoch": 0.12239752454444741, "grad_norm": 0.2692018151283264, "learning_rate": 0.00019624179684554505, "loss": 0.6638, "step": 801 }, { "epoch": 0.1225503304427551, "grad_norm": 0.3122497498989105, "learning_rate": 0.00019622822956085067, "loss": 0.8883, "step": 802 }, { "epoch": 0.12270313634106277, "grad_norm": 0.46200600266456604, "learning_rate": 0.00019621463830170064, "loss": 0.8743, "step": 803 }, { "epoch": 0.12285594223937044, "grad_norm": 0.28840991854667664, "learning_rate": 0.00019620102307148113, "loss": 0.6618, "step": 804 }, { "epoch": 0.12300874813767812, "grad_norm": 0.23175831139087677, "learning_rate": 0.00019618738387358424, "loss": 0.6825, "step": 805 }, { "epoch": 0.12316155403598579, "grad_norm": 0.35594913363456726, "learning_rate": 0.0001961737207114081, "loss": 0.6265, "step": 806 }, { "epoch": 0.12331435993429346, "grad_norm": 0.27673351764678955, "learning_rate": 0.00019616003358835675, "loss": 0.6526, "step": 807 }, { "epoch": 0.12346716583260114, "grad_norm": 0.26578110456466675, "learning_rate": 0.00019614632250784022, "loss": 0.7232, "step": 808 }, { "epoch": 0.12361997173090881, "grad_norm": 0.22138115763664246, "learning_rate": 0.0001961325874732745, "loss": 0.4978, "step": 809 }, { "epoch": 0.12377277762921648, "grad_norm": 0.34308725595474243, "learning_rate": 0.0001961188284880816, "loss": 0.7385, "step": 810 }, { "epoch": 0.12392558352752417, "grad_norm": 0.2813006341457367, "learning_rate": 0.0001961050455556894, "loss": 0.8297, "step": 811 }, { "epoch": 0.12407838942583184, "grad_norm": 0.24640551209449768, "learning_rate": 0.00019609123867953186, "loss": 0.9333, "step": 812 }, { "epoch": 0.12423119532413951, "grad_norm": 0.2680695652961731, "learning_rate": 0.00019607740786304877, "loss": 0.9178, "step": 813 }, { "epoch": 0.12438400122244718, "grad_norm": 0.2718088924884796, "learning_rate": 0.00019606355310968602, "loss": 0.6672, "step": 814 }, { "epoch": 0.12453680712075486, "grad_norm": 0.24112388491630554, "learning_rate": 0.0001960496744228954, "loss": 0.7025, "step": 815 }, { "epoch": 0.12468961301906253, "grad_norm": 0.30206188559532166, "learning_rate": 0.00019603577180613468, "loss": 0.6088, "step": 816 }, { "epoch": 0.1248424189173702, "grad_norm": 0.24509626626968384, "learning_rate": 0.00019602184526286757, "loss": 0.7398, "step": 817 }, { "epoch": 0.12499522481567789, "grad_norm": 0.2561381757259369, "learning_rate": 0.0001960078947965637, "loss": 0.6174, "step": 818 }, { "epoch": 0.12514803071398556, "grad_norm": 0.26984918117523193, "learning_rate": 0.00019599392041069877, "loss": 0.7665, "step": 819 }, { "epoch": 0.12530083661229324, "grad_norm": 0.3260689675807953, "learning_rate": 0.00019597992210875439, "loss": 0.8475, "step": 820 }, { "epoch": 0.1254536425106009, "grad_norm": 0.3531004786491394, "learning_rate": 0.00019596589989421807, "loss": 0.5424, "step": 821 }, { "epoch": 0.12560644840890858, "grad_norm": 0.281276673078537, "learning_rate": 0.00019595185377058337, "loss": 0.7901, "step": 822 }, { "epoch": 0.12575925430721627, "grad_norm": 0.32058045268058777, "learning_rate": 0.00019593778374134974, "loss": 0.5447, "step": 823 }, { "epoch": 0.12591206020552392, "grad_norm": 0.2947518229484558, "learning_rate": 0.0001959236898100226, "loss": 0.598, "step": 824 }, { "epoch": 0.1260648661038316, "grad_norm": 0.2923273742198944, "learning_rate": 0.0001959095719801134, "loss": 0.6917, "step": 825 }, { "epoch": 0.1262176720021393, "grad_norm": 0.42918404936790466, "learning_rate": 0.00019589543025513937, "loss": 0.7697, "step": 826 }, { "epoch": 0.12637047790044695, "grad_norm": 0.24322174489498138, "learning_rate": 0.00019588126463862388, "loss": 0.6655, "step": 827 }, { "epoch": 0.12652328379875463, "grad_norm": 0.2740302085876465, "learning_rate": 0.00019586707513409617, "loss": 0.8133, "step": 828 }, { "epoch": 0.12667608969706232, "grad_norm": 0.32025039196014404, "learning_rate": 0.00019585286174509143, "loss": 0.7481, "step": 829 }, { "epoch": 0.12682889559536997, "grad_norm": 0.27031072974205017, "learning_rate": 0.00019583862447515075, "loss": 0.7403, "step": 830 }, { "epoch": 0.12698170149367766, "grad_norm": 0.2899322807788849, "learning_rate": 0.00019582436332782132, "loss": 0.7497, "step": 831 }, { "epoch": 0.12713450739198534, "grad_norm": 0.2485196739435196, "learning_rate": 0.00019581007830665615, "loss": 0.7895, "step": 832 }, { "epoch": 0.127287313290293, "grad_norm": 0.3776859939098358, "learning_rate": 0.00019579576941521418, "loss": 0.8331, "step": 833 }, { "epoch": 0.12744011918860068, "grad_norm": 0.3224911391735077, "learning_rate": 0.0001957814366570604, "loss": 0.7736, "step": 834 }, { "epoch": 0.12759292508690837, "grad_norm": 0.23357270658016205, "learning_rate": 0.0001957670800357657, "loss": 0.7477, "step": 835 }, { "epoch": 0.12774573098521602, "grad_norm": 0.327070415019989, "learning_rate": 0.00019575269955490691, "loss": 0.9605, "step": 836 }, { "epoch": 0.1278985368835237, "grad_norm": 0.3038500249385834, "learning_rate": 0.0001957382952180668, "loss": 0.7441, "step": 837 }, { "epoch": 0.1280513427818314, "grad_norm": 0.25796058773994446, "learning_rate": 0.0001957238670288341, "loss": 0.622, "step": 838 }, { "epoch": 0.12820414868013905, "grad_norm": 0.41515886783599854, "learning_rate": 0.00019570941499080343, "loss": 1.0251, "step": 839 }, { "epoch": 0.12835695457844673, "grad_norm": 0.276103138923645, "learning_rate": 0.00019569493910757542, "loss": 0.7484, "step": 840 }, { "epoch": 0.12850976047675441, "grad_norm": 0.2659948170185089, "learning_rate": 0.00019568043938275663, "loss": 0.6444, "step": 841 }, { "epoch": 0.12866256637506207, "grad_norm": 0.3016412854194641, "learning_rate": 0.00019566591581995953, "loss": 0.6506, "step": 842 }, { "epoch": 0.12881537227336975, "grad_norm": 0.29354071617126465, "learning_rate": 0.00019565136842280255, "loss": 0.7701, "step": 843 }, { "epoch": 0.12896817817167744, "grad_norm": 0.2543928325176239, "learning_rate": 0.00019563679719491003, "loss": 0.6735, "step": 844 }, { "epoch": 0.1291209840699851, "grad_norm": 0.31514692306518555, "learning_rate": 0.00019562220213991232, "loss": 0.7412, "step": 845 }, { "epoch": 0.12927378996829278, "grad_norm": 0.26890984177589417, "learning_rate": 0.00019560758326144558, "loss": 0.737, "step": 846 }, { "epoch": 0.12942659586660046, "grad_norm": 0.41693368554115295, "learning_rate": 0.00019559294056315207, "loss": 0.9572, "step": 847 }, { "epoch": 0.12957940176490812, "grad_norm": 0.24249470233917236, "learning_rate": 0.00019557827404867984, "loss": 0.5923, "step": 848 }, { "epoch": 0.1297322076632158, "grad_norm": 0.2592976689338684, "learning_rate": 0.00019556358372168294, "loss": 0.6565, "step": 849 }, { "epoch": 0.1298850135615235, "grad_norm": 0.25530290603637695, "learning_rate": 0.0001955488695858213, "loss": 0.6702, "step": 850 }, { "epoch": 0.13003781945983114, "grad_norm": 0.27146828174591064, "learning_rate": 0.00019553413164476088, "loss": 0.7106, "step": 851 }, { "epoch": 0.13019062535813883, "grad_norm": 0.24907812476158142, "learning_rate": 0.00019551936990217352, "loss": 0.5918, "step": 852 }, { "epoch": 0.1303434312564465, "grad_norm": 0.4307115972042084, "learning_rate": 0.00019550458436173694, "loss": 0.6567, "step": 853 }, { "epoch": 0.13049623715475417, "grad_norm": 0.2714177966117859, "learning_rate": 0.0001954897750271349, "loss": 0.6176, "step": 854 }, { "epoch": 0.13064904305306185, "grad_norm": 0.2820480167865753, "learning_rate": 0.0001954749419020569, "loss": 0.6196, "step": 855 }, { "epoch": 0.1308018489513695, "grad_norm": 0.34379422664642334, "learning_rate": 0.00019546008499019864, "loss": 0.6766, "step": 856 }, { "epoch": 0.1309546548496772, "grad_norm": 0.3182826638221741, "learning_rate": 0.00019544520429526146, "loss": 0.7375, "step": 857 }, { "epoch": 0.13110746074798488, "grad_norm": 0.2834482789039612, "learning_rate": 0.00019543029982095286, "loss": 0.6309, "step": 858 }, { "epoch": 0.13126026664629253, "grad_norm": 0.2880655527114868, "learning_rate": 0.0001954153715709861, "loss": 0.835, "step": 859 }, { "epoch": 0.13141307254460022, "grad_norm": 0.2973262667655945, "learning_rate": 0.0001954004195490805, "loss": 0.614, "step": 860 }, { "epoch": 0.1315658784429079, "grad_norm": 0.2937520742416382, "learning_rate": 0.0001953854437589611, "loss": 0.6843, "step": 861 }, { "epoch": 0.13171868434121556, "grad_norm": 0.24553163349628448, "learning_rate": 0.00019537044420435914, "loss": 0.716, "step": 862 }, { "epoch": 0.13187149023952324, "grad_norm": 0.2711239755153656, "learning_rate": 0.00019535542088901155, "loss": 0.6589, "step": 863 }, { "epoch": 0.13202429613783093, "grad_norm": 0.29433688521385193, "learning_rate": 0.0001953403738166613, "loss": 0.71, "step": 864 }, { "epoch": 0.13217710203613858, "grad_norm": 0.34008464217185974, "learning_rate": 0.0001953253029910572, "loss": 0.7356, "step": 865 }, { "epoch": 0.13232990793444627, "grad_norm": 0.26458245515823364, "learning_rate": 0.00019531020841595406, "loss": 0.708, "step": 866 }, { "epoch": 0.13248271383275395, "grad_norm": 0.3054756224155426, "learning_rate": 0.00019529509009511253, "loss": 0.5603, "step": 867 }, { "epoch": 0.1326355197310616, "grad_norm": 0.26879218220710754, "learning_rate": 0.00019527994803229926, "loss": 0.7848, "step": 868 }, { "epoch": 0.1327883256293693, "grad_norm": 0.29384973645210266, "learning_rate": 0.0001952647822312867, "loss": 0.6419, "step": 869 }, { "epoch": 0.13294113152767698, "grad_norm": 0.2679104208946228, "learning_rate": 0.00019524959269585337, "loss": 0.6762, "step": 870 }, { "epoch": 0.13309393742598463, "grad_norm": 0.21466131508350372, "learning_rate": 0.00019523437942978357, "loss": 0.6237, "step": 871 }, { "epoch": 0.13324674332429232, "grad_norm": 0.34037020802497864, "learning_rate": 0.0001952191424368675, "loss": 0.6994, "step": 872 }, { "epoch": 0.1333995492226, "grad_norm": 0.2652078866958618, "learning_rate": 0.00019520388172090142, "loss": 0.6774, "step": 873 }, { "epoch": 0.13355235512090766, "grad_norm": 0.2574101686477661, "learning_rate": 0.00019518859728568736, "loss": 0.6073, "step": 874 }, { "epoch": 0.13370516101921534, "grad_norm": 0.2610401213169098, "learning_rate": 0.00019517328913503334, "loss": 0.6916, "step": 875 }, { "epoch": 0.13385796691752302, "grad_norm": 0.23128172755241394, "learning_rate": 0.00019515795727275323, "loss": 0.7244, "step": 876 }, { "epoch": 0.13401077281583068, "grad_norm": 0.2592519521713257, "learning_rate": 0.00019514260170266687, "loss": 0.6513, "step": 877 }, { "epoch": 0.13416357871413837, "grad_norm": 0.23765848577022552, "learning_rate": 0.00019512722242859992, "loss": 0.7319, "step": 878 }, { "epoch": 0.13431638461244605, "grad_norm": 0.2605260908603668, "learning_rate": 0.00019511181945438402, "loss": 0.7414, "step": 879 }, { "epoch": 0.1344691905107537, "grad_norm": 0.2504040002822876, "learning_rate": 0.00019509639278385673, "loss": 0.6466, "step": 880 }, { "epoch": 0.1346219964090614, "grad_norm": 0.37945783138275146, "learning_rate": 0.00019508094242086138, "loss": 0.6494, "step": 881 }, { "epoch": 0.13477480230736907, "grad_norm": 0.2152481973171234, "learning_rate": 0.0001950654683692474, "loss": 0.7879, "step": 882 }, { "epoch": 0.13492760820567673, "grad_norm": 0.24629932641983032, "learning_rate": 0.00019504997063286999, "loss": 0.7656, "step": 883 }, { "epoch": 0.13508041410398441, "grad_norm": 0.3862961530685425, "learning_rate": 0.00019503444921559023, "loss": 1.0176, "step": 884 }, { "epoch": 0.1352332200022921, "grad_norm": 0.30331647396087646, "learning_rate": 0.0001950189041212752, "loss": 0.8648, "step": 885 }, { "epoch": 0.13538602590059975, "grad_norm": 0.24901315569877625, "learning_rate": 0.00019500333535379783, "loss": 0.6522, "step": 886 }, { "epoch": 0.13553883179890744, "grad_norm": 0.24886654317378998, "learning_rate": 0.00019498774291703695, "loss": 0.6478, "step": 887 }, { "epoch": 0.13569163769721512, "grad_norm": 0.7643476724624634, "learning_rate": 0.00019497212681487725, "loss": 0.6913, "step": 888 }, { "epoch": 0.13584444359552278, "grad_norm": 0.2617362141609192, "learning_rate": 0.00019495648705120938, "loss": 0.7314, "step": 889 }, { "epoch": 0.13599724949383046, "grad_norm": 0.2617185711860657, "learning_rate": 0.00019494082362992986, "loss": 0.6769, "step": 890 }, { "epoch": 0.13615005539213815, "grad_norm": 0.2449088990688324, "learning_rate": 0.00019492513655494106, "loss": 0.6773, "step": 891 }, { "epoch": 0.1363028612904458, "grad_norm": 0.27909642457962036, "learning_rate": 0.00019490942583015133, "loss": 0.8005, "step": 892 }, { "epoch": 0.1364556671887535, "grad_norm": 0.269930899143219, "learning_rate": 0.00019489369145947487, "loss": 0.8991, "step": 893 }, { "epoch": 0.13660847308706117, "grad_norm": 0.27242738008499146, "learning_rate": 0.00019487793344683172, "loss": 0.6498, "step": 894 }, { "epoch": 0.13676127898536883, "grad_norm": 0.23424513638019562, "learning_rate": 0.00019486215179614788, "loss": 0.6458, "step": 895 }, { "epoch": 0.1369140848836765, "grad_norm": 0.367795467376709, "learning_rate": 0.0001948463465113552, "loss": 0.6949, "step": 896 }, { "epoch": 0.1370668907819842, "grad_norm": 0.31714144349098206, "learning_rate": 0.00019483051759639148, "loss": 0.5297, "step": 897 }, { "epoch": 0.13721969668029185, "grad_norm": 0.2915026545524597, "learning_rate": 0.00019481466505520033, "loss": 0.7198, "step": 898 }, { "epoch": 0.13737250257859954, "grad_norm": 0.25416919589042664, "learning_rate": 0.00019479878889173128, "loss": 0.7209, "step": 899 }, { "epoch": 0.13752530847690722, "grad_norm": 0.26738041639328003, "learning_rate": 0.0001947828891099397, "loss": 0.6714, "step": 900 }, { "epoch": 0.13767811437521488, "grad_norm": 0.28009870648384094, "learning_rate": 0.00019476696571378699, "loss": 0.7738, "step": 901 }, { "epoch": 0.13783092027352256, "grad_norm": 0.26281601190567017, "learning_rate": 0.00019475101870724024, "loss": 0.6876, "step": 902 }, { "epoch": 0.13798372617183025, "grad_norm": 0.3066788613796234, "learning_rate": 0.00019473504809427254, "loss": 0.6356, "step": 903 }, { "epoch": 0.1381365320701379, "grad_norm": 0.2479480803012848, "learning_rate": 0.00019471905387886281, "loss": 0.6844, "step": 904 }, { "epoch": 0.13828933796844559, "grad_norm": 0.2982046902179718, "learning_rate": 0.00019470303606499597, "loss": 0.6945, "step": 905 }, { "epoch": 0.13844214386675327, "grad_norm": 0.2929346263408661, "learning_rate": 0.0001946869946566626, "loss": 0.9048, "step": 906 }, { "epoch": 0.13859494976506093, "grad_norm": 0.2749553322792053, "learning_rate": 0.00019467092965785933, "loss": 0.6481, "step": 907 }, { "epoch": 0.1387477556633686, "grad_norm": 0.25245675444602966, "learning_rate": 0.00019465484107258866, "loss": 0.5614, "step": 908 }, { "epoch": 0.1389005615616763, "grad_norm": 0.278685599565506, "learning_rate": 0.00019463872890485888, "loss": 0.6961, "step": 909 }, { "epoch": 0.13905336745998395, "grad_norm": 0.27726492285728455, "learning_rate": 0.0001946225931586842, "loss": 0.8507, "step": 910 }, { "epoch": 0.13920617335829163, "grad_norm": 0.23716701567173004, "learning_rate": 0.00019460643383808473, "loss": 0.658, "step": 911 }, { "epoch": 0.1393589792565993, "grad_norm": 0.22296889126300812, "learning_rate": 0.00019459025094708645, "loss": 0.57, "step": 912 }, { "epoch": 0.13951178515490698, "grad_norm": 0.2558571696281433, "learning_rate": 0.0001945740444897211, "loss": 0.767, "step": 913 }, { "epoch": 0.13966459105321466, "grad_norm": 0.2778489291667938, "learning_rate": 0.0001945578144700265, "loss": 0.573, "step": 914 }, { "epoch": 0.13981739695152232, "grad_norm": 0.26163187623023987, "learning_rate": 0.00019454156089204614, "loss": 0.6999, "step": 915 }, { "epoch": 0.13997020284983, "grad_norm": 0.21346476674079895, "learning_rate": 0.0001945252837598295, "loss": 0.7107, "step": 916 }, { "epoch": 0.14012300874813768, "grad_norm": 0.30867141485214233, "learning_rate": 0.00019450898307743185, "loss": 0.7034, "step": 917 }, { "epoch": 0.14027581464644534, "grad_norm": 0.31402018666267395, "learning_rate": 0.00019449265884891444, "loss": 0.7362, "step": 918 }, { "epoch": 0.14042862054475302, "grad_norm": 0.2718082666397095, "learning_rate": 0.00019447631107834422, "loss": 0.6461, "step": 919 }, { "epoch": 0.1405814264430607, "grad_norm": 0.2874963581562042, "learning_rate": 0.0001944599397697942, "loss": 0.7703, "step": 920 }, { "epoch": 0.14073423234136836, "grad_norm": 0.2410213053226471, "learning_rate": 0.00019444354492734308, "loss": 0.8031, "step": 921 }, { "epoch": 0.14088703823967605, "grad_norm": 0.3052217960357666, "learning_rate": 0.00019442712655507553, "loss": 0.6492, "step": 922 }, { "epoch": 0.14103984413798373, "grad_norm": 0.253045916557312, "learning_rate": 0.00019441068465708204, "loss": 0.7135, "step": 923 }, { "epoch": 0.1411926500362914, "grad_norm": 0.26858294010162354, "learning_rate": 0.00019439421923745897, "loss": 0.6473, "step": 924 }, { "epoch": 0.14134545593459907, "grad_norm": 0.2760922610759735, "learning_rate": 0.00019437773030030856, "loss": 0.6578, "step": 925 }, { "epoch": 0.14149826183290676, "grad_norm": 0.27938711643218994, "learning_rate": 0.00019436121784973886, "loss": 0.6319, "step": 926 }, { "epoch": 0.14165106773121441, "grad_norm": 0.2972564399242401, "learning_rate": 0.00019434468188986385, "loss": 0.521, "step": 927 }, { "epoch": 0.1418038736295221, "grad_norm": 0.24861447513103485, "learning_rate": 0.00019432812242480327, "loss": 0.5922, "step": 928 }, { "epoch": 0.14195667952782978, "grad_norm": 0.2783520221710205, "learning_rate": 0.00019431153945868282, "loss": 0.6947, "step": 929 }, { "epoch": 0.14210948542613744, "grad_norm": 0.27699464559555054, "learning_rate": 0.00019429493299563398, "loss": 0.7188, "step": 930 }, { "epoch": 0.14226229132444512, "grad_norm": 0.272151380777359, "learning_rate": 0.00019427830303979412, "loss": 0.7472, "step": 931 }, { "epoch": 0.1424150972227528, "grad_norm": 0.2500142753124237, "learning_rate": 0.00019426164959530646, "loss": 0.699, "step": 932 }, { "epoch": 0.14256790312106046, "grad_norm": 0.22516344487667084, "learning_rate": 0.00019424497266632, "loss": 0.6416, "step": 933 }, { "epoch": 0.14272070901936815, "grad_norm": 0.24510562419891357, "learning_rate": 0.00019422827225698978, "loss": 0.6235, "step": 934 }, { "epoch": 0.14287351491767583, "grad_norm": 0.26091206073760986, "learning_rate": 0.0001942115483714765, "loss": 0.5763, "step": 935 }, { "epoch": 0.1430263208159835, "grad_norm": 0.27128151059150696, "learning_rate": 0.00019419480101394679, "loss": 0.6988, "step": 936 }, { "epoch": 0.14317912671429117, "grad_norm": 0.24952010810375214, "learning_rate": 0.00019417803018857306, "loss": 0.5557, "step": 937 }, { "epoch": 0.14333193261259886, "grad_norm": 0.24118371307849884, "learning_rate": 0.00019416123589953367, "loss": 0.7261, "step": 938 }, { "epoch": 0.1434847385109065, "grad_norm": 0.2614760994911194, "learning_rate": 0.00019414441815101277, "loss": 0.9327, "step": 939 }, { "epoch": 0.1436375444092142, "grad_norm": 0.23340481519699097, "learning_rate": 0.00019412757694720038, "loss": 0.7183, "step": 940 }, { "epoch": 0.14379035030752188, "grad_norm": 0.2697202265262604, "learning_rate": 0.0001941107122922923, "loss": 0.8033, "step": 941 }, { "epoch": 0.14394315620582954, "grad_norm": 0.2892836630344391, "learning_rate": 0.00019409382419049024, "loss": 0.7165, "step": 942 }, { "epoch": 0.14409596210413722, "grad_norm": 0.2863559126853943, "learning_rate": 0.00019407691264600177, "loss": 0.9925, "step": 943 }, { "epoch": 0.1442487680024449, "grad_norm": 0.4062459170818329, "learning_rate": 0.00019405997766304019, "loss": 0.98, "step": 944 }, { "epoch": 0.14440157390075256, "grad_norm": 0.2909787595272064, "learning_rate": 0.00019404301924582474, "loss": 0.7841, "step": 945 }, { "epoch": 0.14455437979906025, "grad_norm": 0.22701376676559448, "learning_rate": 0.00019402603739858046, "loss": 0.6955, "step": 946 }, { "epoch": 0.14470718569736793, "grad_norm": 0.24725738167762756, "learning_rate": 0.00019400903212553824, "loss": 0.542, "step": 947 }, { "epoch": 0.14485999159567559, "grad_norm": 0.6905295252799988, "learning_rate": 0.00019399200343093477, "loss": 0.7339, "step": 948 }, { "epoch": 0.14501279749398327, "grad_norm": 0.25552988052368164, "learning_rate": 0.00019397495131901268, "loss": 0.6901, "step": 949 }, { "epoch": 0.14516560339229095, "grad_norm": 0.26159825921058655, "learning_rate": 0.0001939578757940203, "loss": 0.5935, "step": 950 }, { "epoch": 0.1453184092905986, "grad_norm": 0.3060537278652191, "learning_rate": 0.0001939407768602119, "loss": 0.8255, "step": 951 }, { "epoch": 0.1454712151889063, "grad_norm": 0.3530615270137787, "learning_rate": 0.00019392365452184745, "loss": 0.8534, "step": 952 }, { "epoch": 0.14562402108721398, "grad_norm": 0.28947460651397705, "learning_rate": 0.00019390650878319297, "loss": 0.7282, "step": 953 }, { "epoch": 0.14577682698552163, "grad_norm": 0.29009896516799927, "learning_rate": 0.00019388933964852004, "loss": 0.8321, "step": 954 }, { "epoch": 0.14592963288382932, "grad_norm": 0.3136522173881531, "learning_rate": 0.0001938721471221063, "loss": 0.7298, "step": 955 }, { "epoch": 0.146082438782137, "grad_norm": 0.28415438532829285, "learning_rate": 0.00019385493120823507, "loss": 0.587, "step": 956 }, { "epoch": 0.14623524468044466, "grad_norm": 0.3139444887638092, "learning_rate": 0.00019383769191119556, "loss": 0.6301, "step": 957 }, { "epoch": 0.14638805057875234, "grad_norm": 0.25484979152679443, "learning_rate": 0.0001938204292352828, "loss": 0.7721, "step": 958 }, { "epoch": 0.14654085647706003, "grad_norm": 0.3041636049747467, "learning_rate": 0.00019380314318479772, "loss": 0.6631, "step": 959 }, { "epoch": 0.14669366237536768, "grad_norm": 0.24949052929878235, "learning_rate": 0.00019378583376404685, "loss": 0.7336, "step": 960 }, { "epoch": 0.14684646827367537, "grad_norm": 0.265553742647171, "learning_rate": 0.00019376850097734276, "loss": 0.5804, "step": 961 }, { "epoch": 0.14699927417198305, "grad_norm": 0.3368707597255707, "learning_rate": 0.0001937511448290038, "loss": 0.658, "step": 962 }, { "epoch": 0.1471520800702907, "grad_norm": 0.253562331199646, "learning_rate": 0.00019373376532335406, "loss": 0.5727, "step": 963 }, { "epoch": 0.1473048859685984, "grad_norm": 0.3116651475429535, "learning_rate": 0.00019371636246472355, "loss": 0.7254, "step": 964 }, { "epoch": 0.14745769186690605, "grad_norm": 0.31176048517227173, "learning_rate": 0.00019369893625744794, "loss": 0.5388, "step": 965 }, { "epoch": 0.14761049776521373, "grad_norm": 0.29979297518730164, "learning_rate": 0.00019368148670586893, "loss": 0.8029, "step": 966 }, { "epoch": 0.14776330366352142, "grad_norm": 0.2583375573158264, "learning_rate": 0.0001936640138143339, "loss": 0.7159, "step": 967 }, { "epoch": 0.14791610956182907, "grad_norm": 0.3004581034183502, "learning_rate": 0.00019364651758719607, "loss": 0.7379, "step": 968 }, { "epoch": 0.14806891546013676, "grad_norm": 0.27960747480392456, "learning_rate": 0.00019362899802881446, "loss": 0.6646, "step": 969 }, { "epoch": 0.14822172135844444, "grad_norm": 0.36117398738861084, "learning_rate": 0.00019361145514355395, "loss": 0.6869, "step": 970 }, { "epoch": 0.1483745272567521, "grad_norm": 0.22343234717845917, "learning_rate": 0.00019359388893578516, "loss": 0.7267, "step": 971 }, { "epoch": 0.14852733315505978, "grad_norm": 0.2645972967147827, "learning_rate": 0.00019357629940988463, "loss": 0.6949, "step": 972 }, { "epoch": 0.14868013905336747, "grad_norm": 0.3185739517211914, "learning_rate": 0.00019355868657023456, "loss": 0.8325, "step": 973 }, { "epoch": 0.14883294495167512, "grad_norm": 0.29953569173812866, "learning_rate": 0.00019354105042122311, "loss": 0.7535, "step": 974 }, { "epoch": 0.1489857508499828, "grad_norm": 0.3828144967556, "learning_rate": 0.00019352339096724417, "loss": 0.7962, "step": 975 }, { "epoch": 0.1491385567482905, "grad_norm": 0.370355486869812, "learning_rate": 0.0001935057082126974, "loss": 0.7899, "step": 976 }, { "epoch": 0.14929136264659815, "grad_norm": 0.3137153685092926, "learning_rate": 0.00019348800216198835, "loss": 0.681, "step": 977 }, { "epoch": 0.14944416854490583, "grad_norm": 0.25897806882858276, "learning_rate": 0.00019347027281952834, "loss": 0.7847, "step": 978 }, { "epoch": 0.14959697444321352, "grad_norm": 0.23975513875484467, "learning_rate": 0.00019345252018973446, "loss": 0.7329, "step": 979 }, { "epoch": 0.14974978034152117, "grad_norm": 0.5979729294776917, "learning_rate": 0.0001934347442770296, "loss": 0.7083, "step": 980 }, { "epoch": 0.14990258623982886, "grad_norm": 0.29401421546936035, "learning_rate": 0.00019341694508584256, "loss": 0.5436, "step": 981 }, { "epoch": 0.15005539213813654, "grad_norm": 0.31312599778175354, "learning_rate": 0.0001933991226206078, "loss": 0.7891, "step": 982 }, { "epoch": 0.1502081980364442, "grad_norm": 0.2626318633556366, "learning_rate": 0.00019338127688576566, "loss": 0.8839, "step": 983 }, { "epoch": 0.15036100393475188, "grad_norm": 0.3405093848705292, "learning_rate": 0.00019336340788576225, "loss": 0.6524, "step": 984 }, { "epoch": 0.15051380983305956, "grad_norm": 0.23463614284992218, "learning_rate": 0.00019334551562504948, "loss": 0.6376, "step": 985 }, { "epoch": 0.15066661573136722, "grad_norm": 0.2980091869831085, "learning_rate": 0.00019332760010808505, "loss": 0.5438, "step": 986 }, { "epoch": 0.1508194216296749, "grad_norm": 0.4364708662033081, "learning_rate": 0.00019330966133933246, "loss": 0.8847, "step": 987 }, { "epoch": 0.1509722275279826, "grad_norm": 0.3133280575275421, "learning_rate": 0.00019329169932326103, "loss": 0.8726, "step": 988 }, { "epoch": 0.15112503342629025, "grad_norm": 0.24318096041679382, "learning_rate": 0.0001932737140643458, "loss": 0.8904, "step": 989 }, { "epoch": 0.15127783932459793, "grad_norm": 0.2581064999103546, "learning_rate": 0.00019325570556706772, "loss": 0.7594, "step": 990 }, { "epoch": 0.1514306452229056, "grad_norm": 0.2756637930870056, "learning_rate": 0.00019323767383591338, "loss": 0.7214, "step": 991 }, { "epoch": 0.15158345112121327, "grad_norm": 0.2461249828338623, "learning_rate": 0.00019321961887537524, "loss": 0.5824, "step": 992 }, { "epoch": 0.15173625701952095, "grad_norm": 0.2575419247150421, "learning_rate": 0.00019320154068995163, "loss": 0.9961, "step": 993 }, { "epoch": 0.15188906291782864, "grad_norm": 0.28650832176208496, "learning_rate": 0.00019318343928414645, "loss": 0.7662, "step": 994 }, { "epoch": 0.1520418688161363, "grad_norm": 0.29323071241378784, "learning_rate": 0.00019316531466246964, "loss": 0.8253, "step": 995 }, { "epoch": 0.15219467471444398, "grad_norm": 0.2523307502269745, "learning_rate": 0.00019314716682943667, "loss": 0.7602, "step": 996 }, { "epoch": 0.15234748061275166, "grad_norm": 0.2807372212409973, "learning_rate": 0.000193128995789569, "loss": 0.671, "step": 997 }, { "epoch": 0.15250028651105932, "grad_norm": 0.3469073176383972, "learning_rate": 0.0001931108015473938, "loss": 0.6428, "step": 998 }, { "epoch": 0.152653092409367, "grad_norm": 0.2644406855106354, "learning_rate": 0.00019309258410744399, "loss": 0.7001, "step": 999 }, { "epoch": 0.1528058983076747, "grad_norm": 0.23576515913009644, "learning_rate": 0.00019307434347425826, "loss": 0.8893, "step": 1000 }, { "epoch": 0.15295870420598234, "grad_norm": 0.25611841678619385, "learning_rate": 0.00019305607965238117, "loss": 0.7812, "step": 1001 }, { "epoch": 0.15311151010429003, "grad_norm": 0.29595786333084106, "learning_rate": 0.00019303779264636295, "loss": 0.8537, "step": 1002 }, { "epoch": 0.1532643160025977, "grad_norm": 0.26146572828292847, "learning_rate": 0.00019301948246075966, "loss": 0.6906, "step": 1003 }, { "epoch": 0.15341712190090537, "grad_norm": 0.23449784517288208, "learning_rate": 0.00019300114910013322, "loss": 0.804, "step": 1004 }, { "epoch": 0.15356992779921305, "grad_norm": 0.2150595486164093, "learning_rate": 0.00019298279256905107, "loss": 0.6666, "step": 1005 }, { "epoch": 0.15372273369752074, "grad_norm": 0.28082481026649475, "learning_rate": 0.0001929644128720867, "loss": 0.6423, "step": 1006 }, { "epoch": 0.1538755395958284, "grad_norm": 0.2738368809223175, "learning_rate": 0.00019294601001381925, "loss": 0.685, "step": 1007 }, { "epoch": 0.15402834549413608, "grad_norm": 0.39818379282951355, "learning_rate": 0.0001929275839988336, "loss": 0.8979, "step": 1008 }, { "epoch": 0.15418115139244376, "grad_norm": 0.28069233894348145, "learning_rate": 0.00019290913483172045, "loss": 0.7443, "step": 1009 }, { "epoch": 0.15433395729075142, "grad_norm": 0.4211709797382355, "learning_rate": 0.00019289066251707625, "loss": 0.8838, "step": 1010 }, { "epoch": 0.1544867631890591, "grad_norm": 0.25366538763046265, "learning_rate": 0.00019287216705950324, "loss": 0.672, "step": 1011 }, { "epoch": 0.15463956908736678, "grad_norm": 0.2813873589038849, "learning_rate": 0.00019285364846360943, "loss": 0.9237, "step": 1012 }, { "epoch": 0.15479237498567444, "grad_norm": 0.30833756923675537, "learning_rate": 0.0001928351067340085, "loss": 0.876, "step": 1013 }, { "epoch": 0.15494518088398213, "grad_norm": 0.3030012547969818, "learning_rate": 0.00019281654187532, "loss": 0.6612, "step": 1014 }, { "epoch": 0.1550979867822898, "grad_norm": 0.28833889961242676, "learning_rate": 0.00019279795389216922, "loss": 0.8364, "step": 1015 }, { "epoch": 0.15525079268059747, "grad_norm": 0.2622121274471283, "learning_rate": 0.00019277934278918725, "loss": 0.6571, "step": 1016 }, { "epoch": 0.15540359857890515, "grad_norm": 0.30616286396980286, "learning_rate": 0.0001927607085710108, "loss": 0.5411, "step": 1017 }, { "epoch": 0.15555640447721283, "grad_norm": 0.28586992621421814, "learning_rate": 0.00019274205124228245, "loss": 0.608, "step": 1018 }, { "epoch": 0.1557092103755205, "grad_norm": 0.26837894320487976, "learning_rate": 0.00019272337080765057, "loss": 0.8362, "step": 1019 }, { "epoch": 0.15586201627382817, "grad_norm": 0.3665698170661926, "learning_rate": 0.00019270466727176917, "loss": 0.5847, "step": 1020 }, { "epoch": 0.15601482217213583, "grad_norm": 0.2480638474225998, "learning_rate": 0.0001926859406392981, "loss": 0.6732, "step": 1021 }, { "epoch": 0.15616762807044351, "grad_norm": 0.3244907259941101, "learning_rate": 0.00019266719091490296, "loss": 0.8776, "step": 1022 }, { "epoch": 0.1563204339687512, "grad_norm": 0.39362213015556335, "learning_rate": 0.00019264841810325508, "loss": 0.7167, "step": 1023 }, { "epoch": 0.15647323986705886, "grad_norm": 0.2980339229106903, "learning_rate": 0.00019262962220903152, "loss": 0.8809, "step": 1024 }, { "epoch": 0.15662604576536654, "grad_norm": 0.32592833042144775, "learning_rate": 0.00019261080323691517, "loss": 0.9097, "step": 1025 }, { "epoch": 0.15677885166367422, "grad_norm": 0.30861401557922363, "learning_rate": 0.00019259196119159454, "loss": 0.5337, "step": 1026 }, { "epoch": 0.15693165756198188, "grad_norm": 0.3252275884151459, "learning_rate": 0.00019257309607776407, "loss": 0.8202, "step": 1027 }, { "epoch": 0.15708446346028956, "grad_norm": 0.3161613345146179, "learning_rate": 0.00019255420790012377, "loss": 0.5353, "step": 1028 }, { "epoch": 0.15723726935859725, "grad_norm": 0.22845958173274994, "learning_rate": 0.00019253529666337952, "loss": 0.6994, "step": 1029 }, { "epoch": 0.1573900752569049, "grad_norm": 0.2573609948158264, "learning_rate": 0.00019251636237224283, "loss": 0.7671, "step": 1030 }, { "epoch": 0.1575428811552126, "grad_norm": 0.24271678924560547, "learning_rate": 0.0001924974050314311, "loss": 0.5922, "step": 1031 }, { "epoch": 0.15769568705352027, "grad_norm": 0.2551860213279724, "learning_rate": 0.00019247842464566734, "loss": 0.7007, "step": 1032 }, { "epoch": 0.15784849295182793, "grad_norm": 0.2626005709171295, "learning_rate": 0.00019245942121968036, "loss": 0.7403, "step": 1033 }, { "epoch": 0.1580012988501356, "grad_norm": 0.30448147654533386, "learning_rate": 0.0001924403947582047, "loss": 0.7081, "step": 1034 }, { "epoch": 0.1581541047484433, "grad_norm": 0.24635186791419983, "learning_rate": 0.00019242134526598067, "loss": 0.5654, "step": 1035 }, { "epoch": 0.15830691064675095, "grad_norm": 0.29201674461364746, "learning_rate": 0.00019240227274775425, "loss": 0.7182, "step": 1036 }, { "epoch": 0.15845971654505864, "grad_norm": 0.46213141083717346, "learning_rate": 0.00019238317720827729, "loss": 0.8169, "step": 1037 }, { "epoch": 0.15861252244336632, "grad_norm": 0.2620154321193695, "learning_rate": 0.00019236405865230712, "loss": 0.9387, "step": 1038 }, { "epoch": 0.15876532834167398, "grad_norm": 0.26951172947883606, "learning_rate": 0.00019234491708460712, "loss": 0.511, "step": 1039 }, { "epoch": 0.15891813423998166, "grad_norm": 0.22812886536121368, "learning_rate": 0.0001923257525099462, "loss": 0.7245, "step": 1040 }, { "epoch": 0.15907094013828935, "grad_norm": 0.27627134323120117, "learning_rate": 0.00019230656493309902, "loss": 0.5724, "step": 1041 }, { "epoch": 0.159223746036597, "grad_norm": 0.26270973682403564, "learning_rate": 0.00019228735435884606, "loss": 0.6993, "step": 1042 }, { "epoch": 0.1593765519349047, "grad_norm": 0.27464500069618225, "learning_rate": 0.0001922681207919734, "loss": 0.8781, "step": 1043 }, { "epoch": 0.15952935783321237, "grad_norm": 0.3051292300224304, "learning_rate": 0.000192248864237273, "loss": 0.7656, "step": 1044 }, { "epoch": 0.15968216373152003, "grad_norm": 0.2516727149486542, "learning_rate": 0.00019222958469954242, "loss": 0.8011, "step": 1045 }, { "epoch": 0.1598349696298277, "grad_norm": 0.2554173767566681, "learning_rate": 0.00019221028218358504, "loss": 0.6839, "step": 1046 }, { "epoch": 0.1599877755281354, "grad_norm": 0.23768573999404907, "learning_rate": 0.00019219095669420984, "loss": 0.6611, "step": 1047 }, { "epoch": 0.16014058142644305, "grad_norm": 0.28291550278663635, "learning_rate": 0.00019217160823623169, "loss": 0.7191, "step": 1048 }, { "epoch": 0.16029338732475074, "grad_norm": 0.3698108494281769, "learning_rate": 0.00019215223681447104, "loss": 0.7449, "step": 1049 }, { "epoch": 0.16044619322305842, "grad_norm": 0.24757391214370728, "learning_rate": 0.00019213284243375415, "loss": 0.676, "step": 1050 }, { "epoch": 0.16059899912136608, "grad_norm": 0.25702232122421265, "learning_rate": 0.00019211342509891293, "loss": 0.7301, "step": 1051 }, { "epoch": 0.16075180501967376, "grad_norm": 0.37221047282218933, "learning_rate": 0.0001920939848147851, "loss": 0.667, "step": 1052 }, { "epoch": 0.16090461091798144, "grad_norm": 0.5544690489768982, "learning_rate": 0.000192074521586214, "loss": 0.8024, "step": 1053 }, { "epoch": 0.1610574168162891, "grad_norm": 0.2715505361557007, "learning_rate": 0.00019205503541804873, "loss": 0.6859, "step": 1054 }, { "epoch": 0.16121022271459678, "grad_norm": 0.2992199957370758, "learning_rate": 0.00019203552631514415, "loss": 0.8794, "step": 1055 }, { "epoch": 0.16136302861290447, "grad_norm": 0.2541504502296448, "learning_rate": 0.00019201599428236073, "loss": 0.6467, "step": 1056 }, { "epoch": 0.16151583451121213, "grad_norm": 0.27539893984794617, "learning_rate": 0.00019199643932456476, "loss": 0.6035, "step": 1057 }, { "epoch": 0.1616686404095198, "grad_norm": 0.261419415473938, "learning_rate": 0.00019197686144662815, "loss": 0.7197, "step": 1058 }, { "epoch": 0.1618214463078275, "grad_norm": 0.2520885765552521, "learning_rate": 0.00019195726065342856, "loss": 0.5276, "step": 1059 }, { "epoch": 0.16197425220613515, "grad_norm": 0.29256707429885864, "learning_rate": 0.00019193763694984943, "loss": 0.6546, "step": 1060 }, { "epoch": 0.16212705810444283, "grad_norm": 0.5188539624214172, "learning_rate": 0.00019191799034077981, "loss": 0.759, "step": 1061 }, { "epoch": 0.16227986400275052, "grad_norm": 0.23571883141994476, "learning_rate": 0.00019189832083111444, "loss": 0.6998, "step": 1062 }, { "epoch": 0.16243266990105817, "grad_norm": 0.24244998395442963, "learning_rate": 0.00019187862842575388, "loss": 0.6818, "step": 1063 }, { "epoch": 0.16258547579936586, "grad_norm": 0.29776662588119507, "learning_rate": 0.0001918589131296043, "loss": 0.6825, "step": 1064 }, { "epoch": 0.16273828169767354, "grad_norm": 0.29446274042129517, "learning_rate": 0.0001918391749475776, "loss": 0.7367, "step": 1065 }, { "epoch": 0.1628910875959812, "grad_norm": 0.27736350893974304, "learning_rate": 0.00019181941388459137, "loss": 0.7743, "step": 1066 }, { "epoch": 0.16304389349428888, "grad_norm": 0.2596782147884369, "learning_rate": 0.00019179962994556892, "loss": 0.6474, "step": 1067 }, { "epoch": 0.16319669939259657, "grad_norm": 0.2921583652496338, "learning_rate": 0.0001917798231354393, "loss": 0.7613, "step": 1068 }, { "epoch": 0.16334950529090422, "grad_norm": 0.24355407059192657, "learning_rate": 0.00019175999345913712, "loss": 0.5877, "step": 1069 }, { "epoch": 0.1635023111892119, "grad_norm": 0.2529122531414032, "learning_rate": 0.00019174014092160287, "loss": 0.7902, "step": 1070 }, { "epoch": 0.1636551170875196, "grad_norm": 0.3269736170768738, "learning_rate": 0.00019172026552778256, "loss": 0.8058, "step": 1071 }, { "epoch": 0.16380792298582725, "grad_norm": 0.2811448574066162, "learning_rate": 0.00019170036728262803, "loss": 1.0175, "step": 1072 }, { "epoch": 0.16396072888413493, "grad_norm": 0.23702581226825714, "learning_rate": 0.00019168044619109672, "loss": 0.5767, "step": 1073 }, { "epoch": 0.16411353478244262, "grad_norm": 0.36616837978363037, "learning_rate": 0.00019166050225815186, "loss": 0.7306, "step": 1074 }, { "epoch": 0.16426634068075027, "grad_norm": 0.24824507534503937, "learning_rate": 0.00019164053548876227, "loss": 0.6099, "step": 1075 }, { "epoch": 0.16441914657905796, "grad_norm": 0.24607868492603302, "learning_rate": 0.00019162054588790252, "loss": 0.7479, "step": 1076 }, { "epoch": 0.1645719524773656, "grad_norm": 0.2548847794532776, "learning_rate": 0.00019160053346055285, "loss": 0.6783, "step": 1077 }, { "epoch": 0.1647247583756733, "grad_norm": 0.3230314254760742, "learning_rate": 0.0001915804982116992, "loss": 0.8464, "step": 1078 }, { "epoch": 0.16487756427398098, "grad_norm": 0.2413746565580368, "learning_rate": 0.00019156044014633316, "loss": 0.7222, "step": 1079 }, { "epoch": 0.16503037017228864, "grad_norm": 0.2753642797470093, "learning_rate": 0.00019154035926945202, "loss": 0.8344, "step": 1080 }, { "epoch": 0.16518317607059632, "grad_norm": 1.3304742574691772, "learning_rate": 0.0001915202555860588, "loss": 0.715, "step": 1081 }, { "epoch": 0.165335981968904, "grad_norm": 0.3043583035469055, "learning_rate": 0.00019150012910116213, "loss": 0.6851, "step": 1082 }, { "epoch": 0.16548878786721166, "grad_norm": 0.252945214509964, "learning_rate": 0.00019147997981977638, "loss": 0.8384, "step": 1083 }, { "epoch": 0.16564159376551935, "grad_norm": 0.2606157064437866, "learning_rate": 0.00019145980774692157, "loss": 0.7957, "step": 1084 }, { "epoch": 0.16579439966382703, "grad_norm": 0.3107841908931732, "learning_rate": 0.00019143961288762336, "loss": 0.7824, "step": 1085 }, { "epoch": 0.1659472055621347, "grad_norm": 0.35161152482032776, "learning_rate": 0.0001914193952469132, "loss": 0.7518, "step": 1086 }, { "epoch": 0.16610001146044237, "grad_norm": 0.30095744132995605, "learning_rate": 0.0001913991548298281, "loss": 0.5003, "step": 1087 }, { "epoch": 0.16625281735875005, "grad_norm": 0.27559372782707214, "learning_rate": 0.0001913788916414108, "loss": 0.742, "step": 1088 }, { "epoch": 0.1664056232570577, "grad_norm": 0.2867778241634369, "learning_rate": 0.00019135860568670972, "loss": 0.7433, "step": 1089 }, { "epoch": 0.1665584291553654, "grad_norm": 0.30389800667762756, "learning_rate": 0.0001913382969707789, "loss": 0.7928, "step": 1090 }, { "epoch": 0.16671123505367308, "grad_norm": 0.2673511803150177, "learning_rate": 0.00019131796549867812, "loss": 0.7581, "step": 1091 }, { "epoch": 0.16686404095198074, "grad_norm": 0.3299412131309509, "learning_rate": 0.00019129761127547275, "loss": 0.7698, "step": 1092 }, { "epoch": 0.16701684685028842, "grad_norm": 0.33078551292419434, "learning_rate": 0.00019127723430623395, "loss": 0.6046, "step": 1093 }, { "epoch": 0.1671696527485961, "grad_norm": 0.28574293851852417, "learning_rate": 0.00019125683459603838, "loss": 0.5757, "step": 1094 }, { "epoch": 0.16732245864690376, "grad_norm": 0.32351842522621155, "learning_rate": 0.00019123641214996852, "loss": 0.5831, "step": 1095 }, { "epoch": 0.16747526454521144, "grad_norm": 0.2723073363304138, "learning_rate": 0.00019121596697311245, "loss": 0.8194, "step": 1096 }, { "epoch": 0.16762807044351913, "grad_norm": 0.32978907227516174, "learning_rate": 0.00019119549907056392, "loss": 0.6952, "step": 1097 }, { "epoch": 0.16778087634182678, "grad_norm": 0.30837321281433105, "learning_rate": 0.00019117500844742223, "loss": 0.7523, "step": 1098 }, { "epoch": 0.16793368224013447, "grad_norm": 0.24898523092269897, "learning_rate": 0.0001911544951087926, "loss": 0.7134, "step": 1099 }, { "epoch": 0.16808648813844215, "grad_norm": 0.27018532156944275, "learning_rate": 0.00019113395905978568, "loss": 0.613, "step": 1100 }, { "epoch": 0.1682392940367498, "grad_norm": 0.3074743151664734, "learning_rate": 0.00019111340030551784, "loss": 0.7982, "step": 1101 }, { "epoch": 0.1683920999350575, "grad_norm": 0.29580435156822205, "learning_rate": 0.00019109281885111115, "loss": 0.7358, "step": 1102 }, { "epoch": 0.16854490583336518, "grad_norm": 0.48277348279953003, "learning_rate": 0.00019107221470169333, "loss": 0.6511, "step": 1103 }, { "epoch": 0.16869771173167283, "grad_norm": 0.26997652649879456, "learning_rate": 0.00019105158786239765, "loss": 0.7542, "step": 1104 }, { "epoch": 0.16885051762998052, "grad_norm": 0.3362867832183838, "learning_rate": 0.0001910309383383632, "loss": 0.7393, "step": 1105 }, { "epoch": 0.1690033235282882, "grad_norm": 0.27180519700050354, "learning_rate": 0.00019101026613473456, "loss": 0.6968, "step": 1106 }, { "epoch": 0.16915612942659586, "grad_norm": 0.3135218918323517, "learning_rate": 0.00019098957125666212, "loss": 0.6301, "step": 1107 }, { "epoch": 0.16930893532490354, "grad_norm": 0.2497778683900833, "learning_rate": 0.00019096885370930173, "loss": 0.6232, "step": 1108 }, { "epoch": 0.16946174122321123, "grad_norm": 0.2656903862953186, "learning_rate": 0.0001909481134978151, "loss": 0.8126, "step": 1109 }, { "epoch": 0.16961454712151888, "grad_norm": 0.28536882996559143, "learning_rate": 0.00019092735062736945, "loss": 0.8282, "step": 1110 }, { "epoch": 0.16976735301982657, "grad_norm": 0.27915704250335693, "learning_rate": 0.00019090656510313762, "loss": 0.5578, "step": 1111 }, { "epoch": 0.16992015891813425, "grad_norm": 0.9698283672332764, "learning_rate": 0.00019088575693029818, "loss": 0.5033, "step": 1112 }, { "epoch": 0.1700729648164419, "grad_norm": 0.2613937258720398, "learning_rate": 0.00019086492611403535, "loss": 0.6993, "step": 1113 }, { "epoch": 0.1702257707147496, "grad_norm": 0.35720980167388916, "learning_rate": 0.00019084407265953889, "loss": 0.7705, "step": 1114 }, { "epoch": 0.17037857661305728, "grad_norm": 0.3005627691745758, "learning_rate": 0.0001908231965720043, "loss": 0.7495, "step": 1115 }, { "epoch": 0.17053138251136493, "grad_norm": 0.3507259488105774, "learning_rate": 0.00019080229785663268, "loss": 0.5559, "step": 1116 }, { "epoch": 0.17068418840967262, "grad_norm": 0.31441208720207214, "learning_rate": 0.00019078137651863078, "loss": 0.794, "step": 1117 }, { "epoch": 0.1708369943079803, "grad_norm": 0.2835392355918884, "learning_rate": 0.00019076043256321094, "loss": 0.6644, "step": 1118 }, { "epoch": 0.17098980020628796, "grad_norm": 0.2525550127029419, "learning_rate": 0.00019073946599559123, "loss": 0.7448, "step": 1119 }, { "epoch": 0.17114260610459564, "grad_norm": 0.3684603273868561, "learning_rate": 0.00019071847682099522, "loss": 0.8866, "step": 1120 }, { "epoch": 0.17129541200290332, "grad_norm": 0.350276917219162, "learning_rate": 0.00019069746504465224, "loss": 0.6864, "step": 1121 }, { "epoch": 0.17144821790121098, "grad_norm": 0.2641281187534332, "learning_rate": 0.00019067643067179714, "loss": 0.712, "step": 1122 }, { "epoch": 0.17160102379951866, "grad_norm": 0.28185218572616577, "learning_rate": 0.00019065537370767055, "loss": 0.6921, "step": 1123 }, { "epoch": 0.17175382969782635, "grad_norm": 0.2548218071460724, "learning_rate": 0.00019063429415751857, "loss": 0.7298, "step": 1124 }, { "epoch": 0.171906635596134, "grad_norm": 0.2929299473762512, "learning_rate": 0.000190613192026593, "loss": 0.6102, "step": 1125 }, { "epoch": 0.1720594414944417, "grad_norm": 0.22588087618350983, "learning_rate": 0.00019059206732015128, "loss": 0.6073, "step": 1126 }, { "epoch": 0.17221224739274937, "grad_norm": 0.2806350588798523, "learning_rate": 0.00019057092004345642, "loss": 0.7085, "step": 1127 }, { "epoch": 0.17236505329105703, "grad_norm": 0.2670913338661194, "learning_rate": 0.0001905497502017771, "loss": 0.6747, "step": 1128 }, { "epoch": 0.17251785918936471, "grad_norm": 0.2958940267562866, "learning_rate": 0.00019052855780038764, "loss": 0.5434, "step": 1129 }, { "epoch": 0.17267066508767237, "grad_norm": 0.46745023131370544, "learning_rate": 0.00019050734284456792, "loss": 0.7918, "step": 1130 }, { "epoch": 0.17282347098598005, "grad_norm": 0.24842768907546997, "learning_rate": 0.00019048610533960346, "loss": 0.7636, "step": 1131 }, { "epoch": 0.17297627688428774, "grad_norm": 0.23693160712718964, "learning_rate": 0.00019046484529078542, "loss": 0.5601, "step": 1132 }, { "epoch": 0.1731290827825954, "grad_norm": 0.27304303646087646, "learning_rate": 0.00019044356270341055, "loss": 0.6694, "step": 1133 }, { "epoch": 0.17328188868090308, "grad_norm": 0.28675514459609985, "learning_rate": 0.00019042225758278124, "loss": 0.7822, "step": 1134 }, { "epoch": 0.17343469457921076, "grad_norm": 0.32071536779403687, "learning_rate": 0.0001904009299342055, "loss": 0.6399, "step": 1135 }, { "epoch": 0.17358750047751842, "grad_norm": 0.2699134945869446, "learning_rate": 0.0001903795797629969, "loss": 0.699, "step": 1136 }, { "epoch": 0.1737403063758261, "grad_norm": 0.27211201190948486, "learning_rate": 0.00019035820707447468, "loss": 0.6925, "step": 1137 }, { "epoch": 0.1738931122741338, "grad_norm": 0.27725741267204285, "learning_rate": 0.00019033681187396364, "loss": 0.544, "step": 1138 }, { "epoch": 0.17404591817244144, "grad_norm": 0.3354361951351166, "learning_rate": 0.0001903153941667942, "loss": 0.65, "step": 1139 }, { "epoch": 0.17419872407074913, "grad_norm": 0.2714371383190155, "learning_rate": 0.0001902939539583025, "loss": 0.679, "step": 1140 }, { "epoch": 0.1743515299690568, "grad_norm": 0.33846041560173035, "learning_rate": 0.00019027249125383008, "loss": 0.7282, "step": 1141 }, { "epoch": 0.17450433586736447, "grad_norm": 0.29932504892349243, "learning_rate": 0.00019025100605872425, "loss": 0.7207, "step": 1142 }, { "epoch": 0.17465714176567215, "grad_norm": 0.26585763692855835, "learning_rate": 0.00019022949837833782, "loss": 0.5864, "step": 1143 }, { "epoch": 0.17480994766397984, "grad_norm": 0.2552662789821625, "learning_rate": 0.00019020796821802934, "loss": 0.6423, "step": 1144 }, { "epoch": 0.1749627535622875, "grad_norm": 0.333668977022171, "learning_rate": 0.00019018641558316276, "loss": 0.7273, "step": 1145 }, { "epoch": 0.17511555946059518, "grad_norm": 0.27263349294662476, "learning_rate": 0.0001901648404791078, "loss": 0.8035, "step": 1146 }, { "epoch": 0.17526836535890286, "grad_norm": 0.3458087146282196, "learning_rate": 0.00019014324291123966, "loss": 0.7291, "step": 1147 }, { "epoch": 0.17542117125721052, "grad_norm": 0.27026665210723877, "learning_rate": 0.00019012162288493926, "loss": 0.7192, "step": 1148 }, { "epoch": 0.1755739771555182, "grad_norm": 0.4639197587966919, "learning_rate": 0.00019009998040559305, "loss": 0.7784, "step": 1149 }, { "epoch": 0.17572678305382589, "grad_norm": 0.4161984324455261, "learning_rate": 0.000190078315478593, "loss": 0.6429, "step": 1150 }, { "epoch": 0.17587958895213354, "grad_norm": 0.33808472752571106, "learning_rate": 0.0001900566281093368, "loss": 0.8773, "step": 1151 }, { "epoch": 0.17603239485044123, "grad_norm": 0.3738580346107483, "learning_rate": 0.00019003491830322768, "loss": 0.7163, "step": 1152 }, { "epoch": 0.1761852007487489, "grad_norm": 0.2702450752258301, "learning_rate": 0.00019001318606567442, "loss": 0.7637, "step": 1153 }, { "epoch": 0.17633800664705657, "grad_norm": 0.2791532278060913, "learning_rate": 0.00018999143140209146, "loss": 0.7335, "step": 1154 }, { "epoch": 0.17649081254536425, "grad_norm": 0.29353049397468567, "learning_rate": 0.00018996965431789878, "loss": 0.7129, "step": 1155 }, { "epoch": 0.17664361844367193, "grad_norm": 0.2993602752685547, "learning_rate": 0.00018994785481852192, "loss": 1.0111, "step": 1156 }, { "epoch": 0.1767964243419796, "grad_norm": 0.24442961812019348, "learning_rate": 0.0001899260329093921, "loss": 0.7211, "step": 1157 }, { "epoch": 0.17694923024028728, "grad_norm": 0.3937727212905884, "learning_rate": 0.00018990418859594606, "loss": 0.8264, "step": 1158 }, { "epoch": 0.17710203613859496, "grad_norm": 0.34876108169555664, "learning_rate": 0.00018988232188362609, "loss": 0.6769, "step": 1159 }, { "epoch": 0.17725484203690262, "grad_norm": 0.26349112391471863, "learning_rate": 0.00018986043277788013, "loss": 0.6767, "step": 1160 }, { "epoch": 0.1774076479352103, "grad_norm": 0.30671951174736023, "learning_rate": 0.00018983852128416162, "loss": 0.6329, "step": 1161 }, { "epoch": 0.17756045383351798, "grad_norm": 0.30573394894599915, "learning_rate": 0.00018981658740792968, "loss": 0.7471, "step": 1162 }, { "epoch": 0.17771325973182564, "grad_norm": 0.30444616079330444, "learning_rate": 0.00018979463115464894, "loss": 0.6539, "step": 1163 }, { "epoch": 0.17786606563013332, "grad_norm": 0.26366716623306274, "learning_rate": 0.00018977265252978959, "loss": 0.7571, "step": 1164 }, { "epoch": 0.178018871528441, "grad_norm": 0.3460015058517456, "learning_rate": 0.00018975065153882745, "loss": 0.8319, "step": 1165 }, { "epoch": 0.17817167742674866, "grad_norm": 0.2514568269252777, "learning_rate": 0.00018972862818724385, "loss": 0.879, "step": 1166 }, { "epoch": 0.17832448332505635, "grad_norm": 0.30631664395332336, "learning_rate": 0.00018970658248052574, "loss": 0.7929, "step": 1167 }, { "epoch": 0.17847728922336403, "grad_norm": 0.3222149908542633, "learning_rate": 0.00018968451442416564, "loss": 0.698, "step": 1168 }, { "epoch": 0.1786300951216717, "grad_norm": 0.26225098967552185, "learning_rate": 0.00018966242402366162, "loss": 0.7549, "step": 1169 }, { "epoch": 0.17878290101997937, "grad_norm": 0.4013647139072418, "learning_rate": 0.00018964031128451727, "loss": 0.6848, "step": 1170 }, { "epoch": 0.17893570691828706, "grad_norm": 0.36838847398757935, "learning_rate": 0.00018961817621224186, "loss": 0.8063, "step": 1171 }, { "epoch": 0.1790885128165947, "grad_norm": 0.2401532381772995, "learning_rate": 0.00018959601881235008, "loss": 0.5936, "step": 1172 }, { "epoch": 0.1792413187149024, "grad_norm": 0.24672825634479523, "learning_rate": 0.00018957383909036233, "loss": 0.6094, "step": 1173 }, { "epoch": 0.17939412461321008, "grad_norm": 0.31857630610466003, "learning_rate": 0.00018955163705180444, "loss": 0.7282, "step": 1174 }, { "epoch": 0.17954693051151774, "grad_norm": 0.2470935434103012, "learning_rate": 0.00018952941270220793, "loss": 0.7646, "step": 1175 }, { "epoch": 0.17969973640982542, "grad_norm": 0.24539603292942047, "learning_rate": 0.00018950716604710982, "loss": 0.7425, "step": 1176 }, { "epoch": 0.1798525423081331, "grad_norm": 0.2655848562717438, "learning_rate": 0.00018948489709205254, "loss": 0.6178, "step": 1177 }, { "epoch": 0.18000534820644076, "grad_norm": 0.27233999967575073, "learning_rate": 0.00018946260584258438, "loss": 0.7679, "step": 1178 }, { "epoch": 0.18015815410474845, "grad_norm": 0.24756716191768646, "learning_rate": 0.0001894402923042589, "loss": 0.6063, "step": 1179 }, { "epoch": 0.18031096000305613, "grad_norm": 0.24609854817390442, "learning_rate": 0.0001894179564826354, "loss": 0.8233, "step": 1180 }, { "epoch": 0.1804637659013638, "grad_norm": 0.2573990523815155, "learning_rate": 0.00018939559838327866, "loss": 0.6456, "step": 1181 }, { "epoch": 0.18061657179967147, "grad_norm": 0.27310270071029663, "learning_rate": 0.00018937321801175896, "loss": 0.8405, "step": 1182 }, { "epoch": 0.18076937769797916, "grad_norm": 0.3258965313434601, "learning_rate": 0.0001893508153736522, "loss": 0.8264, "step": 1183 }, { "epoch": 0.1809221835962868, "grad_norm": 0.246231347322464, "learning_rate": 0.00018932839047453986, "loss": 0.892, "step": 1184 }, { "epoch": 0.1810749894945945, "grad_norm": 0.43690553307533264, "learning_rate": 0.00018930594332000885, "loss": 0.649, "step": 1185 }, { "epoch": 0.18122779539290215, "grad_norm": 0.25682827830314636, "learning_rate": 0.00018928347391565173, "loss": 0.7664, "step": 1186 }, { "epoch": 0.18138060129120984, "grad_norm": 0.3272826075553894, "learning_rate": 0.00018926098226706655, "loss": 0.7886, "step": 1187 }, { "epoch": 0.18153340718951752, "grad_norm": 0.2642538845539093, "learning_rate": 0.00018923846837985692, "loss": 0.8355, "step": 1188 }, { "epoch": 0.18168621308782518, "grad_norm": 0.2583806812763214, "learning_rate": 0.000189215932259632, "loss": 0.6944, "step": 1189 }, { "epoch": 0.18183901898613286, "grad_norm": 0.3627317249774933, "learning_rate": 0.00018919337391200644, "loss": 0.625, "step": 1190 }, { "epoch": 0.18199182488444055, "grad_norm": 0.2754598557949066, "learning_rate": 0.00018917079334260044, "loss": 0.7383, "step": 1191 }, { "epoch": 0.1821446307827482, "grad_norm": 0.2284909188747406, "learning_rate": 0.00018914819055703986, "loss": 0.68, "step": 1192 }, { "epoch": 0.18229743668105589, "grad_norm": 0.22947415709495544, "learning_rate": 0.0001891255655609559, "loss": 0.5471, "step": 1193 }, { "epoch": 0.18245024257936357, "grad_norm": 0.32317447662353516, "learning_rate": 0.0001891029183599854, "loss": 0.8991, "step": 1194 }, { "epoch": 0.18260304847767123, "grad_norm": 0.24611344933509827, "learning_rate": 0.0001890802489597708, "loss": 0.8077, "step": 1195 }, { "epoch": 0.1827558543759789, "grad_norm": 0.27162209153175354, "learning_rate": 0.0001890575573659599, "loss": 0.8446, "step": 1196 }, { "epoch": 0.1829086602742866, "grad_norm": 0.2550401985645294, "learning_rate": 0.00018903484358420616, "loss": 0.5734, "step": 1197 }, { "epoch": 0.18306146617259425, "grad_norm": 0.2913491427898407, "learning_rate": 0.0001890121076201685, "loss": 0.6703, "step": 1198 }, { "epoch": 0.18321427207090193, "grad_norm": 0.2754542827606201, "learning_rate": 0.00018898934947951147, "loss": 0.7495, "step": 1199 }, { "epoch": 0.18336707796920962, "grad_norm": 0.2145329713821411, "learning_rate": 0.00018896656916790497, "loss": 0.6425, "step": 1200 }, { "epoch": 0.18351988386751728, "grad_norm": 0.2763057053089142, "learning_rate": 0.0001889437666910246, "loss": 0.7225, "step": 1201 }, { "epoch": 0.18367268976582496, "grad_norm": 0.3310900926589966, "learning_rate": 0.00018892094205455134, "loss": 0.5517, "step": 1202 }, { "epoch": 0.18382549566413264, "grad_norm": 0.2717922031879425, "learning_rate": 0.0001888980952641718, "loss": 0.7338, "step": 1203 }, { "epoch": 0.1839783015624403, "grad_norm": 0.2514691650867462, "learning_rate": 0.00018887522632557807, "loss": 0.5614, "step": 1204 }, { "epoch": 0.18413110746074798, "grad_norm": 0.24077638983726501, "learning_rate": 0.00018885233524446773, "loss": 0.6412, "step": 1205 }, { "epoch": 0.18428391335905567, "grad_norm": 0.2869153320789337, "learning_rate": 0.00018882942202654392, "loss": 0.6988, "step": 1206 }, { "epoch": 0.18443671925736332, "grad_norm": 0.2389439344406128, "learning_rate": 0.00018880648667751526, "loss": 0.6581, "step": 1207 }, { "epoch": 0.184589525155671, "grad_norm": 0.2723163962364197, "learning_rate": 0.00018878352920309593, "loss": 0.942, "step": 1208 }, { "epoch": 0.1847423310539787, "grad_norm": 0.26030030846595764, "learning_rate": 0.00018876054960900555, "loss": 0.6953, "step": 1209 }, { "epoch": 0.18489513695228635, "grad_norm": 0.3128332793712616, "learning_rate": 0.00018873754790096932, "loss": 0.6775, "step": 1210 }, { "epoch": 0.18504794285059403, "grad_norm": 0.29483985900878906, "learning_rate": 0.0001887145240847179, "loss": 0.7367, "step": 1211 }, { "epoch": 0.18520074874890172, "grad_norm": 0.30954962968826294, "learning_rate": 0.00018869147816598752, "loss": 0.6747, "step": 1212 }, { "epoch": 0.18535355464720937, "grad_norm": 0.2984929084777832, "learning_rate": 0.00018866841015051985, "loss": 0.847, "step": 1213 }, { "epoch": 0.18550636054551706, "grad_norm": 0.26335060596466064, "learning_rate": 0.00018864532004406206, "loss": 0.6406, "step": 1214 }, { "epoch": 0.18565916644382474, "grad_norm": 0.3111365735530853, "learning_rate": 0.0001886222078523669, "loss": 0.8221, "step": 1215 }, { "epoch": 0.1858119723421324, "grad_norm": 0.2846592664718628, "learning_rate": 0.00018859907358119259, "loss": 0.6764, "step": 1216 }, { "epoch": 0.18596477824044008, "grad_norm": 0.3108222186565399, "learning_rate": 0.00018857591723630282, "loss": 0.6278, "step": 1217 }, { "epoch": 0.18611758413874777, "grad_norm": 0.27452194690704346, "learning_rate": 0.0001885527388234668, "loss": 0.6661, "step": 1218 }, { "epoch": 0.18627039003705542, "grad_norm": 0.3490808308124542, "learning_rate": 0.00018852953834845923, "loss": 0.7153, "step": 1219 }, { "epoch": 0.1864231959353631, "grad_norm": 0.4313880503177643, "learning_rate": 0.00018850631581706032, "loss": 0.5908, "step": 1220 }, { "epoch": 0.1865760018336708, "grad_norm": 0.2889242470264435, "learning_rate": 0.00018848307123505578, "loss": 0.796, "step": 1221 }, { "epoch": 0.18672880773197845, "grad_norm": 0.37442779541015625, "learning_rate": 0.00018845980460823676, "loss": 0.6915, "step": 1222 }, { "epoch": 0.18688161363028613, "grad_norm": 0.3386521339416504, "learning_rate": 0.00018843651594239997, "loss": 0.8258, "step": 1223 }, { "epoch": 0.18703441952859381, "grad_norm": 0.29641178250312805, "learning_rate": 0.0001884132052433476, "loss": 0.6895, "step": 1224 }, { "epoch": 0.18718722542690147, "grad_norm": 0.29515600204467773, "learning_rate": 0.00018838987251688734, "loss": 0.6559, "step": 1225 }, { "epoch": 0.18734003132520916, "grad_norm": 0.2754113972187042, "learning_rate": 0.0001883665177688323, "loss": 0.9137, "step": 1226 }, { "epoch": 0.18749283722351684, "grad_norm": 0.32451876997947693, "learning_rate": 0.0001883431410050011, "loss": 0.5024, "step": 1227 }, { "epoch": 0.1876456431218245, "grad_norm": 0.2551485300064087, "learning_rate": 0.00018831974223121792, "loss": 0.7639, "step": 1228 }, { "epoch": 0.18779844902013218, "grad_norm": 0.2415483593940735, "learning_rate": 0.0001882963214533123, "loss": 0.5037, "step": 1229 }, { "epoch": 0.18795125491843986, "grad_norm": 0.2431052029132843, "learning_rate": 0.00018827287867711942, "loss": 0.7216, "step": 1230 }, { "epoch": 0.18810406081674752, "grad_norm": 0.27758708596229553, "learning_rate": 0.00018824941390847976, "loss": 0.7804, "step": 1231 }, { "epoch": 0.1882568667150552, "grad_norm": 0.3011445105075836, "learning_rate": 0.00018822592715323944, "loss": 0.7279, "step": 1232 }, { "epoch": 0.1884096726133629, "grad_norm": 0.36954644322395325, "learning_rate": 0.00018820241841724996, "loss": 0.7755, "step": 1233 }, { "epoch": 0.18856247851167054, "grad_norm": 0.29674795269966125, "learning_rate": 0.0001881788877063683, "loss": 0.7926, "step": 1234 }, { "epoch": 0.18871528440997823, "grad_norm": 0.29829445481300354, "learning_rate": 0.00018815533502645698, "loss": 0.7176, "step": 1235 }, { "epoch": 0.1888680903082859, "grad_norm": 0.29634547233581543, "learning_rate": 0.00018813176038338393, "loss": 0.5793, "step": 1236 }, { "epoch": 0.18902089620659357, "grad_norm": 0.2551177442073822, "learning_rate": 0.00018810816378302258, "loss": 0.8047, "step": 1237 }, { "epoch": 0.18917370210490125, "grad_norm": 0.3007522225379944, "learning_rate": 0.00018808454523125184, "loss": 0.646, "step": 1238 }, { "epoch": 0.18932650800320894, "grad_norm": 0.32498404383659363, "learning_rate": 0.00018806090473395603, "loss": 0.9898, "step": 1239 }, { "epoch": 0.1894793139015166, "grad_norm": 0.28269198536872864, "learning_rate": 0.00018803724229702503, "loss": 0.6897, "step": 1240 }, { "epoch": 0.18963211979982428, "grad_norm": 0.4503588080406189, "learning_rate": 0.00018801355792635413, "loss": 0.5233, "step": 1241 }, { "epoch": 0.18978492569813193, "grad_norm": 0.29055777192115784, "learning_rate": 0.00018798985162784404, "loss": 0.7211, "step": 1242 }, { "epoch": 0.18993773159643962, "grad_norm": 0.27893009781837463, "learning_rate": 0.00018796612340740105, "loss": 0.7643, "step": 1243 }, { "epoch": 0.1900905374947473, "grad_norm": 0.335261732339859, "learning_rate": 0.00018794237327093684, "loss": 0.6786, "step": 1244 }, { "epoch": 0.19024334339305496, "grad_norm": 0.5441724061965942, "learning_rate": 0.0001879186012243685, "loss": 0.6558, "step": 1245 }, { "epoch": 0.19039614929136264, "grad_norm": 0.25137859582901, "learning_rate": 0.00018789480727361872, "loss": 0.6788, "step": 1246 }, { "epoch": 0.19054895518967033, "grad_norm": 0.293194055557251, "learning_rate": 0.00018787099142461547, "loss": 0.7672, "step": 1247 }, { "epoch": 0.19070176108797798, "grad_norm": 0.3383125066757202, "learning_rate": 0.00018784715368329235, "loss": 0.8097, "step": 1248 }, { "epoch": 0.19085456698628567, "grad_norm": 0.3129540979862213, "learning_rate": 0.0001878232940555883, "loss": 0.7752, "step": 1249 }, { "epoch": 0.19100737288459335, "grad_norm": 0.31098222732543945, "learning_rate": 0.00018779941254744772, "loss": 0.599, "step": 1250 }, { "epoch": 0.191160178782901, "grad_norm": 0.37664595246315, "learning_rate": 0.00018777550916482055, "loss": 0.8034, "step": 1251 }, { "epoch": 0.1913129846812087, "grad_norm": 0.3807818293571472, "learning_rate": 0.00018775158391366205, "loss": 0.6301, "step": 1252 }, { "epoch": 0.19146579057951638, "grad_norm": 0.2625497281551361, "learning_rate": 0.00018772763679993304, "loss": 0.6262, "step": 1253 }, { "epoch": 0.19161859647782403, "grad_norm": 0.43181419372558594, "learning_rate": 0.00018770366782959973, "loss": 0.6965, "step": 1254 }, { "epoch": 0.19177140237613172, "grad_norm": 0.25425031781196594, "learning_rate": 0.00018767967700863378, "loss": 0.7899, "step": 1255 }, { "epoch": 0.1919242082744394, "grad_norm": 0.28523769974708557, "learning_rate": 0.0001876556643430123, "loss": 0.6635, "step": 1256 }, { "epoch": 0.19207701417274706, "grad_norm": 0.2615564167499542, "learning_rate": 0.00018763162983871786, "loss": 0.5732, "step": 1257 }, { "epoch": 0.19222982007105474, "grad_norm": 0.2762017250061035, "learning_rate": 0.00018760757350173846, "loss": 0.6992, "step": 1258 }, { "epoch": 0.19238262596936243, "grad_norm": 0.3806002140045166, "learning_rate": 0.00018758349533806753, "loss": 0.7677, "step": 1259 }, { "epoch": 0.19253543186767008, "grad_norm": 0.5583063960075378, "learning_rate": 0.00018755939535370391, "loss": 0.6268, "step": 1260 }, { "epoch": 0.19268823776597777, "grad_norm": 0.3534693121910095, "learning_rate": 0.00018753527355465193, "loss": 0.8504, "step": 1261 }, { "epoch": 0.19284104366428545, "grad_norm": 0.26150697469711304, "learning_rate": 0.00018751112994692132, "loss": 0.7045, "step": 1262 }, { "epoch": 0.1929938495625931, "grad_norm": 0.28841933608055115, "learning_rate": 0.0001874869645365273, "loss": 0.8777, "step": 1263 }, { "epoch": 0.1931466554609008, "grad_norm": 0.32916703820228577, "learning_rate": 0.00018746277732949044, "loss": 0.7558, "step": 1264 }, { "epoch": 0.19329946135920847, "grad_norm": 0.24659676849842072, "learning_rate": 0.0001874385683318368, "loss": 0.6683, "step": 1265 }, { "epoch": 0.19345226725751613, "grad_norm": 0.21921052038669586, "learning_rate": 0.00018741433754959784, "loss": 0.6024, "step": 1266 }, { "epoch": 0.19360507315582381, "grad_norm": 0.3062233328819275, "learning_rate": 0.00018739008498881048, "loss": 0.5211, "step": 1267 }, { "epoch": 0.1937578790541315, "grad_norm": 0.414465993642807, "learning_rate": 0.000187365810655517, "loss": 0.6716, "step": 1268 }, { "epoch": 0.19391068495243916, "grad_norm": 0.41710537672042847, "learning_rate": 0.00018734151455576515, "loss": 0.7272, "step": 1269 }, { "epoch": 0.19406349085074684, "grad_norm": 0.29968956112861633, "learning_rate": 0.00018731719669560812, "loss": 0.8149, "step": 1270 }, { "epoch": 0.19421629674905452, "grad_norm": 0.6653236746788025, "learning_rate": 0.0001872928570811045, "loss": 0.5673, "step": 1271 }, { "epoch": 0.19436910264736218, "grad_norm": 0.24293015897274017, "learning_rate": 0.0001872684957183183, "loss": 0.6617, "step": 1272 }, { "epoch": 0.19452190854566986, "grad_norm": 0.2898862659931183, "learning_rate": 0.00018724411261331896, "loss": 0.8086, "step": 1273 }, { "epoch": 0.19467471444397755, "grad_norm": 0.26143643260002136, "learning_rate": 0.00018721970777218127, "loss": 0.6261, "step": 1274 }, { "epoch": 0.1948275203422852, "grad_norm": 0.3271556496620178, "learning_rate": 0.00018719528120098556, "loss": 0.7828, "step": 1275 }, { "epoch": 0.1949803262405929, "grad_norm": 0.2851318418979645, "learning_rate": 0.00018717083290581746, "loss": 0.6906, "step": 1276 }, { "epoch": 0.19513313213890057, "grad_norm": 0.26299694180488586, "learning_rate": 0.0001871463628927681, "loss": 0.7106, "step": 1277 }, { "epoch": 0.19528593803720823, "grad_norm": 0.4681147634983063, "learning_rate": 0.00018712187116793393, "loss": 0.7675, "step": 1278 }, { "epoch": 0.1954387439355159, "grad_norm": 0.2557898461818695, "learning_rate": 0.0001870973577374169, "loss": 0.7315, "step": 1279 }, { "epoch": 0.1955915498338236, "grad_norm": 0.31964412331581116, "learning_rate": 0.0001870728226073243, "loss": 0.8681, "step": 1280 }, { "epoch": 0.19574435573213125, "grad_norm": 0.25558051466941833, "learning_rate": 0.00018704826578376884, "loss": 0.7058, "step": 1281 }, { "epoch": 0.19589716163043894, "grad_norm": 0.28534409403800964, "learning_rate": 0.0001870236872728687, "loss": 0.7881, "step": 1282 }, { "epoch": 0.19604996752874662, "grad_norm": 0.24193304777145386, "learning_rate": 0.00018699908708074735, "loss": 0.7273, "step": 1283 }, { "epoch": 0.19620277342705428, "grad_norm": 0.28959253430366516, "learning_rate": 0.00018697446521353375, "loss": 0.6541, "step": 1284 }, { "epoch": 0.19635557932536196, "grad_norm": 0.263320654630661, "learning_rate": 0.00018694982167736222, "loss": 0.5601, "step": 1285 }, { "epoch": 0.19650838522366965, "grad_norm": 0.23733118176460266, "learning_rate": 0.0001869251564783725, "loss": 0.554, "step": 1286 }, { "epoch": 0.1966611911219773, "grad_norm": 0.2797900140285492, "learning_rate": 0.00018690046962270974, "loss": 0.9695, "step": 1287 }, { "epoch": 0.196813997020285, "grad_norm": 0.28238213062286377, "learning_rate": 0.00018687576111652438, "loss": 0.6728, "step": 1288 }, { "epoch": 0.19696680291859267, "grad_norm": 0.3334377408027649, "learning_rate": 0.00018685103096597244, "loss": 0.6607, "step": 1289 }, { "epoch": 0.19711960881690033, "grad_norm": 0.2756267488002777, "learning_rate": 0.00018682627917721516, "loss": 0.6685, "step": 1290 }, { "epoch": 0.197272414715208, "grad_norm": 0.25085243582725525, "learning_rate": 0.00018680150575641928, "loss": 0.6337, "step": 1291 }, { "epoch": 0.1974252206135157, "grad_norm": 0.2986142039299011, "learning_rate": 0.00018677671070975688, "loss": 0.6334, "step": 1292 }, { "epoch": 0.19757802651182335, "grad_norm": 0.35889819264411926, "learning_rate": 0.00018675189404340542, "loss": 0.6769, "step": 1293 }, { "epoch": 0.19773083241013104, "grad_norm": 0.35846251249313354, "learning_rate": 0.00018672705576354775, "loss": 0.7592, "step": 1294 }, { "epoch": 0.19788363830843872, "grad_norm": 0.2749708592891693, "learning_rate": 0.00018670219587637219, "loss": 0.6868, "step": 1295 }, { "epoch": 0.19803644420674638, "grad_norm": 0.31376180052757263, "learning_rate": 0.0001866773143880723, "loss": 0.5926, "step": 1296 }, { "epoch": 0.19818925010505406, "grad_norm": 0.400387167930603, "learning_rate": 0.00018665241130484713, "loss": 0.8536, "step": 1297 }, { "epoch": 0.19834205600336172, "grad_norm": 0.3091200590133667, "learning_rate": 0.00018662748663290105, "loss": 0.7177, "step": 1298 }, { "epoch": 0.1984948619016694, "grad_norm": 0.27559390664100647, "learning_rate": 0.00018660254037844388, "loss": 0.8545, "step": 1299 }, { "epoch": 0.19864766779997708, "grad_norm": 0.2838318943977356, "learning_rate": 0.00018657757254769074, "loss": 0.758, "step": 1300 }, { "epoch": 0.19880047369828474, "grad_norm": 0.2726922035217285, "learning_rate": 0.0001865525831468621, "loss": 0.549, "step": 1301 }, { "epoch": 0.19895327959659243, "grad_norm": 0.6867300271987915, "learning_rate": 0.00018652757218218396, "loss": 0.6198, "step": 1302 }, { "epoch": 0.1991060854949001, "grad_norm": 0.32437586784362793, "learning_rate": 0.0001865025396598875, "loss": 0.7343, "step": 1303 }, { "epoch": 0.19925889139320777, "grad_norm": 0.29952913522720337, "learning_rate": 0.00018647748558620942, "loss": 0.8007, "step": 1304 }, { "epoch": 0.19941169729151545, "grad_norm": 1.2307347059249878, "learning_rate": 0.00018645240996739175, "loss": 0.6912, "step": 1305 }, { "epoch": 0.19956450318982313, "grad_norm": 0.3030075430870056, "learning_rate": 0.00018642731280968185, "loss": 0.6624, "step": 1306 }, { "epoch": 0.1997173090881308, "grad_norm": 0.35436445474624634, "learning_rate": 0.0001864021941193324, "loss": 0.7682, "step": 1307 }, { "epoch": 0.19987011498643847, "grad_norm": 0.33362630009651184, "learning_rate": 0.00018637705390260161, "loss": 0.6417, "step": 1308 }, { "epoch": 0.20002292088474616, "grad_norm": 0.4144555628299713, "learning_rate": 0.00018635189216575291, "loss": 0.7121, "step": 1309 }, { "epoch": 0.20017572678305381, "grad_norm": 0.2980126738548279, "learning_rate": 0.0001863267089150551, "loss": 0.6802, "step": 1310 }, { "epoch": 0.2003285326813615, "grad_norm": 0.27214938402175903, "learning_rate": 0.00018630150415678242, "loss": 0.5862, "step": 1311 }, { "epoch": 0.20048133857966918, "grad_norm": 0.2389996200799942, "learning_rate": 0.00018627627789721444, "loss": 0.6268, "step": 1312 }, { "epoch": 0.20063414447797684, "grad_norm": 0.26987066864967346, "learning_rate": 0.00018625103014263602, "loss": 0.7125, "step": 1313 }, { "epoch": 0.20078695037628452, "grad_norm": 0.4237341582775116, "learning_rate": 0.0001862257608993375, "loss": 0.3945, "step": 1314 }, { "epoch": 0.2009397562745922, "grad_norm": 0.30507996678352356, "learning_rate": 0.00018620047017361442, "loss": 0.7114, "step": 1315 }, { "epoch": 0.20109256217289986, "grad_norm": 0.3909916281700134, "learning_rate": 0.00018617515797176776, "loss": 0.8767, "step": 1316 }, { "epoch": 0.20124536807120755, "grad_norm": 0.3162682056427002, "learning_rate": 0.00018614982430010388, "loss": 0.6625, "step": 1317 }, { "epoch": 0.20139817396951523, "grad_norm": 0.2585495412349701, "learning_rate": 0.00018612446916493444, "loss": 0.7066, "step": 1318 }, { "epoch": 0.2015509798678229, "grad_norm": 0.27862757444381714, "learning_rate": 0.00018609909257257648, "loss": 0.9383, "step": 1319 }, { "epoch": 0.20170378576613057, "grad_norm": 0.2943915128707886, "learning_rate": 0.00018607369452935233, "loss": 0.7859, "step": 1320 }, { "epoch": 0.20185659166443826, "grad_norm": 0.4653479754924774, "learning_rate": 0.00018604827504158967, "loss": 0.8381, "step": 1321 }, { "epoch": 0.2020093975627459, "grad_norm": 0.23425963521003723, "learning_rate": 0.00018602283411562164, "loss": 0.7697, "step": 1322 }, { "epoch": 0.2021622034610536, "grad_norm": 0.2639780640602112, "learning_rate": 0.0001859973717577866, "loss": 0.8266, "step": 1323 }, { "epoch": 0.20231500935936128, "grad_norm": 0.3585314154624939, "learning_rate": 0.00018597188797442823, "loss": 0.5168, "step": 1324 }, { "epoch": 0.20246781525766894, "grad_norm": 0.3173975646495819, "learning_rate": 0.00018594638277189568, "loss": 0.7392, "step": 1325 }, { "epoch": 0.20262062115597662, "grad_norm": 0.24657927453517914, "learning_rate": 0.0001859208561565433, "loss": 0.7558, "step": 1326 }, { "epoch": 0.2027734270542843, "grad_norm": 0.26230642199516296, "learning_rate": 0.0001858953081347308, "loss": 0.7076, "step": 1327 }, { "epoch": 0.20292623295259196, "grad_norm": 0.3020760416984558, "learning_rate": 0.00018586973871282338, "loss": 0.6357, "step": 1328 }, { "epoch": 0.20307903885089965, "grad_norm": 0.27209731936454773, "learning_rate": 0.00018584414789719132, "loss": 0.6761, "step": 1329 }, { "epoch": 0.20323184474920733, "grad_norm": 0.24238243699073792, "learning_rate": 0.00018581853569421043, "loss": 0.6273, "step": 1330 }, { "epoch": 0.203384650647515, "grad_norm": 0.3168526589870453, "learning_rate": 0.00018579290211026173, "loss": 0.556, "step": 1331 }, { "epoch": 0.20353745654582267, "grad_norm": 0.2814149856567383, "learning_rate": 0.00018576724715173168, "loss": 0.5308, "step": 1332 }, { "epoch": 0.20369026244413035, "grad_norm": 0.3175278604030609, "learning_rate": 0.00018574157082501194, "loss": 0.8015, "step": 1333 }, { "epoch": 0.203843068342438, "grad_norm": 0.32856446504592896, "learning_rate": 0.00018571587313649955, "loss": 0.7576, "step": 1334 }, { "epoch": 0.2039958742407457, "grad_norm": 0.3181629180908203, "learning_rate": 0.00018569015409259688, "loss": 0.7387, "step": 1335 }, { "epoch": 0.20414868013905338, "grad_norm": 0.2765921652317047, "learning_rate": 0.00018566441369971166, "loss": 0.7357, "step": 1336 }, { "epoch": 0.20430148603736104, "grad_norm": 0.30099403858184814, "learning_rate": 0.00018563865196425682, "loss": 0.6671, "step": 1337 }, { "epoch": 0.20445429193566872, "grad_norm": 0.37037232518196106, "learning_rate": 0.00018561286889265074, "loss": 0.6421, "step": 1338 }, { "epoch": 0.2046070978339764, "grad_norm": 0.32211172580718994, "learning_rate": 0.000185587064491317, "loss": 0.6952, "step": 1339 }, { "epoch": 0.20475990373228406, "grad_norm": 0.31535395979881287, "learning_rate": 0.00018556123876668459, "loss": 0.5887, "step": 1340 }, { "epoch": 0.20491270963059174, "grad_norm": 0.4243486523628235, "learning_rate": 0.00018553539172518776, "loss": 0.8713, "step": 1341 }, { "epoch": 0.20506551552889943, "grad_norm": 0.2893839478492737, "learning_rate": 0.00018550952337326607, "loss": 0.5753, "step": 1342 }, { "epoch": 0.20521832142720708, "grad_norm": 0.24352984130382538, "learning_rate": 0.00018548363371736449, "loss": 0.6823, "step": 1343 }, { "epoch": 0.20537112732551477, "grad_norm": 0.2798251509666443, "learning_rate": 0.00018545772276393308, "loss": 0.5801, "step": 1344 }, { "epoch": 0.20552393322382245, "grad_norm": 0.2867914140224457, "learning_rate": 0.0001854317905194274, "loss": 0.7108, "step": 1345 }, { "epoch": 0.2056767391221301, "grad_norm": 0.3005681335926056, "learning_rate": 0.00018540583699030826, "loss": 0.7227, "step": 1346 }, { "epoch": 0.2058295450204378, "grad_norm": 0.26915448904037476, "learning_rate": 0.00018537986218304176, "loss": 0.6557, "step": 1347 }, { "epoch": 0.20598235091874548, "grad_norm": 0.2573173940181732, "learning_rate": 0.00018535386610409927, "loss": 0.6926, "step": 1348 }, { "epoch": 0.20613515681705313, "grad_norm": 0.3156735897064209, "learning_rate": 0.00018532784875995755, "loss": 0.7268, "step": 1349 }, { "epoch": 0.20628796271536082, "grad_norm": 0.35365813970565796, "learning_rate": 0.00018530181015709855, "loss": 0.7369, "step": 1350 }, { "epoch": 0.20644076861366847, "grad_norm": 0.33551231026649475, "learning_rate": 0.0001852757503020096, "loss": 0.6259, "step": 1351 }, { "epoch": 0.20659357451197616, "grad_norm": 0.3139243423938751, "learning_rate": 0.0001852496692011833, "loss": 0.7496, "step": 1352 }, { "epoch": 0.20674638041028384, "grad_norm": 0.2617344260215759, "learning_rate": 0.00018522356686111752, "loss": 0.8014, "step": 1353 }, { "epoch": 0.2068991863085915, "grad_norm": 0.27416306734085083, "learning_rate": 0.00018519744328831543, "loss": 0.7364, "step": 1354 }, { "epoch": 0.20705199220689918, "grad_norm": 0.2772804796695709, "learning_rate": 0.00018517129848928554, "loss": 0.7281, "step": 1355 }, { "epoch": 0.20720479810520687, "grad_norm": 0.39369335770606995, "learning_rate": 0.00018514513247054154, "loss": 0.7729, "step": 1356 }, { "epoch": 0.20735760400351452, "grad_norm": 0.2790491282939911, "learning_rate": 0.00018511894523860254, "loss": 0.8568, "step": 1357 }, { "epoch": 0.2075104099018222, "grad_norm": 0.3145041763782501, "learning_rate": 0.00018509273679999283, "loss": 0.9169, "step": 1358 }, { "epoch": 0.2076632158001299, "grad_norm": 0.2785448729991913, "learning_rate": 0.00018506650716124207, "loss": 0.7077, "step": 1359 }, { "epoch": 0.20781602169843755, "grad_norm": 0.3012505769729614, "learning_rate": 0.0001850402563288851, "loss": 0.6312, "step": 1360 }, { "epoch": 0.20796882759674523, "grad_norm": 0.28249379992485046, "learning_rate": 0.00018501398430946207, "loss": 0.7125, "step": 1361 }, { "epoch": 0.20812163349505292, "grad_norm": 0.30596253275871277, "learning_rate": 0.00018498769110951855, "loss": 0.837, "step": 1362 }, { "epoch": 0.20827443939336057, "grad_norm": 0.2675941288471222, "learning_rate": 0.00018496137673560518, "loss": 0.7414, "step": 1363 }, { "epoch": 0.20842724529166826, "grad_norm": 0.248866006731987, "learning_rate": 0.00018493504119427795, "loss": 0.749, "step": 1364 }, { "epoch": 0.20858005118997594, "grad_norm": 0.2572340667247772, "learning_rate": 0.0001849086844920982, "loss": 0.5073, "step": 1365 }, { "epoch": 0.2087328570882836, "grad_norm": 0.2993871569633484, "learning_rate": 0.00018488230663563242, "loss": 0.6901, "step": 1366 }, { "epoch": 0.20888566298659128, "grad_norm": 0.2996583878993988, "learning_rate": 0.0001848559076314525, "loss": 1.0662, "step": 1367 }, { "epoch": 0.20903846888489896, "grad_norm": 0.2594098448753357, "learning_rate": 0.00018482948748613546, "loss": 0.7223, "step": 1368 }, { "epoch": 0.20919127478320662, "grad_norm": 0.2878977060317993, "learning_rate": 0.0001848030462062637, "loss": 0.7029, "step": 1369 }, { "epoch": 0.2093440806815143, "grad_norm": 0.23204508423805237, "learning_rate": 0.00018477658379842485, "loss": 0.7097, "step": 1370 }, { "epoch": 0.209496886579822, "grad_norm": 0.29869040846824646, "learning_rate": 0.0001847501002692118, "loss": 0.5587, "step": 1371 }, { "epoch": 0.20964969247812965, "grad_norm": 0.4763803780078888, "learning_rate": 0.00018472359562522267, "loss": 0.7924, "step": 1372 }, { "epoch": 0.20980249837643733, "grad_norm": 0.29070818424224854, "learning_rate": 0.00018469706987306087, "loss": 0.7127, "step": 1373 }, { "epoch": 0.209955304274745, "grad_norm": 0.4801795184612274, "learning_rate": 0.00018467052301933507, "loss": 0.7563, "step": 1374 }, { "epoch": 0.21010811017305267, "grad_norm": 0.2665102481842041, "learning_rate": 0.0001846439550706592, "loss": 0.6069, "step": 1375 }, { "epoch": 0.21026091607136035, "grad_norm": 0.4513528048992157, "learning_rate": 0.00018461736603365248, "loss": 0.5492, "step": 1376 }, { "epoch": 0.21041372196966804, "grad_norm": 0.23580753803253174, "learning_rate": 0.0001845907559149393, "loss": 0.7941, "step": 1377 }, { "epoch": 0.2105665278679757, "grad_norm": 0.29539886116981506, "learning_rate": 0.00018456412472114936, "loss": 0.6216, "step": 1378 }, { "epoch": 0.21071933376628338, "grad_norm": 0.25598660111427307, "learning_rate": 0.00018453747245891758, "loss": 0.7376, "step": 1379 }, { "epoch": 0.21087213966459106, "grad_norm": 0.30273643136024475, "learning_rate": 0.0001845107991348842, "loss": 0.6537, "step": 1380 }, { "epoch": 0.21102494556289872, "grad_norm": 0.28207799792289734, "learning_rate": 0.00018448410475569457, "loss": 0.7825, "step": 1381 }, { "epoch": 0.2111777514612064, "grad_norm": 0.42546963691711426, "learning_rate": 0.00018445738932799946, "loss": 0.8203, "step": 1382 }, { "epoch": 0.2113305573595141, "grad_norm": 0.3256038427352905, "learning_rate": 0.00018443065285845474, "loss": 0.7071, "step": 1383 }, { "epoch": 0.21148336325782174, "grad_norm": 0.2774294912815094, "learning_rate": 0.0001844038953537216, "loss": 0.6433, "step": 1384 }, { "epoch": 0.21163616915612943, "grad_norm": 0.3057693541049957, "learning_rate": 0.0001843771168204664, "loss": 0.6704, "step": 1385 }, { "epoch": 0.2117889750544371, "grad_norm": 0.2981327176094055, "learning_rate": 0.00018435031726536088, "loss": 0.7927, "step": 1386 }, { "epoch": 0.21194178095274477, "grad_norm": 0.3180636763572693, "learning_rate": 0.00018432349669508184, "loss": 0.6058, "step": 1387 }, { "epoch": 0.21209458685105245, "grad_norm": 0.34904152154922485, "learning_rate": 0.00018429665511631143, "loss": 0.9131, "step": 1388 }, { "epoch": 0.21224739274936014, "grad_norm": 0.3404789865016937, "learning_rate": 0.00018426979253573702, "loss": 0.5944, "step": 1389 }, { "epoch": 0.2124001986476678, "grad_norm": 0.24927204847335815, "learning_rate": 0.00018424290896005118, "loss": 0.6979, "step": 1390 }, { "epoch": 0.21255300454597548, "grad_norm": 0.29016798734664917, "learning_rate": 0.00018421600439595171, "loss": 0.7192, "step": 1391 }, { "epoch": 0.21270581044428316, "grad_norm": 0.27132588624954224, "learning_rate": 0.0001841890788501417, "loss": 0.7125, "step": 1392 }, { "epoch": 0.21285861634259082, "grad_norm": 0.2970811128616333, "learning_rate": 0.00018416213232932938, "loss": 0.8096, "step": 1393 }, { "epoch": 0.2130114222408985, "grad_norm": 0.3342524766921997, "learning_rate": 0.00018413516484022826, "loss": 0.716, "step": 1394 }, { "epoch": 0.21316422813920619, "grad_norm": 0.24330155551433563, "learning_rate": 0.0001841081763895571, "loss": 0.6139, "step": 1395 }, { "epoch": 0.21331703403751384, "grad_norm": 0.26770898699760437, "learning_rate": 0.0001840811669840398, "loss": 0.6953, "step": 1396 }, { "epoch": 0.21346983993582153, "grad_norm": 0.3319970965385437, "learning_rate": 0.0001840541366304055, "loss": 0.6463, "step": 1397 }, { "epoch": 0.2136226458341292, "grad_norm": 0.2949671447277069, "learning_rate": 0.0001840270853353887, "loss": 0.7701, "step": 1398 }, { "epoch": 0.21377545173243687, "grad_norm": 0.2886238396167755, "learning_rate": 0.0001840000131057289, "loss": 0.6067, "step": 1399 }, { "epoch": 0.21392825763074455, "grad_norm": 0.3149748146533966, "learning_rate": 0.00018397291994817097, "loss": 0.7083, "step": 1400 }, { "epoch": 0.21408106352905223, "grad_norm": 0.27100950479507446, "learning_rate": 0.0001839458058694649, "loss": 0.705, "step": 1401 }, { "epoch": 0.2142338694273599, "grad_norm": 0.29059261083602905, "learning_rate": 0.00018391867087636597, "loss": 0.6235, "step": 1402 }, { "epoch": 0.21438667532566758, "grad_norm": 0.30112072825431824, "learning_rate": 0.0001838915149756346, "loss": 1.0977, "step": 1403 }, { "epoch": 0.21453948122397526, "grad_norm": 0.31074076890945435, "learning_rate": 0.00018386433817403654, "loss": 0.7036, "step": 1404 }, { "epoch": 0.21469228712228292, "grad_norm": 0.2648938000202179, "learning_rate": 0.00018383714047834256, "loss": 0.6831, "step": 1405 }, { "epoch": 0.2148450930205906, "grad_norm": 0.33752021193504333, "learning_rate": 0.00018380992189532877, "loss": 0.6442, "step": 1406 }, { "epoch": 0.21499789891889826, "grad_norm": 0.25656089186668396, "learning_rate": 0.0001837826824317765, "loss": 0.7233, "step": 1407 }, { "epoch": 0.21515070481720594, "grad_norm": 0.3936312198638916, "learning_rate": 0.00018375542209447216, "loss": 0.9006, "step": 1408 }, { "epoch": 0.21530351071551362, "grad_norm": 0.2963830530643463, "learning_rate": 0.0001837281408902075, "loss": 0.7601, "step": 1409 }, { "epoch": 0.21545631661382128, "grad_norm": 0.39272454380989075, "learning_rate": 0.00018370083882577934, "loss": 0.6576, "step": 1410 }, { "epoch": 0.21560912251212896, "grad_norm": 0.28687795996665955, "learning_rate": 0.00018367351590798978, "loss": 0.7846, "step": 1411 }, { "epoch": 0.21576192841043665, "grad_norm": 0.2810840904712677, "learning_rate": 0.00018364617214364614, "loss": 0.684, "step": 1412 }, { "epoch": 0.2159147343087443, "grad_norm": 0.26902371644973755, "learning_rate": 0.00018361880753956083, "loss": 0.6707, "step": 1413 }, { "epoch": 0.216067540207052, "grad_norm": 0.27405041456222534, "learning_rate": 0.00018359142210255154, "loss": 0.6771, "step": 1414 }, { "epoch": 0.21622034610535967, "grad_norm": 0.25500932335853577, "learning_rate": 0.00018356401583944116, "loss": 0.7029, "step": 1415 }, { "epoch": 0.21637315200366733, "grad_norm": 0.2854139506816864, "learning_rate": 0.00018353658875705766, "loss": 0.6724, "step": 1416 }, { "epoch": 0.216525957901975, "grad_norm": 0.26606252789497375, "learning_rate": 0.0001835091408622343, "loss": 0.7796, "step": 1417 }, { "epoch": 0.2166787638002827, "grad_norm": 0.3369622826576233, "learning_rate": 0.00018348167216180952, "loss": 0.643, "step": 1418 }, { "epoch": 0.21683156969859035, "grad_norm": 0.40154317021369934, "learning_rate": 0.00018345418266262683, "loss": 0.6505, "step": 1419 }, { "epoch": 0.21698437559689804, "grad_norm": 0.2847399413585663, "learning_rate": 0.0001834266723715351, "loss": 0.8197, "step": 1420 }, { "epoch": 0.21713718149520572, "grad_norm": 0.3490140438079834, "learning_rate": 0.00018339914129538826, "loss": 0.655, "step": 1421 }, { "epoch": 0.21728998739351338, "grad_norm": 0.3189198076725006, "learning_rate": 0.0001833715894410454, "loss": 0.6823, "step": 1422 }, { "epoch": 0.21744279329182106, "grad_norm": 0.27019596099853516, "learning_rate": 0.00018334401681537093, "loss": 0.7576, "step": 1423 }, { "epoch": 0.21759559919012875, "grad_norm": 0.25581902265548706, "learning_rate": 0.00018331642342523424, "loss": 0.5591, "step": 1424 }, { "epoch": 0.2177484050884364, "grad_norm": 0.32097867131233215, "learning_rate": 0.00018328880927751003, "loss": 0.8419, "step": 1425 }, { "epoch": 0.2179012109867441, "grad_norm": 0.29979658126831055, "learning_rate": 0.00018326117437907815, "loss": 0.6647, "step": 1426 }, { "epoch": 0.21805401688505177, "grad_norm": 0.2586615979671478, "learning_rate": 0.00018323351873682358, "loss": 0.7442, "step": 1427 }, { "epoch": 0.21820682278335943, "grad_norm": 0.25333309173583984, "learning_rate": 0.0001832058423576365, "loss": 0.5425, "step": 1428 }, { "epoch": 0.2183596286816671, "grad_norm": 0.33443784713745117, "learning_rate": 0.00018317814524841224, "loss": 0.7438, "step": 1429 }, { "epoch": 0.2185124345799748, "grad_norm": 0.28940871357917786, "learning_rate": 0.00018315042741605132, "loss": 0.7608, "step": 1430 }, { "epoch": 0.21866524047828245, "grad_norm": 0.2656884491443634, "learning_rate": 0.0001831226888674594, "loss": 0.8089, "step": 1431 }, { "epoch": 0.21881804637659014, "grad_norm": 0.24992115795612335, "learning_rate": 0.0001830949296095473, "loss": 0.7727, "step": 1432 }, { "epoch": 0.21897085227489782, "grad_norm": 0.2528163492679596, "learning_rate": 0.00018306714964923097, "loss": 0.7669, "step": 1433 }, { "epoch": 0.21912365817320548, "grad_norm": 0.2704116106033325, "learning_rate": 0.00018303934899343161, "loss": 0.6762, "step": 1434 }, { "epoch": 0.21927646407151316, "grad_norm": 0.36805975437164307, "learning_rate": 0.00018301152764907554, "loss": 0.7063, "step": 1435 }, { "epoch": 0.21942926996982084, "grad_norm": 0.47136247158050537, "learning_rate": 0.00018298368562309414, "loss": 0.6372, "step": 1436 }, { "epoch": 0.2195820758681285, "grad_norm": 0.2653694152832031, "learning_rate": 0.00018295582292242405, "loss": 0.5245, "step": 1437 }, { "epoch": 0.21973488176643619, "grad_norm": 0.3250346779823303, "learning_rate": 0.00018292793955400702, "loss": 0.8513, "step": 1438 }, { "epoch": 0.21988768766474387, "grad_norm": 0.282665878534317, "learning_rate": 0.00018290003552479003, "loss": 0.712, "step": 1439 }, { "epoch": 0.22004049356305153, "grad_norm": 0.41571080684661865, "learning_rate": 0.000182872110841725, "loss": 0.7272, "step": 1440 }, { "epoch": 0.2201932994613592, "grad_norm": 0.27098286151885986, "learning_rate": 0.00018284416551176923, "loss": 0.775, "step": 1441 }, { "epoch": 0.2203461053596669, "grad_norm": 0.29416346549987793, "learning_rate": 0.00018281619954188506, "loss": 0.9036, "step": 1442 }, { "epoch": 0.22049891125797455, "grad_norm": 0.5548993349075317, "learning_rate": 0.0001827882129390399, "loss": 0.7088, "step": 1443 }, { "epoch": 0.22065171715628223, "grad_norm": 0.3549652695655823, "learning_rate": 0.00018276020571020646, "loss": 0.7721, "step": 1444 }, { "epoch": 0.22080452305458992, "grad_norm": 0.27087077498435974, "learning_rate": 0.0001827321778623625, "loss": 0.784, "step": 1445 }, { "epoch": 0.22095732895289757, "grad_norm": 0.2703557312488556, "learning_rate": 0.00018270412940249087, "loss": 0.7766, "step": 1446 }, { "epoch": 0.22111013485120526, "grad_norm": 0.2888347804546356, "learning_rate": 0.00018267606033757966, "loss": 0.5097, "step": 1447 }, { "epoch": 0.22126294074951294, "grad_norm": 0.2999062240123749, "learning_rate": 0.00018264797067462198, "loss": 0.7807, "step": 1448 }, { "epoch": 0.2214157466478206, "grad_norm": 0.27103522419929504, "learning_rate": 0.0001826198604206162, "loss": 0.6412, "step": 1449 }, { "epoch": 0.22156855254612828, "grad_norm": 0.3419981300830841, "learning_rate": 0.00018259172958256574, "loss": 0.6001, "step": 1450 }, { "epoch": 0.22172135844443597, "grad_norm": 0.2843935191631317, "learning_rate": 0.00018256357816747912, "loss": 0.6716, "step": 1451 }, { "epoch": 0.22187416434274362, "grad_norm": 0.30148133635520935, "learning_rate": 0.00018253540618237007, "loss": 0.5867, "step": 1452 }, { "epoch": 0.2220269702410513, "grad_norm": 0.30288344621658325, "learning_rate": 0.0001825072136342574, "loss": 0.8905, "step": 1453 }, { "epoch": 0.222179776139359, "grad_norm": 0.26123127341270447, "learning_rate": 0.00018247900053016504, "loss": 1.0583, "step": 1454 }, { "epoch": 0.22233258203766665, "grad_norm": 0.32431039214134216, "learning_rate": 0.00018245076687712204, "loss": 0.734, "step": 1455 }, { "epoch": 0.22248538793597433, "grad_norm": 0.3419983386993408, "learning_rate": 0.0001824225126821626, "loss": 0.6659, "step": 1456 }, { "epoch": 0.22263819383428202, "grad_norm": 0.2817968428134918, "learning_rate": 0.00018239423795232598, "loss": 0.8019, "step": 1457 }, { "epoch": 0.22279099973258967, "grad_norm": 0.287589430809021, "learning_rate": 0.0001823659426946566, "loss": 0.6464, "step": 1458 }, { "epoch": 0.22294380563089736, "grad_norm": 0.29114627838134766, "learning_rate": 0.00018233762691620403, "loss": 0.7999, "step": 1459 }, { "epoch": 0.22309661152920504, "grad_norm": 0.2640954852104187, "learning_rate": 0.00018230929062402286, "loss": 0.7596, "step": 1460 }, { "epoch": 0.2232494174275127, "grad_norm": 0.28602683544158936, "learning_rate": 0.00018228093382517284, "loss": 0.5454, "step": 1461 }, { "epoch": 0.22340222332582038, "grad_norm": 0.27925559878349304, "learning_rate": 0.00018225255652671888, "loss": 0.6297, "step": 1462 }, { "epoch": 0.22355502922412804, "grad_norm": 0.2729659676551819, "learning_rate": 0.0001822241587357309, "loss": 0.698, "step": 1463 }, { "epoch": 0.22370783512243572, "grad_norm": 0.2541782557964325, "learning_rate": 0.00018219574045928396, "loss": 0.701, "step": 1464 }, { "epoch": 0.2238606410207434, "grad_norm": 0.24185724556446075, "learning_rate": 0.00018216730170445827, "loss": 0.6249, "step": 1465 }, { "epoch": 0.22401344691905106, "grad_norm": 0.2809990346431732, "learning_rate": 0.00018213884247833908, "loss": 0.6986, "step": 1466 }, { "epoch": 0.22416625281735875, "grad_norm": 0.25735121965408325, "learning_rate": 0.00018211036278801678, "loss": 0.6038, "step": 1467 }, { "epoch": 0.22431905871566643, "grad_norm": 0.24584175646305084, "learning_rate": 0.00018208186264058687, "loss": 0.7304, "step": 1468 }, { "epoch": 0.2244718646139741, "grad_norm": 0.27485716342926025, "learning_rate": 0.00018205334204314988, "loss": 0.6952, "step": 1469 }, { "epoch": 0.22462467051228177, "grad_norm": 0.254385769367218, "learning_rate": 0.00018202480100281147, "loss": 0.7846, "step": 1470 }, { "epoch": 0.22477747641058946, "grad_norm": 0.263438880443573, "learning_rate": 0.00018199623952668245, "loss": 0.5202, "step": 1471 }, { "epoch": 0.2249302823088971, "grad_norm": 0.31777387857437134, "learning_rate": 0.0001819676576218787, "loss": 0.6698, "step": 1472 }, { "epoch": 0.2250830882072048, "grad_norm": 0.2673969566822052, "learning_rate": 0.00018193905529552103, "loss": 0.6729, "step": 1473 }, { "epoch": 0.22523589410551248, "grad_norm": 0.2530229091644287, "learning_rate": 0.0001819104325547356, "loss": 0.8424, "step": 1474 }, { "epoch": 0.22538870000382014, "grad_norm": 0.255082368850708, "learning_rate": 0.00018188178940665344, "loss": 0.7186, "step": 1475 }, { "epoch": 0.22554150590212782, "grad_norm": 0.2778492271900177, "learning_rate": 0.00018185312585841082, "loss": 0.6712, "step": 1476 }, { "epoch": 0.2256943118004355, "grad_norm": 0.29443123936653137, "learning_rate": 0.00018182444191714895, "loss": 0.6747, "step": 1477 }, { "epoch": 0.22584711769874316, "grad_norm": 0.2747706472873688, "learning_rate": 0.00018179573759001424, "loss": 0.6845, "step": 1478 }, { "epoch": 0.22599992359705084, "grad_norm": 0.24506688117980957, "learning_rate": 0.00018176701288415817, "loss": 0.6688, "step": 1479 }, { "epoch": 0.22615272949535853, "grad_norm": 0.29049941897392273, "learning_rate": 0.00018173826780673715, "loss": 0.9259, "step": 1480 }, { "epoch": 0.22630553539366619, "grad_norm": 0.2717791795730591, "learning_rate": 0.00018170950236491286, "loss": 0.6171, "step": 1481 }, { "epoch": 0.22645834129197387, "grad_norm": 0.504237174987793, "learning_rate": 0.00018168071656585194, "loss": 0.9517, "step": 1482 }, { "epoch": 0.22661114719028155, "grad_norm": 0.4023924171924591, "learning_rate": 0.00018165191041672615, "loss": 0.8926, "step": 1483 }, { "epoch": 0.2267639530885892, "grad_norm": 0.2688741683959961, "learning_rate": 0.0001816230839247123, "loss": 0.7705, "step": 1484 }, { "epoch": 0.2269167589868969, "grad_norm": 0.28259801864624023, "learning_rate": 0.00018159423709699222, "loss": 0.6859, "step": 1485 }, { "epoch": 0.22706956488520458, "grad_norm": 0.28821465373039246, "learning_rate": 0.00018156536994075288, "loss": 0.5653, "step": 1486 }, { "epoch": 0.22722237078351223, "grad_norm": 0.35280266404151917, "learning_rate": 0.00018153648246318634, "loss": 0.7832, "step": 1487 }, { "epoch": 0.22737517668181992, "grad_norm": 0.4009726345539093, "learning_rate": 0.0001815075746714896, "loss": 0.8749, "step": 1488 }, { "epoch": 0.2275279825801276, "grad_norm": 0.2834427058696747, "learning_rate": 0.00018147864657286483, "loss": 0.8026, "step": 1489 }, { "epoch": 0.22768078847843526, "grad_norm": 0.325809121131897, "learning_rate": 0.00018144969817451923, "loss": 0.6645, "step": 1490 }, { "epoch": 0.22783359437674294, "grad_norm": 0.273645281791687, "learning_rate": 0.00018142072948366505, "loss": 0.7575, "step": 1491 }, { "epoch": 0.22798640027505063, "grad_norm": 0.3082992434501648, "learning_rate": 0.00018139174050751957, "loss": 0.7648, "step": 1492 }, { "epoch": 0.22813920617335828, "grad_norm": 0.28899475932121277, "learning_rate": 0.00018136273125330513, "loss": 0.7882, "step": 1493 }, { "epoch": 0.22829201207166597, "grad_norm": 0.31805676221847534, "learning_rate": 0.0001813337017282492, "loss": 0.7491, "step": 1494 }, { "epoch": 0.22844481796997365, "grad_norm": 0.2605206072330475, "learning_rate": 0.00018130465193958424, "loss": 0.7592, "step": 1495 }, { "epoch": 0.2285976238682813, "grad_norm": 0.5190498232841492, "learning_rate": 0.00018127558189454774, "loss": 0.6756, "step": 1496 }, { "epoch": 0.228750429766589, "grad_norm": 0.286194384098053, "learning_rate": 0.00018124649160038226, "loss": 0.5045, "step": 1497 }, { "epoch": 0.22890323566489668, "grad_norm": 0.2897211015224457, "learning_rate": 0.00018121738106433537, "loss": 0.611, "step": 1498 }, { "epoch": 0.22905604156320433, "grad_norm": 0.26120197772979736, "learning_rate": 0.00018118825029365975, "loss": 0.6519, "step": 1499 }, { "epoch": 0.22920884746151202, "grad_norm": 0.32554882764816284, "learning_rate": 0.0001811590992956131, "loss": 0.9085, "step": 1500 }, { "epoch": 0.2293616533598197, "grad_norm": 0.26989874243736267, "learning_rate": 0.00018112992807745815, "loss": 0.7141, "step": 1501 }, { "epoch": 0.22951445925812736, "grad_norm": 0.28747060894966125, "learning_rate": 0.00018110073664646262, "loss": 0.7211, "step": 1502 }, { "epoch": 0.22966726515643504, "grad_norm": 0.22999897599220276, "learning_rate": 0.0001810715250098993, "loss": 0.6093, "step": 1503 }, { "epoch": 0.22982007105474273, "grad_norm": 0.31016895174980164, "learning_rate": 0.00018104229317504614, "loss": 0.715, "step": 1504 }, { "epoch": 0.22997287695305038, "grad_norm": 0.2531152069568634, "learning_rate": 0.00018101304114918583, "loss": 0.5904, "step": 1505 }, { "epoch": 0.23012568285135807, "grad_norm": 0.3257233798503876, "learning_rate": 0.00018098376893960642, "loss": 0.7489, "step": 1506 }, { "epoch": 0.23027848874966575, "grad_norm": 0.2525555491447449, "learning_rate": 0.00018095447655360077, "loss": 0.7849, "step": 1507 }, { "epoch": 0.2304312946479734, "grad_norm": 0.24588941037654877, "learning_rate": 0.00018092516399846682, "loss": 0.6703, "step": 1508 }, { "epoch": 0.2305841005462811, "grad_norm": 0.30313611030578613, "learning_rate": 0.0001808958312815076, "loss": 0.8151, "step": 1509 }, { "epoch": 0.23073690644458877, "grad_norm": 0.29434850811958313, "learning_rate": 0.00018086647841003103, "loss": 0.7981, "step": 1510 }, { "epoch": 0.23088971234289643, "grad_norm": 0.24499566853046417, "learning_rate": 0.0001808371053913502, "loss": 0.8504, "step": 1511 }, { "epoch": 0.23104251824120411, "grad_norm": 0.2640714943408966, "learning_rate": 0.00018080771223278315, "loss": 0.6601, "step": 1512 }, { "epoch": 0.2311953241395118, "grad_norm": 0.23578722774982452, "learning_rate": 0.00018077829894165288, "loss": 0.6778, "step": 1513 }, { "epoch": 0.23134813003781945, "grad_norm": 0.47748589515686035, "learning_rate": 0.00018074886552528753, "loss": 0.7285, "step": 1514 }, { "epoch": 0.23150093593612714, "grad_norm": 0.27540603280067444, "learning_rate": 0.00018071941199102013, "loss": 0.9043, "step": 1515 }, { "epoch": 0.2316537418344348, "grad_norm": 0.2582077980041504, "learning_rate": 0.00018068993834618883, "loss": 0.6843, "step": 1516 }, { "epoch": 0.23180654773274248, "grad_norm": 0.2842862010002136, "learning_rate": 0.0001806604445981367, "loss": 0.7826, "step": 1517 }, { "epoch": 0.23195935363105016, "grad_norm": 0.3156132698059082, "learning_rate": 0.0001806309307542119, "loss": 0.6503, "step": 1518 }, { "epoch": 0.23211215952935782, "grad_norm": 0.29756492376327515, "learning_rate": 0.00018060139682176754, "loss": 0.7223, "step": 1519 }, { "epoch": 0.2322649654276655, "grad_norm": 0.26929807662963867, "learning_rate": 0.00018057184280816175, "loss": 0.6358, "step": 1520 }, { "epoch": 0.2324177713259732, "grad_norm": 0.3058578670024872, "learning_rate": 0.00018054226872075768, "loss": 0.6521, "step": 1521 }, { "epoch": 0.23257057722428084, "grad_norm": 0.3043581247329712, "learning_rate": 0.00018051267456692345, "loss": 0.6487, "step": 1522 }, { "epoch": 0.23272338312258853, "grad_norm": 0.2621524930000305, "learning_rate": 0.00018048306035403216, "loss": 0.7336, "step": 1523 }, { "epoch": 0.2328761890208962, "grad_norm": 0.2857302129268646, "learning_rate": 0.000180453426089462, "loss": 0.7413, "step": 1524 }, { "epoch": 0.23302899491920387, "grad_norm": 0.3124992847442627, "learning_rate": 0.00018042377178059606, "loss": 0.83, "step": 1525 }, { "epoch": 0.23318180081751155, "grad_norm": 0.24599871039390564, "learning_rate": 0.0001803940974348225, "loss": 0.658, "step": 1526 }, { "epoch": 0.23333460671581924, "grad_norm": 0.2612040042877197, "learning_rate": 0.0001803644030595344, "loss": 0.6338, "step": 1527 }, { "epoch": 0.2334874126141269, "grad_norm": 0.3595271110534668, "learning_rate": 0.00018033468866212986, "loss": 0.6995, "step": 1528 }, { "epoch": 0.23364021851243458, "grad_norm": 0.32448646426200867, "learning_rate": 0.00018030495425001202, "loss": 0.6831, "step": 1529 }, { "epoch": 0.23379302441074226, "grad_norm": 0.3007851243019104, "learning_rate": 0.0001802751998305889, "loss": 0.6032, "step": 1530 }, { "epoch": 0.23394583030904992, "grad_norm": 0.2284546047449112, "learning_rate": 0.00018024542541127358, "loss": 0.6778, "step": 1531 }, { "epoch": 0.2340986362073576, "grad_norm": 0.24730284512043, "learning_rate": 0.00018021563099948414, "loss": 0.5785, "step": 1532 }, { "epoch": 0.2342514421056653, "grad_norm": 0.31631672382354736, "learning_rate": 0.0001801858166026436, "loss": 0.749, "step": 1533 }, { "epoch": 0.23440424800397294, "grad_norm": 0.30484116077423096, "learning_rate": 0.00018015598222817996, "loss": 0.656, "step": 1534 }, { "epoch": 0.23455705390228063, "grad_norm": 0.24168114364147186, "learning_rate": 0.00018012612788352616, "loss": 0.6987, "step": 1535 }, { "epoch": 0.2347098598005883, "grad_norm": 0.33276891708374023, "learning_rate": 0.00018009625357612023, "loss": 0.7676, "step": 1536 }, { "epoch": 0.23486266569889597, "grad_norm": 0.25853464007377625, "learning_rate": 0.00018006635931340506, "loss": 0.6653, "step": 1537 }, { "epoch": 0.23501547159720365, "grad_norm": 0.3082162141799927, "learning_rate": 0.00018003644510282855, "loss": 0.557, "step": 1538 }, { "epoch": 0.23516827749551134, "grad_norm": 0.4157916307449341, "learning_rate": 0.00018000651095184358, "loss": 0.5726, "step": 1539 }, { "epoch": 0.235321083393819, "grad_norm": 0.24570941925048828, "learning_rate": 0.00017997655686790803, "loss": 0.7184, "step": 1540 }, { "epoch": 0.23547388929212668, "grad_norm": 0.269633024930954, "learning_rate": 0.00017994658285848465, "loss": 0.5958, "step": 1541 }, { "epoch": 0.23562669519043436, "grad_norm": 0.24222281575202942, "learning_rate": 0.00017991658893104124, "loss": 0.7112, "step": 1542 }, { "epoch": 0.23577950108874202, "grad_norm": 0.26471802592277527, "learning_rate": 0.00017988657509305055, "loss": 0.799, "step": 1543 }, { "epoch": 0.2359323069870497, "grad_norm": 0.26221612095832825, "learning_rate": 0.00017985654135199027, "loss": 0.6478, "step": 1544 }, { "epoch": 0.23608511288535738, "grad_norm": 0.47572270035743713, "learning_rate": 0.00017982648771534306, "loss": 0.8253, "step": 1545 }, { "epoch": 0.23623791878366504, "grad_norm": 0.4527488052845001, "learning_rate": 0.00017979641419059648, "loss": 0.7986, "step": 1546 }, { "epoch": 0.23639072468197272, "grad_norm": 0.25220146775245667, "learning_rate": 0.0001797663207852432, "loss": 0.5181, "step": 1547 }, { "epoch": 0.2365435305802804, "grad_norm": 0.2821711599826813, "learning_rate": 0.00017973620750678059, "loss": 0.7455, "step": 1548 }, { "epoch": 0.23669633647858807, "grad_norm": 0.32014167308807373, "learning_rate": 0.00017970607436271126, "loss": 0.6829, "step": 1549 }, { "epoch": 0.23684914237689575, "grad_norm": 0.2855893671512604, "learning_rate": 0.00017967592136054257, "loss": 0.6884, "step": 1550 }, { "epoch": 0.23700194827520343, "grad_norm": 0.2573263645172119, "learning_rate": 0.00017964574850778687, "loss": 0.7325, "step": 1551 }, { "epoch": 0.2371547541735111, "grad_norm": 0.3028693199157715, "learning_rate": 0.0001796155558119615, "loss": 0.7879, "step": 1552 }, { "epoch": 0.23730756007181877, "grad_norm": 0.26804086565971375, "learning_rate": 0.00017958534328058872, "loss": 0.7159, "step": 1553 }, { "epoch": 0.23746036597012646, "grad_norm": 0.2625160217285156, "learning_rate": 0.0001795551109211957, "loss": 0.8026, "step": 1554 }, { "epoch": 0.23761317186843411, "grad_norm": 0.34923064708709717, "learning_rate": 0.00017952485874131463, "loss": 0.7361, "step": 1555 }, { "epoch": 0.2377659777667418, "grad_norm": 0.23876674473285675, "learning_rate": 0.00017949458674848255, "loss": 0.6431, "step": 1556 }, { "epoch": 0.23791878366504948, "grad_norm": 0.3087947964668274, "learning_rate": 0.00017946429495024145, "loss": 0.7473, "step": 1557 }, { "epoch": 0.23807158956335714, "grad_norm": 0.24753893911838531, "learning_rate": 0.00017943398335413835, "loss": 0.6258, "step": 1558 }, { "epoch": 0.23822439546166482, "grad_norm": 0.3573136627674103, "learning_rate": 0.00017940365196772508, "loss": 0.6592, "step": 1559 }, { "epoch": 0.2383772013599725, "grad_norm": 0.2909756004810333, "learning_rate": 0.00017937330079855843, "loss": 0.7145, "step": 1560 }, { "epoch": 0.23853000725828016, "grad_norm": 0.29025787115097046, "learning_rate": 0.00017934292985420015, "loss": 0.4892, "step": 1561 }, { "epoch": 0.23868281315658785, "grad_norm": 0.27839645743370056, "learning_rate": 0.00017931253914221698, "loss": 0.6972, "step": 1562 }, { "epoch": 0.23883561905489553, "grad_norm": 0.3256765604019165, "learning_rate": 0.00017928212867018042, "loss": 0.8926, "step": 1563 }, { "epoch": 0.2389884249532032, "grad_norm": 0.40683630108833313, "learning_rate": 0.000179251698445667, "loss": 0.7542, "step": 1564 }, { "epoch": 0.23914123085151087, "grad_norm": 0.3646388053894043, "learning_rate": 0.00017922124847625818, "loss": 0.6908, "step": 1565 }, { "epoch": 0.23929403674981856, "grad_norm": 0.30164778232574463, "learning_rate": 0.00017919077876954028, "loss": 0.7484, "step": 1566 }, { "epoch": 0.2394468426481262, "grad_norm": 0.2960456609725952, "learning_rate": 0.00017916028933310463, "loss": 0.5881, "step": 1567 }, { "epoch": 0.2395996485464339, "grad_norm": 0.3058547079563141, "learning_rate": 0.00017912978017454737, "loss": 0.6527, "step": 1568 }, { "epoch": 0.23975245444474158, "grad_norm": 0.286178857088089, "learning_rate": 0.00017909925130146962, "loss": 0.6846, "step": 1569 }, { "epoch": 0.23990526034304924, "grad_norm": 0.30656570196151733, "learning_rate": 0.00017906870272147742, "loss": 0.8488, "step": 1570 }, { "epoch": 0.24005806624135692, "grad_norm": 0.2935943305492401, "learning_rate": 0.0001790381344421816, "loss": 0.5972, "step": 1571 }, { "epoch": 0.24021087213966458, "grad_norm": 0.2677885591983795, "learning_rate": 0.0001790075464711981, "loss": 0.687, "step": 1572 }, { "epoch": 0.24036367803797226, "grad_norm": 0.34668412804603577, "learning_rate": 0.00017897693881614756, "loss": 0.6028, "step": 1573 }, { "epoch": 0.24051648393627995, "grad_norm": 0.2729659974575043, "learning_rate": 0.0001789463114846557, "loss": 0.5658, "step": 1574 }, { "epoch": 0.2406692898345876, "grad_norm": 0.30905503034591675, "learning_rate": 0.00017891566448435302, "loss": 0.7118, "step": 1575 }, { "epoch": 0.24082209573289529, "grad_norm": 0.3494151830673218, "learning_rate": 0.00017888499782287495, "loss": 0.6256, "step": 1576 }, { "epoch": 0.24097490163120297, "grad_norm": 0.2960670590400696, "learning_rate": 0.00017885431150786187, "loss": 0.6596, "step": 1577 }, { "epoch": 0.24112770752951063, "grad_norm": 0.3915059268474579, "learning_rate": 0.00017882360554695898, "loss": 0.7953, "step": 1578 }, { "epoch": 0.2412805134278183, "grad_norm": 0.2618657946586609, "learning_rate": 0.00017879287994781645, "loss": 0.5848, "step": 1579 }, { "epoch": 0.241433319326126, "grad_norm": 0.2906941771507263, "learning_rate": 0.0001787621347180893, "loss": 0.6394, "step": 1580 }, { "epoch": 0.24158612522443365, "grad_norm": 0.3870331346988678, "learning_rate": 0.00017873136986543744, "loss": 0.694, "step": 1581 }, { "epoch": 0.24173893112274134, "grad_norm": 0.3732616603374481, "learning_rate": 0.00017870058539752565, "loss": 0.9083, "step": 1582 }, { "epoch": 0.24189173702104902, "grad_norm": 0.2706056535243988, "learning_rate": 0.00017866978132202363, "loss": 0.6549, "step": 1583 }, { "epoch": 0.24204454291935668, "grad_norm": 0.24143487215042114, "learning_rate": 0.00017863895764660596, "loss": 0.5494, "step": 1584 }, { "epoch": 0.24219734881766436, "grad_norm": 0.3117775321006775, "learning_rate": 0.00017860811437895216, "loss": 0.5881, "step": 1585 }, { "epoch": 0.24235015471597204, "grad_norm": 0.31304922699928284, "learning_rate": 0.00017857725152674645, "loss": 0.7791, "step": 1586 }, { "epoch": 0.2425029606142797, "grad_norm": 0.2848494350910187, "learning_rate": 0.00017854636909767817, "loss": 0.755, "step": 1587 }, { "epoch": 0.24265576651258738, "grad_norm": 0.32094806432724, "learning_rate": 0.00017851546709944134, "loss": 0.9501, "step": 1588 }, { "epoch": 0.24280857241089507, "grad_norm": 0.3411880433559418, "learning_rate": 0.00017848454553973496, "loss": 0.839, "step": 1589 }, { "epoch": 0.24296137830920272, "grad_norm": 0.2581973969936371, "learning_rate": 0.00017845360442626289, "loss": 0.7196, "step": 1590 }, { "epoch": 0.2431141842075104, "grad_norm": 0.3275444507598877, "learning_rate": 0.00017842264376673384, "loss": 0.7177, "step": 1591 }, { "epoch": 0.2432669901058181, "grad_norm": 0.3695151209831238, "learning_rate": 0.0001783916635688614, "loss": 0.5328, "step": 1592 }, { "epoch": 0.24341979600412575, "grad_norm": 0.30539098381996155, "learning_rate": 0.000178360663840364, "loss": 0.7765, "step": 1593 }, { "epoch": 0.24357260190243343, "grad_norm": 1.2540498971939087, "learning_rate": 0.00017832964458896496, "loss": 0.5908, "step": 1594 }, { "epoch": 0.24372540780074112, "grad_norm": 0.33404994010925293, "learning_rate": 0.00017829860582239252, "loss": 0.6183, "step": 1595 }, { "epoch": 0.24387821369904877, "grad_norm": 0.2673746347427368, "learning_rate": 0.0001782675475483797, "loss": 0.6918, "step": 1596 }, { "epoch": 0.24403101959735646, "grad_norm": 0.3022105395793915, "learning_rate": 0.0001782364697746644, "loss": 0.7363, "step": 1597 }, { "epoch": 0.24418382549566414, "grad_norm": 0.28759056329727173, "learning_rate": 0.00017820537250898939, "loss": 0.6314, "step": 1598 }, { "epoch": 0.2443366313939718, "grad_norm": 0.30016323924064636, "learning_rate": 0.00017817425575910228, "loss": 0.5981, "step": 1599 }, { "epoch": 0.24448943729227948, "grad_norm": 0.3459952771663666, "learning_rate": 0.0001781431195327556, "loss": 0.6946, "step": 1600 }, { "epoch": 0.24464224319058717, "grad_norm": 0.2870761752128601, "learning_rate": 0.0001781119638377066, "loss": 1.0651, "step": 1601 }, { "epoch": 0.24479504908889482, "grad_norm": 0.2479250431060791, "learning_rate": 0.0001780807886817175, "loss": 0.7689, "step": 1602 }, { "epoch": 0.2449478549872025, "grad_norm": 0.28080081939697266, "learning_rate": 0.00017804959407255537, "loss": 0.7588, "step": 1603 }, { "epoch": 0.2451006608855102, "grad_norm": 0.36210957169532776, "learning_rate": 0.00017801838001799204, "loss": 0.9166, "step": 1604 }, { "epoch": 0.24525346678381785, "grad_norm": 0.3351990282535553, "learning_rate": 0.0001779871465258042, "loss": 0.5861, "step": 1605 }, { "epoch": 0.24540627268212553, "grad_norm": 0.2677927613258362, "learning_rate": 0.00017795589360377346, "loss": 0.7167, "step": 1606 }, { "epoch": 0.24555907858043322, "grad_norm": 0.32118043303489685, "learning_rate": 0.0001779246212596862, "loss": 0.6725, "step": 1607 }, { "epoch": 0.24571188447874087, "grad_norm": 0.34118232131004333, "learning_rate": 0.00017789332950133367, "loss": 0.6812, "step": 1608 }, { "epoch": 0.24586469037704856, "grad_norm": 0.3112337589263916, "learning_rate": 0.00017786201833651198, "loss": 0.7925, "step": 1609 }, { "epoch": 0.24601749627535624, "grad_norm": 0.2794199287891388, "learning_rate": 0.000177830687773022, "loss": 0.5572, "step": 1610 }, { "epoch": 0.2461703021736639, "grad_norm": 0.2742474675178528, "learning_rate": 0.0001777993378186695, "loss": 0.8369, "step": 1611 }, { "epoch": 0.24632310807197158, "grad_norm": 0.3400551378726959, "learning_rate": 0.00017776796848126503, "loss": 0.6672, "step": 1612 }, { "epoch": 0.24647591397027926, "grad_norm": 0.26672646403312683, "learning_rate": 0.00017773657976862399, "loss": 0.6773, "step": 1613 }, { "epoch": 0.24662871986858692, "grad_norm": 0.4063914120197296, "learning_rate": 0.0001777051716885667, "loss": 0.8437, "step": 1614 }, { "epoch": 0.2467815257668946, "grad_norm": 0.30643579363822937, "learning_rate": 0.00017767374424891813, "loss": 0.6577, "step": 1615 }, { "epoch": 0.2469343316652023, "grad_norm": 0.308155357837677, "learning_rate": 0.0001776422974575082, "loss": 0.7437, "step": 1616 }, { "epoch": 0.24708713756350995, "grad_norm": 0.2763682007789612, "learning_rate": 0.0001776108313221716, "loss": 0.6708, "step": 1617 }, { "epoch": 0.24723994346181763, "grad_norm": 0.2899835705757141, "learning_rate": 0.00017757934585074784, "loss": 0.6588, "step": 1618 }, { "epoch": 0.2473927493601253, "grad_norm": 0.2819088101387024, "learning_rate": 0.0001775478410510813, "loss": 0.6812, "step": 1619 }, { "epoch": 0.24754555525843297, "grad_norm": 0.31118282675743103, "learning_rate": 0.00017751631693102113, "loss": 0.8102, "step": 1620 }, { "epoch": 0.24769836115674065, "grad_norm": 0.37787777185440063, "learning_rate": 0.0001774847734984213, "loss": 0.4826, "step": 1621 }, { "epoch": 0.24785116705504834, "grad_norm": 0.2848077416419983, "learning_rate": 0.00017745321076114055, "loss": 0.7106, "step": 1622 }, { "epoch": 0.248003972953356, "grad_norm": 0.3185364902019501, "learning_rate": 0.0001774216287270425, "loss": 0.9623, "step": 1623 }, { "epoch": 0.24815677885166368, "grad_norm": 0.2377936691045761, "learning_rate": 0.00017739002740399556, "loss": 0.5686, "step": 1624 }, { "epoch": 0.24830958474997136, "grad_norm": 0.2903329133987427, "learning_rate": 0.0001773584067998729, "loss": 0.5424, "step": 1625 }, { "epoch": 0.24846239064827902, "grad_norm": 0.4083361327648163, "learning_rate": 0.0001773267669225526, "loss": 0.5707, "step": 1626 }, { "epoch": 0.2486151965465867, "grad_norm": 0.2921728789806366, "learning_rate": 0.00017729510777991737, "loss": 0.6263, "step": 1627 }, { "epoch": 0.24876800244489436, "grad_norm": 0.6114667057991028, "learning_rate": 0.0001772634293798549, "loss": 1.0645, "step": 1628 }, { "epoch": 0.24892080834320204, "grad_norm": 0.3961344361305237, "learning_rate": 0.00017723173173025755, "loss": 0.869, "step": 1629 }, { "epoch": 0.24907361424150973, "grad_norm": 0.26961538195610046, "learning_rate": 0.00017720001483902256, "loss": 0.7127, "step": 1630 }, { "epoch": 0.24922642013981738, "grad_norm": 0.3177333474159241, "learning_rate": 0.00017716827871405187, "loss": 0.6924, "step": 1631 }, { "epoch": 0.24937922603812507, "grad_norm": 0.3858714997768402, "learning_rate": 0.00017713652336325236, "loss": 0.6336, "step": 1632 }, { "epoch": 0.24953203193643275, "grad_norm": 0.311939537525177, "learning_rate": 0.00017710474879453552, "loss": 0.8036, "step": 1633 }, { "epoch": 0.2496848378347404, "grad_norm": 0.3360339105129242, "learning_rate": 0.0001770729550158178, "loss": 0.4666, "step": 1634 }, { "epoch": 0.2498376437330481, "grad_norm": 0.31603994965553284, "learning_rate": 0.00017704114203502023, "loss": 0.7722, "step": 1635 }, { "epoch": 0.24999044963135578, "grad_norm": 0.26751232147216797, "learning_rate": 0.0001770093098600689, "loss": 0.5519, "step": 1636 }, { "epoch": 0.25014325552966343, "grad_norm": 0.3818928301334381, "learning_rate": 0.00017697745849889443, "loss": 0.5669, "step": 1637 }, { "epoch": 0.2502960614279711, "grad_norm": 0.2536742091178894, "learning_rate": 0.00017694558795943233, "loss": 0.6771, "step": 1638 }, { "epoch": 0.2504488673262788, "grad_norm": 0.30967119336128235, "learning_rate": 0.0001769136982496229, "loss": 0.7453, "step": 1639 }, { "epoch": 0.2506016732245865, "grad_norm": 0.4818248152732849, "learning_rate": 0.00017688178937741116, "loss": 0.8101, "step": 1640 }, { "epoch": 0.25075447912289417, "grad_norm": 0.3688648045063019, "learning_rate": 0.000176849861350747, "loss": 0.9537, "step": 1641 }, { "epoch": 0.2509072850212018, "grad_norm": 0.3641200363636017, "learning_rate": 0.00017681791417758496, "loss": 0.6488, "step": 1642 }, { "epoch": 0.2510600909195095, "grad_norm": 0.2852065861225128, "learning_rate": 0.00017678594786588444, "loss": 0.6796, "step": 1643 }, { "epoch": 0.25121289681781717, "grad_norm": 0.2645616829395294, "learning_rate": 0.00017675396242360956, "loss": 0.6531, "step": 1644 }, { "epoch": 0.25136570271612485, "grad_norm": 0.31781354546546936, "learning_rate": 0.00017672195785872923, "loss": 0.7932, "step": 1645 }, { "epoch": 0.25151850861443253, "grad_norm": 0.2881321907043457, "learning_rate": 0.0001766899341792171, "loss": 0.7011, "step": 1646 }, { "epoch": 0.2516713145127402, "grad_norm": 0.30181893706321716, "learning_rate": 0.00017665789139305167, "loss": 0.6204, "step": 1647 }, { "epoch": 0.25182412041104785, "grad_norm": 0.31074196100234985, "learning_rate": 0.00017662582950821607, "loss": 0.5312, "step": 1648 }, { "epoch": 0.25197692630935553, "grad_norm": 0.30659812688827515, "learning_rate": 0.00017659374853269824, "loss": 0.7559, "step": 1649 }, { "epoch": 0.2521297322076632, "grad_norm": 0.27685531973838806, "learning_rate": 0.00017656164847449092, "loss": 0.8348, "step": 1650 }, { "epoch": 0.2522825381059709, "grad_norm": 0.33594951033592224, "learning_rate": 0.00017652952934159156, "loss": 0.9363, "step": 1651 }, { "epoch": 0.2524353440042786, "grad_norm": 0.26607444882392883, "learning_rate": 0.0001764973911420024, "loss": 0.8577, "step": 1652 }, { "epoch": 0.25258814990258627, "grad_norm": 0.36697816848754883, "learning_rate": 0.00017646523388373036, "loss": 0.8523, "step": 1653 }, { "epoch": 0.2527409558008939, "grad_norm": 0.34142768383026123, "learning_rate": 0.00017643305757478715, "loss": 0.7041, "step": 1654 }, { "epoch": 0.2528937616992016, "grad_norm": 0.27803367376327515, "learning_rate": 0.00017640086222318925, "loss": 0.7229, "step": 1655 }, { "epoch": 0.25304656759750926, "grad_norm": 0.3529926836490631, "learning_rate": 0.00017636864783695787, "loss": 0.6857, "step": 1656 }, { "epoch": 0.25319937349581695, "grad_norm": 0.28934431076049805, "learning_rate": 0.0001763364144241189, "loss": 0.572, "step": 1657 }, { "epoch": 0.25335217939412463, "grad_norm": 0.25978824496269226, "learning_rate": 0.0001763041619927031, "loss": 0.6781, "step": 1658 }, { "epoch": 0.25350498529243226, "grad_norm": 0.25913646817207336, "learning_rate": 0.00017627189055074584, "loss": 0.6003, "step": 1659 }, { "epoch": 0.25365779119073995, "grad_norm": 0.26361286640167236, "learning_rate": 0.0001762396001062873, "loss": 0.6378, "step": 1660 }, { "epoch": 0.25381059708904763, "grad_norm": 0.3600101172924042, "learning_rate": 0.00017620729066737236, "loss": 0.8028, "step": 1661 }, { "epoch": 0.2539634029873553, "grad_norm": 0.33632373809814453, "learning_rate": 0.00017617496224205069, "loss": 0.548, "step": 1662 }, { "epoch": 0.254116208885663, "grad_norm": 0.2929910123348236, "learning_rate": 0.00017614261483837656, "loss": 0.7541, "step": 1663 }, { "epoch": 0.2542690147839707, "grad_norm": 0.2633499503135681, "learning_rate": 0.00017611024846440911, "loss": 0.6927, "step": 1664 }, { "epoch": 0.2544218206822783, "grad_norm": 0.3387669026851654, "learning_rate": 0.00017607786312821215, "loss": 0.6464, "step": 1665 }, { "epoch": 0.254574626580586, "grad_norm": 0.27477720379829407, "learning_rate": 0.0001760454588378542, "loss": 0.7899, "step": 1666 }, { "epoch": 0.2547274324788937, "grad_norm": 0.28007772564888, "learning_rate": 0.00017601303560140855, "loss": 0.7528, "step": 1667 }, { "epoch": 0.25488023837720136, "grad_norm": 0.2639395594596863, "learning_rate": 0.00017598059342695312, "loss": 0.6548, "step": 1668 }, { "epoch": 0.25503304427550905, "grad_norm": 0.26824504137039185, "learning_rate": 0.00017594813232257067, "loss": 0.6124, "step": 1669 }, { "epoch": 0.25518585017381673, "grad_norm": 0.3445259928703308, "learning_rate": 0.00017591565229634857, "loss": 0.6439, "step": 1670 }, { "epoch": 0.25533865607212436, "grad_norm": 0.2897654175758362, "learning_rate": 0.00017588315335637894, "loss": 0.7828, "step": 1671 }, { "epoch": 0.25549146197043204, "grad_norm": 0.24861669540405273, "learning_rate": 0.00017585063551075862, "loss": 0.6605, "step": 1672 }, { "epoch": 0.2556442678687397, "grad_norm": 0.31479543447494507, "learning_rate": 0.00017581809876758922, "loss": 0.7128, "step": 1673 }, { "epoch": 0.2557970737670474, "grad_norm": 0.35827958583831787, "learning_rate": 0.0001757855431349769, "loss": 0.8827, "step": 1674 }, { "epoch": 0.2559498796653551, "grad_norm": 0.3290880024433136, "learning_rate": 0.0001757529686210327, "loss": 0.5662, "step": 1675 }, { "epoch": 0.2561026855636628, "grad_norm": 0.3016948699951172, "learning_rate": 0.00017572037523387227, "loss": 0.6582, "step": 1676 }, { "epoch": 0.2562554914619704, "grad_norm": 0.3011932969093323, "learning_rate": 0.0001756877629816159, "loss": 0.7446, "step": 1677 }, { "epoch": 0.2564082973602781, "grad_norm": 0.340775728225708, "learning_rate": 0.00017565513187238878, "loss": 0.5849, "step": 1678 }, { "epoch": 0.2565611032585858, "grad_norm": 0.28078779578208923, "learning_rate": 0.00017562248191432063, "loss": 0.6052, "step": 1679 }, { "epoch": 0.25671390915689346, "grad_norm": 0.5157023072242737, "learning_rate": 0.00017558981311554587, "loss": 0.7927, "step": 1680 }, { "epoch": 0.25686671505520114, "grad_norm": 0.9208202958106995, "learning_rate": 0.00017555712548420372, "loss": 0.5478, "step": 1681 }, { "epoch": 0.25701952095350883, "grad_norm": 0.2912384569644928, "learning_rate": 0.00017552441902843796, "loss": 0.6392, "step": 1682 }, { "epoch": 0.25717232685181646, "grad_norm": 0.2768346071243286, "learning_rate": 0.0001754916937563972, "loss": 0.7224, "step": 1683 }, { "epoch": 0.25732513275012414, "grad_norm": 0.25909632444381714, "learning_rate": 0.00017545894967623462, "loss": 0.7825, "step": 1684 }, { "epoch": 0.2574779386484318, "grad_norm": 0.2799522578716278, "learning_rate": 0.00017542618679610816, "loss": 0.6992, "step": 1685 }, { "epoch": 0.2576307445467395, "grad_norm": 0.3406999409198761, "learning_rate": 0.0001753934051241804, "loss": 0.6755, "step": 1686 }, { "epoch": 0.2577835504450472, "grad_norm": 0.3167582154273987, "learning_rate": 0.00017536060466861864, "loss": 0.9995, "step": 1687 }, { "epoch": 0.2579363563433549, "grad_norm": 0.3207745850086212, "learning_rate": 0.00017532778543759482, "loss": 0.6792, "step": 1688 }, { "epoch": 0.2580891622416625, "grad_norm": 0.3233521282672882, "learning_rate": 0.00017529494743928555, "loss": 0.4878, "step": 1689 }, { "epoch": 0.2582419681399702, "grad_norm": 0.34627631306648254, "learning_rate": 0.00017526209068187217, "loss": 0.8386, "step": 1690 }, { "epoch": 0.2583947740382779, "grad_norm": 0.28958991169929504, "learning_rate": 0.00017522921517354071, "loss": 0.6777, "step": 1691 }, { "epoch": 0.25854757993658556, "grad_norm": 0.28479400277137756, "learning_rate": 0.00017519632092248175, "loss": 0.5887, "step": 1692 }, { "epoch": 0.25870038583489324, "grad_norm": 0.3165437579154968, "learning_rate": 0.00017516340793689066, "loss": 0.7553, "step": 1693 }, { "epoch": 0.2588531917332009, "grad_norm": 0.40525293350219727, "learning_rate": 0.0001751304762249674, "loss": 0.8909, "step": 1694 }, { "epoch": 0.25900599763150856, "grad_norm": 0.28751781582832336, "learning_rate": 0.00017509752579491667, "loss": 0.8133, "step": 1695 }, { "epoch": 0.25915880352981624, "grad_norm": 0.2711454927921295, "learning_rate": 0.00017506455665494775, "loss": 0.7187, "step": 1696 }, { "epoch": 0.2593116094281239, "grad_norm": 0.3209768533706665, "learning_rate": 0.0001750315688132747, "loss": 0.8423, "step": 1697 }, { "epoch": 0.2594644153264316, "grad_norm": 0.24135245382785797, "learning_rate": 0.0001749985622781161, "loss": 0.5551, "step": 1698 }, { "epoch": 0.2596172212247393, "grad_norm": 0.2836229205131531, "learning_rate": 0.00017496553705769526, "loss": 0.6805, "step": 1699 }, { "epoch": 0.259770027123047, "grad_norm": 0.3115346431732178, "learning_rate": 0.00017493249316024011, "loss": 0.7877, "step": 1700 }, { "epoch": 0.2599228330213546, "grad_norm": 0.25913530588150024, "learning_rate": 0.00017489943059398333, "loss": 0.7332, "step": 1701 }, { "epoch": 0.2600756389196623, "grad_norm": 0.2903793454170227, "learning_rate": 0.0001748663493671621, "loss": 0.7419, "step": 1702 }, { "epoch": 0.26022844481797, "grad_norm": 0.3538905382156372, "learning_rate": 0.0001748332494880184, "loss": 0.7564, "step": 1703 }, { "epoch": 0.26038125071627766, "grad_norm": 0.3246188163757324, "learning_rate": 0.00017480013096479876, "loss": 0.7791, "step": 1704 }, { "epoch": 0.26053405661458534, "grad_norm": 0.26643890142440796, "learning_rate": 0.00017476699380575438, "loss": 0.6845, "step": 1705 }, { "epoch": 0.260686862512893, "grad_norm": 0.34562361240386963, "learning_rate": 0.00017473383801914108, "loss": 0.676, "step": 1706 }, { "epoch": 0.26083966841120065, "grad_norm": 0.27726852893829346, "learning_rate": 0.0001747006636132194, "loss": 0.6042, "step": 1707 }, { "epoch": 0.26099247430950834, "grad_norm": 0.3062208890914917, "learning_rate": 0.00017466747059625444, "loss": 0.64, "step": 1708 }, { "epoch": 0.261145280207816, "grad_norm": 0.25582143664360046, "learning_rate": 0.00017463425897651594, "loss": 0.5985, "step": 1709 }, { "epoch": 0.2612980861061237, "grad_norm": 0.3339386284351349, "learning_rate": 0.00017460102876227832, "loss": 0.6921, "step": 1710 }, { "epoch": 0.2614508920044314, "grad_norm": 0.28748372197151184, "learning_rate": 0.00017456777996182062, "loss": 0.605, "step": 1711 }, { "epoch": 0.261603697902739, "grad_norm": 0.4000266194343567, "learning_rate": 0.00017453451258342645, "loss": 0.866, "step": 1712 }, { "epoch": 0.2617565038010467, "grad_norm": 0.33299750089645386, "learning_rate": 0.00017450122663538415, "loss": 0.7733, "step": 1713 }, { "epoch": 0.2619093096993544, "grad_norm": 0.3416946530342102, "learning_rate": 0.0001744679221259866, "loss": 0.8105, "step": 1714 }, { "epoch": 0.26206211559766207, "grad_norm": 0.2502969205379486, "learning_rate": 0.0001744345990635314, "loss": 0.6472, "step": 1715 }, { "epoch": 0.26221492149596975, "grad_norm": 0.2692801058292389, "learning_rate": 0.0001744012574563206, "loss": 0.7379, "step": 1716 }, { "epoch": 0.26236772739427744, "grad_norm": 0.30326759815216064, "learning_rate": 0.0001743678973126611, "loss": 0.8629, "step": 1717 }, { "epoch": 0.26252053329258507, "grad_norm": 0.2786160409450531, "learning_rate": 0.0001743345186408642, "loss": 0.6748, "step": 1718 }, { "epoch": 0.26267333919089275, "grad_norm": 0.28507113456726074, "learning_rate": 0.000174301121449246, "loss": 0.5543, "step": 1719 }, { "epoch": 0.26282614508920044, "grad_norm": 0.2629023492336273, "learning_rate": 0.0001742677057461271, "loss": 0.8118, "step": 1720 }, { "epoch": 0.2629789509875081, "grad_norm": 0.24799314141273499, "learning_rate": 0.0001742342715398327, "loss": 0.6217, "step": 1721 }, { "epoch": 0.2631317568858158, "grad_norm": 0.30429476499557495, "learning_rate": 0.0001742008188386927, "loss": 0.739, "step": 1722 }, { "epoch": 0.2632845627841235, "grad_norm": 0.30273282527923584, "learning_rate": 0.00017416734765104156, "loss": 0.8007, "step": 1723 }, { "epoch": 0.2634373686824311, "grad_norm": 0.321262925863266, "learning_rate": 0.0001741338579852183, "loss": 0.6496, "step": 1724 }, { "epoch": 0.2635901745807388, "grad_norm": 0.31347712874412537, "learning_rate": 0.00017410034984956666, "loss": 0.6371, "step": 1725 }, { "epoch": 0.2637429804790465, "grad_norm": 0.33219581842422485, "learning_rate": 0.00017406682325243485, "loss": 0.8095, "step": 1726 }, { "epoch": 0.26389578637735417, "grad_norm": 0.3433677852153778, "learning_rate": 0.00017403327820217577, "loss": 0.7147, "step": 1727 }, { "epoch": 0.26404859227566185, "grad_norm": 0.34055739641189575, "learning_rate": 0.00017399971470714686, "loss": 0.6552, "step": 1728 }, { "epoch": 0.26420139817396954, "grad_norm": 0.3190424144268036, "learning_rate": 0.00017396613277571022, "loss": 0.8839, "step": 1729 }, { "epoch": 0.26435420407227717, "grad_norm": 0.32356637716293335, "learning_rate": 0.00017393253241623245, "loss": 0.7138, "step": 1730 }, { "epoch": 0.26450700997058485, "grad_norm": 0.2742416262626648, "learning_rate": 0.0001738989136370849, "loss": 0.6513, "step": 1731 }, { "epoch": 0.26465981586889253, "grad_norm": 0.2900165319442749, "learning_rate": 0.0001738652764466433, "loss": 0.7172, "step": 1732 }, { "epoch": 0.2648126217672002, "grad_norm": 0.2783643901348114, "learning_rate": 0.00017383162085328816, "loss": 0.6468, "step": 1733 }, { "epoch": 0.2649654276655079, "grad_norm": 0.38063931465148926, "learning_rate": 0.0001737979468654044, "loss": 0.6689, "step": 1734 }, { "epoch": 0.2651182335638156, "grad_norm": 0.43439793586730957, "learning_rate": 0.00017376425449138166, "loss": 0.4789, "step": 1735 }, { "epoch": 0.2652710394621232, "grad_norm": 0.30460643768310547, "learning_rate": 0.00017373054373961413, "loss": 0.7675, "step": 1736 }, { "epoch": 0.2654238453604309, "grad_norm": 0.3618842661380768, "learning_rate": 0.00017369681461850052, "loss": 0.5867, "step": 1737 }, { "epoch": 0.2655766512587386, "grad_norm": 0.3465817868709564, "learning_rate": 0.00017366306713644417, "loss": 0.8111, "step": 1738 }, { "epoch": 0.26572945715704627, "grad_norm": 0.37939634919166565, "learning_rate": 0.00017362930130185303, "loss": 0.599, "step": 1739 }, { "epoch": 0.26588226305535395, "grad_norm": 0.25240159034729004, "learning_rate": 0.0001735955171231395, "loss": 0.6037, "step": 1740 }, { "epoch": 0.26603506895366164, "grad_norm": 0.25819000601768494, "learning_rate": 0.00017356171460872064, "loss": 0.6909, "step": 1741 }, { "epoch": 0.26618787485196926, "grad_norm": 0.29703691601753235, "learning_rate": 0.0001735278937670181, "loss": 0.7321, "step": 1742 }, { "epoch": 0.26634068075027695, "grad_norm": 0.4220583438873291, "learning_rate": 0.00017349405460645806, "loss": 0.6388, "step": 1743 }, { "epoch": 0.26649348664858463, "grad_norm": 0.2786288857460022, "learning_rate": 0.00017346019713547123, "loss": 0.748, "step": 1744 }, { "epoch": 0.2666462925468923, "grad_norm": 0.2728956639766693, "learning_rate": 0.00017342632136249292, "loss": 0.4844, "step": 1745 }, { "epoch": 0.2667990984452, "grad_norm": 0.2649093270301819, "learning_rate": 0.000173392427295963, "loss": 0.6031, "step": 1746 }, { "epoch": 0.2669519043435077, "grad_norm": 0.4376051723957062, "learning_rate": 0.0001733585149443259, "loss": 0.7994, "step": 1747 }, { "epoch": 0.2671047102418153, "grad_norm": 0.42373695969581604, "learning_rate": 0.00017332458431603057, "loss": 0.7156, "step": 1748 }, { "epoch": 0.267257516140123, "grad_norm": 0.33878302574157715, "learning_rate": 0.0001732906354195306, "loss": 0.6929, "step": 1749 }, { "epoch": 0.2674103220384307, "grad_norm": 0.28887563943862915, "learning_rate": 0.000173256668263284, "loss": 0.4979, "step": 1750 }, { "epoch": 0.26756312793673837, "grad_norm": 0.3251109719276428, "learning_rate": 0.00017322268285575344, "loss": 0.6312, "step": 1751 }, { "epoch": 0.26771593383504605, "grad_norm": 0.2713668942451477, "learning_rate": 0.00017318867920540615, "loss": 0.7334, "step": 1752 }, { "epoch": 0.26786873973335373, "grad_norm": 0.2358706146478653, "learning_rate": 0.00017315465732071372, "loss": 0.908, "step": 1753 }, { "epoch": 0.26802154563166136, "grad_norm": 0.35049954056739807, "learning_rate": 0.00017312061721015253, "loss": 0.8059, "step": 1754 }, { "epoch": 0.26817435152996905, "grad_norm": 0.26363444328308105, "learning_rate": 0.00017308655888220335, "loss": 0.6745, "step": 1755 }, { "epoch": 0.26832715742827673, "grad_norm": 0.2871282696723938, "learning_rate": 0.00017305248234535158, "loss": 0.7254, "step": 1756 }, { "epoch": 0.2684799633265844, "grad_norm": 0.3954513669013977, "learning_rate": 0.00017301838760808697, "loss": 0.7484, "step": 1757 }, { "epoch": 0.2686327692248921, "grad_norm": 0.28392788767814636, "learning_rate": 0.00017298427467890405, "loss": 0.8204, "step": 1758 }, { "epoch": 0.2687855751231998, "grad_norm": 0.2613278329372406, "learning_rate": 0.00017295014356630178, "loss": 0.8254, "step": 1759 }, { "epoch": 0.2689383810215074, "grad_norm": 0.2831525504589081, "learning_rate": 0.00017291599427878357, "loss": 0.4994, "step": 1760 }, { "epoch": 0.2690911869198151, "grad_norm": 0.36036214232444763, "learning_rate": 0.00017288182682485747, "loss": 0.8176, "step": 1761 }, { "epoch": 0.2692439928181228, "grad_norm": 0.31184542179107666, "learning_rate": 0.00017284764121303602, "loss": 0.7208, "step": 1762 }, { "epoch": 0.26939679871643046, "grad_norm": 0.3088816702365875, "learning_rate": 0.00017281343745183622, "loss": 0.6944, "step": 1763 }, { "epoch": 0.26954960461473815, "grad_norm": 0.2538401484489441, "learning_rate": 0.0001727792155497797, "loss": 0.7502, "step": 1764 }, { "epoch": 0.2697024105130458, "grad_norm": 0.3166246712207794, "learning_rate": 0.00017274497551539257, "loss": 0.7718, "step": 1765 }, { "epoch": 0.26985521641135346, "grad_norm": 0.2860322892665863, "learning_rate": 0.00017271071735720542, "loss": 0.6644, "step": 1766 }, { "epoch": 0.27000802230966114, "grad_norm": 0.2913316488265991, "learning_rate": 0.0001726764410837534, "loss": 0.7526, "step": 1767 }, { "epoch": 0.27016082820796883, "grad_norm": 0.29444432258605957, "learning_rate": 0.00017264214670357616, "loss": 0.5704, "step": 1768 }, { "epoch": 0.2703136341062765, "grad_norm": 0.3528589904308319, "learning_rate": 0.00017260783422521785, "loss": 0.6162, "step": 1769 }, { "epoch": 0.2704664400045842, "grad_norm": 0.2790892720222473, "learning_rate": 0.0001725735036572271, "loss": 0.7002, "step": 1770 }, { "epoch": 0.2706192459028918, "grad_norm": 0.29821377992630005, "learning_rate": 0.00017253915500815712, "loss": 0.6549, "step": 1771 }, { "epoch": 0.2707720518011995, "grad_norm": 0.32526329159736633, "learning_rate": 0.00017250478828656558, "loss": 0.7888, "step": 1772 }, { "epoch": 0.2709248576995072, "grad_norm": 0.3157137334346771, "learning_rate": 0.0001724704035010147, "loss": 0.6242, "step": 1773 }, { "epoch": 0.2710776635978149, "grad_norm": 0.27002689242362976, "learning_rate": 0.00017243600066007105, "loss": 0.6096, "step": 1774 }, { "epoch": 0.27123046949612256, "grad_norm": 0.32272231578826904, "learning_rate": 0.00017240157977230593, "loss": 0.6981, "step": 1775 }, { "epoch": 0.27138327539443025, "grad_norm": 0.3192976713180542, "learning_rate": 0.00017236714084629498, "loss": 0.7044, "step": 1776 }, { "epoch": 0.2715360812927379, "grad_norm": 0.30380040407180786, "learning_rate": 0.0001723326838906183, "loss": 0.9246, "step": 1777 }, { "epoch": 0.27168888719104556, "grad_norm": 0.33051881194114685, "learning_rate": 0.00017229820891386064, "loss": 0.7069, "step": 1778 }, { "epoch": 0.27184169308935324, "grad_norm": 0.33114558458328247, "learning_rate": 0.00017226371592461113, "loss": 0.7682, "step": 1779 }, { "epoch": 0.2719944989876609, "grad_norm": 0.3122152090072632, "learning_rate": 0.00017222920493146338, "loss": 0.6132, "step": 1780 }, { "epoch": 0.2721473048859686, "grad_norm": 0.2902887761592865, "learning_rate": 0.00017219467594301553, "loss": 0.5294, "step": 1781 }, { "epoch": 0.2723001107842763, "grad_norm": 0.3151678740978241, "learning_rate": 0.0001721601289678702, "loss": 0.5898, "step": 1782 }, { "epoch": 0.2724529166825839, "grad_norm": 0.27645257115364075, "learning_rate": 0.00017212556401463447, "loss": 0.5349, "step": 1783 }, { "epoch": 0.2726057225808916, "grad_norm": 0.6515416502952576, "learning_rate": 0.00017209098109191988, "loss": 0.6182, "step": 1784 }, { "epoch": 0.2727585284791993, "grad_norm": 0.28953275084495544, "learning_rate": 0.0001720563802083425, "loss": 0.7238, "step": 1785 }, { "epoch": 0.272911334377507, "grad_norm": 0.3147642910480499, "learning_rate": 0.00017202176137252287, "loss": 0.8161, "step": 1786 }, { "epoch": 0.27306414027581466, "grad_norm": 0.2654118239879608, "learning_rate": 0.00017198712459308598, "loss": 0.7172, "step": 1787 }, { "epoch": 0.27321694617412234, "grad_norm": 0.2835211157798767, "learning_rate": 0.00017195246987866124, "loss": 0.6829, "step": 1788 }, { "epoch": 0.27336975207243, "grad_norm": 0.2858033776283264, "learning_rate": 0.00017191779723788262, "loss": 0.7478, "step": 1789 }, { "epoch": 0.27352255797073766, "grad_norm": 0.3065092861652374, "learning_rate": 0.00017188310667938853, "loss": 0.663, "step": 1790 }, { "epoch": 0.27367536386904534, "grad_norm": 0.2754859924316406, "learning_rate": 0.0001718483982118218, "loss": 0.6342, "step": 1791 }, { "epoch": 0.273828169767353, "grad_norm": 0.4353344440460205, "learning_rate": 0.00017181367184382977, "loss": 0.8865, "step": 1792 }, { "epoch": 0.2739809756656607, "grad_norm": 0.27528804540634155, "learning_rate": 0.00017177892758406425, "loss": 0.6648, "step": 1793 }, { "epoch": 0.2741337815639684, "grad_norm": 0.28295937180519104, "learning_rate": 0.0001717441654411814, "loss": 0.63, "step": 1794 }, { "epoch": 0.274286587462276, "grad_norm": 0.2904326915740967, "learning_rate": 0.00017170938542384202, "loss": 0.6147, "step": 1795 }, { "epoch": 0.2744393933605837, "grad_norm": 0.29448312520980835, "learning_rate": 0.00017167458754071118, "loss": 0.6123, "step": 1796 }, { "epoch": 0.2745921992588914, "grad_norm": 0.28427526354789734, "learning_rate": 0.00017163977180045855, "loss": 0.6606, "step": 1797 }, { "epoch": 0.2747450051571991, "grad_norm": 0.2858867943286896, "learning_rate": 0.00017160493821175807, "loss": 0.7894, "step": 1798 }, { "epoch": 0.27489781105550676, "grad_norm": 0.29473498463630676, "learning_rate": 0.00017157008678328833, "loss": 0.7398, "step": 1799 }, { "epoch": 0.27505061695381444, "grad_norm": 0.2681381106376648, "learning_rate": 0.00017153521752373227, "loss": 0.76, "step": 1800 }, { "epoch": 0.27520342285212207, "grad_norm": 0.30416882038116455, "learning_rate": 0.00017150033044177723, "loss": 0.8435, "step": 1801 }, { "epoch": 0.27535622875042975, "grad_norm": 0.2652147710323334, "learning_rate": 0.00017146542554611504, "loss": 0.7317, "step": 1802 }, { "epoch": 0.27550903464873744, "grad_norm": 0.3508474826812744, "learning_rate": 0.00017143050284544197, "loss": 0.8121, "step": 1803 }, { "epoch": 0.2756618405470451, "grad_norm": 0.3159068524837494, "learning_rate": 0.00017139556234845876, "loss": 0.6164, "step": 1804 }, { "epoch": 0.2758146464453528, "grad_norm": 0.311353862285614, "learning_rate": 0.00017136060406387044, "loss": 0.6027, "step": 1805 }, { "epoch": 0.2759674523436605, "grad_norm": 0.282478004693985, "learning_rate": 0.0001713256280003867, "loss": 0.8524, "step": 1806 }, { "epoch": 0.2761202582419681, "grad_norm": 0.30795755982398987, "learning_rate": 0.00017129063416672144, "loss": 0.8327, "step": 1807 }, { "epoch": 0.2762730641402758, "grad_norm": 0.33893677592277527, "learning_rate": 0.00017125562257159311, "loss": 0.7226, "step": 1808 }, { "epoch": 0.2764258700385835, "grad_norm": 0.3511805236339569, "learning_rate": 0.00017122059322372457, "loss": 0.6256, "step": 1809 }, { "epoch": 0.27657867593689117, "grad_norm": 0.31907960772514343, "learning_rate": 0.00017118554613184303, "loss": 0.8154, "step": 1810 }, { "epoch": 0.27673148183519886, "grad_norm": 0.301350861787796, "learning_rate": 0.00017115048130468026, "loss": 0.7192, "step": 1811 }, { "epoch": 0.27688428773350654, "grad_norm": 0.28029438853263855, "learning_rate": 0.0001711153987509723, "loss": 0.7313, "step": 1812 }, { "epoch": 0.27703709363181417, "grad_norm": 0.28119203448295593, "learning_rate": 0.00017108029847945973, "loss": 0.7761, "step": 1813 }, { "epoch": 0.27718989953012185, "grad_norm": 0.27862101793289185, "learning_rate": 0.00017104518049888742, "loss": 0.8314, "step": 1814 }, { "epoch": 0.27734270542842954, "grad_norm": 0.3129073977470398, "learning_rate": 0.00017101004481800478, "loss": 0.7194, "step": 1815 }, { "epoch": 0.2774955113267372, "grad_norm": 0.3475363552570343, "learning_rate": 0.00017097489144556557, "loss": 0.6641, "step": 1816 }, { "epoch": 0.2776483172250449, "grad_norm": 0.3343164324760437, "learning_rate": 0.0001709397203903279, "loss": 0.5883, "step": 1817 }, { "epoch": 0.2778011231233526, "grad_norm": 0.31171104311943054, "learning_rate": 0.0001709045316610544, "loss": 0.7454, "step": 1818 }, { "epoch": 0.2779539290216602, "grad_norm": 0.27940425276756287, "learning_rate": 0.00017086932526651203, "loss": 0.5857, "step": 1819 }, { "epoch": 0.2781067349199679, "grad_norm": 0.3223339021205902, "learning_rate": 0.00017083410121547217, "loss": 0.6367, "step": 1820 }, { "epoch": 0.2782595408182756, "grad_norm": 0.2605260908603668, "learning_rate": 0.00017079885951671057, "loss": 0.4917, "step": 1821 }, { "epoch": 0.27841234671658327, "grad_norm": 0.31542614102363586, "learning_rate": 0.00017076360017900742, "loss": 0.8394, "step": 1822 }, { "epoch": 0.27856515261489095, "grad_norm": 0.30797964334487915, "learning_rate": 0.0001707283232111473, "loss": 0.864, "step": 1823 }, { "epoch": 0.2787179585131986, "grad_norm": 0.3249169588088989, "learning_rate": 0.00017069302862191918, "loss": 0.8062, "step": 1824 }, { "epoch": 0.27887076441150627, "grad_norm": 0.2909657657146454, "learning_rate": 0.00017065771642011638, "loss": 0.6884, "step": 1825 }, { "epoch": 0.27902357030981395, "grad_norm": 0.3967931568622589, "learning_rate": 0.00017062238661453666, "loss": 0.9421, "step": 1826 }, { "epoch": 0.27917637620812163, "grad_norm": 0.26568347215652466, "learning_rate": 0.00017058703921398212, "loss": 0.7233, "step": 1827 }, { "epoch": 0.2793291821064293, "grad_norm": 0.29785382747650146, "learning_rate": 0.0001705516742272593, "loss": 0.7348, "step": 1828 }, { "epoch": 0.279481988004737, "grad_norm": 0.2735860347747803, "learning_rate": 0.00017051629166317907, "loss": 0.7623, "step": 1829 }, { "epoch": 0.27963479390304463, "grad_norm": 0.2826923131942749, "learning_rate": 0.0001704808915305567, "loss": 0.9977, "step": 1830 }, { "epoch": 0.2797875998013523, "grad_norm": 0.3087044656276703, "learning_rate": 0.00017044547383821183, "loss": 0.7793, "step": 1831 }, { "epoch": 0.27994040569966, "grad_norm": 0.2505679130554199, "learning_rate": 0.00017041003859496852, "loss": 0.6777, "step": 1832 }, { "epoch": 0.2800932115979677, "grad_norm": 0.34263527393341064, "learning_rate": 0.0001703745858096551, "loss": 0.7956, "step": 1833 }, { "epoch": 0.28024601749627537, "grad_norm": 0.35427922010421753, "learning_rate": 0.0001703391154911044, "loss": 0.7152, "step": 1834 }, { "epoch": 0.28039882339458305, "grad_norm": 0.37862199544906616, "learning_rate": 0.00017030362764815346, "loss": 0.8365, "step": 1835 }, { "epoch": 0.2805516292928907, "grad_norm": 0.3023863732814789, "learning_rate": 0.00017026812228964388, "loss": 0.6895, "step": 1836 }, { "epoch": 0.28070443519119836, "grad_norm": 0.2860608994960785, "learning_rate": 0.0001702325994244215, "loss": 0.6517, "step": 1837 }, { "epoch": 0.28085724108950605, "grad_norm": 0.36048266291618347, "learning_rate": 0.00017019705906133647, "loss": 0.722, "step": 1838 }, { "epoch": 0.28101004698781373, "grad_norm": 0.28945350646972656, "learning_rate": 0.0001701615012092435, "loss": 0.6221, "step": 1839 }, { "epoch": 0.2811628528861214, "grad_norm": 0.33494412899017334, "learning_rate": 0.00017012592587700137, "loss": 0.6345, "step": 1840 }, { "epoch": 0.2813156587844291, "grad_norm": 0.28341996669769287, "learning_rate": 0.00017009033307347353, "loss": 0.6094, "step": 1841 }, { "epoch": 0.28146846468273673, "grad_norm": 0.26581546664237976, "learning_rate": 0.00017005472280752753, "loss": 0.694, "step": 1842 }, { "epoch": 0.2816212705810444, "grad_norm": 0.44395822286605835, "learning_rate": 0.00017001909508803539, "loss": 0.87, "step": 1843 }, { "epoch": 0.2817740764793521, "grad_norm": 0.28351446986198425, "learning_rate": 0.00016998344992387348, "loss": 0.6607, "step": 1844 }, { "epoch": 0.2819268823776598, "grad_norm": 0.25666123628616333, "learning_rate": 0.0001699477873239225, "loss": 0.5578, "step": 1845 }, { "epoch": 0.28207968827596747, "grad_norm": 0.2706250548362732, "learning_rate": 0.00016991210729706743, "loss": 0.8191, "step": 1846 }, { "epoch": 0.28223249417427515, "grad_norm": 0.31308892369270325, "learning_rate": 0.0001698764098521977, "loss": 0.6986, "step": 1847 }, { "epoch": 0.2823853000725828, "grad_norm": 0.3809071183204651, "learning_rate": 0.00016984069499820703, "loss": 0.7085, "step": 1848 }, { "epoch": 0.28253810597089046, "grad_norm": 0.344176322221756, "learning_rate": 0.00016980496274399343, "loss": 0.8155, "step": 1849 }, { "epoch": 0.28269091186919815, "grad_norm": 0.2814493179321289, "learning_rate": 0.00016976921309845935, "loss": 0.7716, "step": 1850 }, { "epoch": 0.28284371776750583, "grad_norm": 0.2829038202762604, "learning_rate": 0.00016973344607051146, "loss": 0.6583, "step": 1851 }, { "epoch": 0.2829965236658135, "grad_norm": 0.33343979716300964, "learning_rate": 0.00016969766166906086, "loss": 0.5937, "step": 1852 }, { "epoch": 0.2831493295641212, "grad_norm": 0.30288052558898926, "learning_rate": 0.00016966185990302293, "loss": 0.6314, "step": 1853 }, { "epoch": 0.28330213546242883, "grad_norm": 0.31784963607788086, "learning_rate": 0.00016962604078131732, "loss": 0.7495, "step": 1854 }, { "epoch": 0.2834549413607365, "grad_norm": 0.34505629539489746, "learning_rate": 0.00016959020431286815, "loss": 0.5472, "step": 1855 }, { "epoch": 0.2836077472590442, "grad_norm": 0.33716028928756714, "learning_rate": 0.0001695543505066037, "loss": 0.6097, "step": 1856 }, { "epoch": 0.2837605531573519, "grad_norm": 0.25437086820602417, "learning_rate": 0.0001695184793714567, "loss": 0.8513, "step": 1857 }, { "epoch": 0.28391335905565956, "grad_norm": 0.3078169524669647, "learning_rate": 0.00016948259091636411, "loss": 0.6392, "step": 1858 }, { "epoch": 0.28406616495396725, "grad_norm": 0.3437117338180542, "learning_rate": 0.00016944668515026724, "loss": 0.6377, "step": 1859 }, { "epoch": 0.2842189708522749, "grad_norm": 0.34416788816452026, "learning_rate": 0.00016941076208211176, "loss": 0.7733, "step": 1860 }, { "epoch": 0.28437177675058256, "grad_norm": 0.24578352272510529, "learning_rate": 0.0001693748217208475, "loss": 0.6101, "step": 1861 }, { "epoch": 0.28452458264889025, "grad_norm": 0.2477305680513382, "learning_rate": 0.00016933886407542877, "loss": 0.6225, "step": 1862 }, { "epoch": 0.28467738854719793, "grad_norm": 0.31731775403022766, "learning_rate": 0.00016930288915481412, "loss": 0.6381, "step": 1863 }, { "epoch": 0.2848301944455056, "grad_norm": 0.28951868414878845, "learning_rate": 0.00016926689696796638, "loss": 0.6691, "step": 1864 }, { "epoch": 0.2849830003438133, "grad_norm": 0.25854676961898804, "learning_rate": 0.0001692308875238527, "loss": 0.6969, "step": 1865 }, { "epoch": 0.2851358062421209, "grad_norm": 0.287022203207016, "learning_rate": 0.00016919486083144455, "loss": 0.5745, "step": 1866 }, { "epoch": 0.2852886121404286, "grad_norm": 0.2648172080516815, "learning_rate": 0.00016915881689971764, "loss": 0.6962, "step": 1867 }, { "epoch": 0.2854414180387363, "grad_norm": 0.3037422001361847, "learning_rate": 0.00016912275573765205, "loss": 0.5824, "step": 1868 }, { "epoch": 0.285594223937044, "grad_norm": 0.27793166041374207, "learning_rate": 0.00016908667735423207, "loss": 0.5969, "step": 1869 }, { "epoch": 0.28574702983535166, "grad_norm": 0.28627026081085205, "learning_rate": 0.0001690505817584464, "loss": 0.6728, "step": 1870 }, { "epoch": 0.28589983573365935, "grad_norm": 0.26689401268959045, "learning_rate": 0.0001690144689592879, "loss": 0.5442, "step": 1871 }, { "epoch": 0.286052641631967, "grad_norm": 0.24917180836200714, "learning_rate": 0.00016897833896575376, "loss": 0.6467, "step": 1872 }, { "epoch": 0.28620544753027466, "grad_norm": 0.5315676927566528, "learning_rate": 0.0001689421917868455, "loss": 0.7294, "step": 1873 }, { "epoch": 0.28635825342858234, "grad_norm": 0.27206841111183167, "learning_rate": 0.00016890602743156887, "loss": 0.7553, "step": 1874 }, { "epoch": 0.28651105932689, "grad_norm": 0.3734266757965088, "learning_rate": 0.00016886984590893395, "loss": 0.7681, "step": 1875 }, { "epoch": 0.2866638652251977, "grad_norm": 0.32891905307769775, "learning_rate": 0.000168833647227955, "loss": 0.5659, "step": 1876 }, { "epoch": 0.28681667112350534, "grad_norm": 0.2967846989631653, "learning_rate": 0.00016879743139765066, "loss": 0.8604, "step": 1877 }, { "epoch": 0.286969477021813, "grad_norm": 0.31898820400238037, "learning_rate": 0.00016876119842704378, "loss": 0.6555, "step": 1878 }, { "epoch": 0.2871222829201207, "grad_norm": 0.2863745093345642, "learning_rate": 0.00016872494832516151, "loss": 0.5857, "step": 1879 }, { "epoch": 0.2872750888184284, "grad_norm": 0.27155035734176636, "learning_rate": 0.00016868868110103528, "loss": 0.7242, "step": 1880 }, { "epoch": 0.2874278947167361, "grad_norm": 0.35988694429397583, "learning_rate": 0.00016865239676370074, "loss": 0.5611, "step": 1881 }, { "epoch": 0.28758070061504376, "grad_norm": 0.28384700417518616, "learning_rate": 0.00016861609532219782, "loss": 0.7728, "step": 1882 }, { "epoch": 0.2877335065133514, "grad_norm": 0.2415805160999298, "learning_rate": 0.00016857977678557073, "loss": 0.707, "step": 1883 }, { "epoch": 0.2878863124116591, "grad_norm": 0.3154737949371338, "learning_rate": 0.0001685434411628679, "loss": 0.667, "step": 1884 }, { "epoch": 0.28803911830996676, "grad_norm": 0.6415541768074036, "learning_rate": 0.00016850708846314214, "loss": 0.8504, "step": 1885 }, { "epoch": 0.28819192420827444, "grad_norm": 0.28032657504081726, "learning_rate": 0.0001684707186954503, "loss": 0.7435, "step": 1886 }, { "epoch": 0.2883447301065821, "grad_norm": 0.355499804019928, "learning_rate": 0.0001684343318688537, "loss": 0.6103, "step": 1887 }, { "epoch": 0.2884975360048898, "grad_norm": 0.27321726083755493, "learning_rate": 0.00016839792799241773, "loss": 0.6049, "step": 1888 }, { "epoch": 0.28865034190319744, "grad_norm": 0.3146701157093048, "learning_rate": 0.00016836150707521218, "loss": 0.6746, "step": 1889 }, { "epoch": 0.2888031478015051, "grad_norm": 0.28722837567329407, "learning_rate": 0.00016832506912631097, "loss": 0.531, "step": 1890 }, { "epoch": 0.2889559536998128, "grad_norm": 0.24441641569137573, "learning_rate": 0.00016828861415479234, "loss": 0.5921, "step": 1891 }, { "epoch": 0.2891087595981205, "grad_norm": 0.35211068391799927, "learning_rate": 0.00016825214216973874, "loss": 0.6858, "step": 1892 }, { "epoch": 0.2892615654964282, "grad_norm": 0.3029313385486603, "learning_rate": 0.00016821565318023677, "loss": 0.8322, "step": 1893 }, { "epoch": 0.28941437139473586, "grad_norm": 0.5261650681495667, "learning_rate": 0.00016817914719537749, "loss": 0.6593, "step": 1894 }, { "epoch": 0.2895671772930435, "grad_norm": 0.25222522020339966, "learning_rate": 0.00016814262422425597, "loss": 0.6869, "step": 1895 }, { "epoch": 0.28971998319135117, "grad_norm": 0.28435593843460083, "learning_rate": 0.00016810608427597162, "loss": 0.7868, "step": 1896 }, { "epoch": 0.28987278908965886, "grad_norm": 0.3061954975128174, "learning_rate": 0.0001680695273596281, "loss": 0.5183, "step": 1897 }, { "epoch": 0.29002559498796654, "grad_norm": 0.2951694130897522, "learning_rate": 0.00016803295348433324, "loss": 0.8351, "step": 1898 }, { "epoch": 0.2901784008862742, "grad_norm": 0.2741797864437103, "learning_rate": 0.00016799636265919912, "loss": 0.5841, "step": 1899 }, { "epoch": 0.2903312067845819, "grad_norm": 0.31489941477775574, "learning_rate": 0.00016795975489334195, "loss": 0.4929, "step": 1900 }, { "epoch": 0.29048401268288954, "grad_norm": 0.3178277313709259, "learning_rate": 0.0001679231301958824, "loss": 0.9264, "step": 1901 }, { "epoch": 0.2906368185811972, "grad_norm": 0.3057640790939331, "learning_rate": 0.00016788648857594507, "loss": 0.76, "step": 1902 }, { "epoch": 0.2907896244795049, "grad_norm": 0.24972616136074066, "learning_rate": 0.00016784983004265898, "loss": 0.7661, "step": 1903 }, { "epoch": 0.2909424303778126, "grad_norm": 0.4688352644443512, "learning_rate": 0.00016781315460515726, "loss": 0.5192, "step": 1904 }, { "epoch": 0.2910952362761203, "grad_norm": 0.2574828863143921, "learning_rate": 0.00016777646227257736, "loss": 0.6807, "step": 1905 }, { "epoch": 0.29124804217442796, "grad_norm": 0.2910616993904114, "learning_rate": 0.0001677397530540608, "loss": 0.6761, "step": 1906 }, { "epoch": 0.2914008480727356, "grad_norm": 0.3315010964870453, "learning_rate": 0.00016770302695875335, "loss": 0.6879, "step": 1907 }, { "epoch": 0.29155365397104327, "grad_norm": 0.2516402304172516, "learning_rate": 0.0001676662839958051, "loss": 0.6013, "step": 1908 }, { "epoch": 0.29170645986935095, "grad_norm": 0.30346551537513733, "learning_rate": 0.00016762952417437017, "loss": 0.5684, "step": 1909 }, { "epoch": 0.29185926576765864, "grad_norm": 0.3043062686920166, "learning_rate": 0.00016759274750360702, "loss": 0.7597, "step": 1910 }, { "epoch": 0.2920120716659663, "grad_norm": 0.34094980359077454, "learning_rate": 0.00016755595399267818, "loss": 0.6636, "step": 1911 }, { "epoch": 0.292164877564274, "grad_norm": 0.4071411192417145, "learning_rate": 0.0001675191436507505, "loss": 0.7575, "step": 1912 }, { "epoch": 0.29231768346258163, "grad_norm": 0.4260135591030121, "learning_rate": 0.00016748231648699497, "loss": 0.6146, "step": 1913 }, { "epoch": 0.2924704893608893, "grad_norm": 0.3323768377304077, "learning_rate": 0.00016744547251058674, "loss": 0.8563, "step": 1914 }, { "epoch": 0.292623295259197, "grad_norm": 0.27061089873313904, "learning_rate": 0.0001674086117307052, "loss": 0.7546, "step": 1915 }, { "epoch": 0.2927761011575047, "grad_norm": 0.2773573696613312, "learning_rate": 0.00016737173415653386, "loss": 0.6676, "step": 1916 }, { "epoch": 0.29292890705581237, "grad_norm": 0.34023284912109375, "learning_rate": 0.0001673348397972605, "loss": 0.6807, "step": 1917 }, { "epoch": 0.29308171295412005, "grad_norm": 0.2626526951789856, "learning_rate": 0.00016729792866207704, "loss": 0.6535, "step": 1918 }, { "epoch": 0.2932345188524277, "grad_norm": 0.27187174558639526, "learning_rate": 0.00016726100076017955, "loss": 0.6563, "step": 1919 }, { "epoch": 0.29338732475073537, "grad_norm": 0.26734450459480286, "learning_rate": 0.00016722405610076834, "loss": 0.6657, "step": 1920 }, { "epoch": 0.29354013064904305, "grad_norm": 0.26832592487335205, "learning_rate": 0.00016718709469304787, "loss": 0.6849, "step": 1921 }, { "epoch": 0.29369293654735074, "grad_norm": 0.32193028926849365, "learning_rate": 0.00016715011654622671, "loss": 0.8245, "step": 1922 }, { "epoch": 0.2938457424456584, "grad_norm": 0.28302088379859924, "learning_rate": 0.00016711312166951768, "loss": 0.6771, "step": 1923 }, { "epoch": 0.2939985483439661, "grad_norm": 0.33167263865470886, "learning_rate": 0.0001670761100721378, "loss": 0.7652, "step": 1924 }, { "epoch": 0.29415135424227373, "grad_norm": 0.2596791982650757, "learning_rate": 0.0001670390817633081, "loss": 0.9119, "step": 1925 }, { "epoch": 0.2943041601405814, "grad_norm": 0.34436488151550293, "learning_rate": 0.00016700203675225393, "loss": 0.4812, "step": 1926 }, { "epoch": 0.2944569660388891, "grad_norm": 0.23007529973983765, "learning_rate": 0.00016696497504820474, "loss": 0.7726, "step": 1927 }, { "epoch": 0.2946097719371968, "grad_norm": 0.3009369969367981, "learning_rate": 0.00016692789666039416, "loss": 0.8195, "step": 1928 }, { "epoch": 0.29476257783550447, "grad_norm": 0.36392152309417725, "learning_rate": 0.0001668908015980599, "loss": 0.7136, "step": 1929 }, { "epoch": 0.2949153837338121, "grad_norm": 0.27673837542533875, "learning_rate": 0.00016685368987044393, "loss": 0.8112, "step": 1930 }, { "epoch": 0.2950681896321198, "grad_norm": 0.22855332493782043, "learning_rate": 0.00016681656148679233, "loss": 0.569, "step": 1931 }, { "epoch": 0.29522099553042747, "grad_norm": 0.27440088987350464, "learning_rate": 0.00016677941645635528, "loss": 0.6439, "step": 1932 }, { "epoch": 0.29537380142873515, "grad_norm": 0.27212488651275635, "learning_rate": 0.00016674225478838724, "loss": 0.6632, "step": 1933 }, { "epoch": 0.29552660732704283, "grad_norm": 0.35999348759651184, "learning_rate": 0.00016670507649214658, "loss": 0.7135, "step": 1934 }, { "epoch": 0.2956794132253505, "grad_norm": 0.3128264248371124, "learning_rate": 0.00016666788157689615, "loss": 0.7247, "step": 1935 }, { "epoch": 0.29583221912365815, "grad_norm": 0.4011903405189514, "learning_rate": 0.00016663067005190255, "loss": 0.6513, "step": 1936 }, { "epoch": 0.29598502502196583, "grad_norm": 0.3012496531009674, "learning_rate": 0.00016659344192643691, "loss": 0.9517, "step": 1937 }, { "epoch": 0.2961378309202735, "grad_norm": 0.30148524045944214, "learning_rate": 0.00016655619720977417, "loss": 0.7281, "step": 1938 }, { "epoch": 0.2962906368185812, "grad_norm": 0.2771994471549988, "learning_rate": 0.00016651893591119362, "loss": 0.5911, "step": 1939 }, { "epoch": 0.2964434427168889, "grad_norm": 0.3425995111465454, "learning_rate": 0.00016648165803997853, "loss": 0.6915, "step": 1940 }, { "epoch": 0.29659624861519657, "grad_norm": 0.2747991681098938, "learning_rate": 0.00016644436360541639, "loss": 0.8483, "step": 1941 }, { "epoch": 0.2967490545135042, "grad_norm": 0.2685058116912842, "learning_rate": 0.00016640705261679887, "loss": 0.7119, "step": 1942 }, { "epoch": 0.2969018604118119, "grad_norm": 0.4401903748512268, "learning_rate": 0.00016636972508342156, "loss": 0.6441, "step": 1943 }, { "epoch": 0.29705466631011956, "grad_norm": 0.3246113657951355, "learning_rate": 0.0001663323810145844, "loss": 0.6992, "step": 1944 }, { "epoch": 0.29720747220842725, "grad_norm": 0.2685340642929077, "learning_rate": 0.00016629502041959132, "loss": 0.6728, "step": 1945 }, { "epoch": 0.29736027810673493, "grad_norm": 0.3675488233566284, "learning_rate": 0.0001662576433077504, "loss": 0.814, "step": 1946 }, { "epoch": 0.2975130840050426, "grad_norm": 0.3309258818626404, "learning_rate": 0.0001662202496883738, "loss": 0.7315, "step": 1947 }, { "epoch": 0.29766588990335024, "grad_norm": 0.308794766664505, "learning_rate": 0.0001661828395707779, "loss": 0.7723, "step": 1948 }, { "epoch": 0.29781869580165793, "grad_norm": 0.3159720301628113, "learning_rate": 0.00016614541296428308, "loss": 0.742, "step": 1949 }, { "epoch": 0.2979715016999656, "grad_norm": 0.29673030972480774, "learning_rate": 0.0001661079698782138, "loss": 0.8073, "step": 1950 }, { "epoch": 0.2981243075982733, "grad_norm": 0.37807202339172363, "learning_rate": 0.00016607051032189882, "loss": 0.6621, "step": 1951 }, { "epoch": 0.298277113496581, "grad_norm": 0.31420621275901794, "learning_rate": 0.00016603303430467076, "loss": 0.6614, "step": 1952 }, { "epoch": 0.29842991939488867, "grad_norm": 0.2999773621559143, "learning_rate": 0.0001659955418358665, "loss": 0.668, "step": 1953 }, { "epoch": 0.2985827252931963, "grad_norm": 0.28562691807746887, "learning_rate": 0.00016595803292482702, "loss": 0.6569, "step": 1954 }, { "epoch": 0.298735531191504, "grad_norm": 0.3034481406211853, "learning_rate": 0.00016592050758089727, "loss": 0.7714, "step": 1955 }, { "epoch": 0.29888833708981166, "grad_norm": 0.2904307246208191, "learning_rate": 0.00016588296581342645, "loss": 0.67, "step": 1956 }, { "epoch": 0.29904114298811935, "grad_norm": 0.28086069226264954, "learning_rate": 0.0001658454076317677, "loss": 0.6636, "step": 1957 }, { "epoch": 0.29919394888642703, "grad_norm": 0.46149349212646484, "learning_rate": 0.00016580783304527837, "loss": 0.672, "step": 1958 }, { "epoch": 0.2993467547847347, "grad_norm": 0.29076477885246277, "learning_rate": 0.0001657702420633198, "loss": 0.653, "step": 1959 }, { "epoch": 0.29949956068304234, "grad_norm": 0.3762834370136261, "learning_rate": 0.00016573263469525754, "loss": 0.6253, "step": 1960 }, { "epoch": 0.29965236658135, "grad_norm": 0.36436668038368225, "learning_rate": 0.00016569501095046115, "loss": 0.6948, "step": 1961 }, { "epoch": 0.2998051724796577, "grad_norm": 0.2805554270744324, "learning_rate": 0.00016565737083830423, "loss": 0.6422, "step": 1962 }, { "epoch": 0.2999579783779654, "grad_norm": 0.3336206376552582, "learning_rate": 0.0001656197143681645, "loss": 0.7162, "step": 1963 }, { "epoch": 0.3001107842762731, "grad_norm": 0.2919718027114868, "learning_rate": 0.00016558204154942376, "loss": 0.6768, "step": 1964 }, { "epoch": 0.30026359017458076, "grad_norm": 0.282857209444046, "learning_rate": 0.0001655443523914679, "loss": 0.6641, "step": 1965 }, { "epoch": 0.3004163960728884, "grad_norm": 0.301281601190567, "learning_rate": 0.0001655066469036868, "loss": 0.6862, "step": 1966 }, { "epoch": 0.3005692019711961, "grad_norm": 0.2959330081939697, "learning_rate": 0.00016546892509547453, "loss": 0.7397, "step": 1967 }, { "epoch": 0.30072200786950376, "grad_norm": 0.34630370140075684, "learning_rate": 0.0001654311869762291, "loss": 0.6867, "step": 1968 }, { "epoch": 0.30087481376781144, "grad_norm": 0.30559927225112915, "learning_rate": 0.00016539343255535274, "loss": 0.7231, "step": 1969 }, { "epoch": 0.30102761966611913, "grad_norm": 0.4635114073753357, "learning_rate": 0.00016535566184225155, "loss": 0.6228, "step": 1970 }, { "epoch": 0.3011804255644268, "grad_norm": 0.2548908591270447, "learning_rate": 0.0001653178748463358, "loss": 0.7538, "step": 1971 }, { "epoch": 0.30133323146273444, "grad_norm": 0.29736220836639404, "learning_rate": 0.00016528007157701988, "loss": 0.6011, "step": 1972 }, { "epoch": 0.3014860373610421, "grad_norm": 0.2700873911380768, "learning_rate": 0.0001652422520437221, "loss": 0.7904, "step": 1973 }, { "epoch": 0.3016388432593498, "grad_norm": 0.3336293399333954, "learning_rate": 0.00016520441625586486, "loss": 0.5836, "step": 1974 }, { "epoch": 0.3017916491576575, "grad_norm": 0.37030869722366333, "learning_rate": 0.00016516656422287462, "loss": 0.5687, "step": 1975 }, { "epoch": 0.3019444550559652, "grad_norm": 0.34676826000213623, "learning_rate": 0.00016512869595418196, "loss": 0.6698, "step": 1976 }, { "epoch": 0.30209726095427286, "grad_norm": 0.333841472864151, "learning_rate": 0.00016509081145922144, "loss": 0.8103, "step": 1977 }, { "epoch": 0.3022500668525805, "grad_norm": 0.3339821696281433, "learning_rate": 0.00016505291074743158, "loss": 0.8072, "step": 1978 }, { "epoch": 0.3024028727508882, "grad_norm": 0.29102015495300293, "learning_rate": 0.00016501499382825513, "loss": 0.6759, "step": 1979 }, { "epoch": 0.30255567864919586, "grad_norm": 0.3134000301361084, "learning_rate": 0.00016497706071113866, "loss": 0.7879, "step": 1980 }, { "epoch": 0.30270848454750354, "grad_norm": 0.2835538983345032, "learning_rate": 0.00016493911140553298, "loss": 0.8545, "step": 1981 }, { "epoch": 0.3028612904458112, "grad_norm": 0.30128997564315796, "learning_rate": 0.0001649011459208928, "loss": 0.8597, "step": 1982 }, { "epoch": 0.3030140963441189, "grad_norm": 0.281778484582901, "learning_rate": 0.0001648631642666769, "loss": 0.9106, "step": 1983 }, { "epoch": 0.30316690224242654, "grad_norm": 0.351546049118042, "learning_rate": 0.00016482516645234814, "loss": 0.635, "step": 1984 }, { "epoch": 0.3033197081407342, "grad_norm": 0.2829291522502899, "learning_rate": 0.0001647871524873733, "loss": 0.8733, "step": 1985 }, { "epoch": 0.3034725140390419, "grad_norm": 0.26995211839675903, "learning_rate": 0.00016474912238122324, "loss": 0.7474, "step": 1986 }, { "epoch": 0.3036253199373496, "grad_norm": 0.2779309153556824, "learning_rate": 0.00016471107614337286, "loss": 0.7124, "step": 1987 }, { "epoch": 0.3037781258356573, "grad_norm": 0.31373247504234314, "learning_rate": 0.00016467301378330108, "loss": 0.6688, "step": 1988 }, { "epoch": 0.3039309317339649, "grad_norm": 0.33604127168655396, "learning_rate": 0.00016463493531049077, "loss": 0.7257, "step": 1989 }, { "epoch": 0.3040837376322726, "grad_norm": 0.32262903451919556, "learning_rate": 0.0001645968407344289, "loss": 0.8301, "step": 1990 }, { "epoch": 0.3042365435305803, "grad_norm": 1.0630546808242798, "learning_rate": 0.0001645587300646064, "loss": 0.7924, "step": 1991 }, { "epoch": 0.30438934942888796, "grad_norm": 0.28364071249961853, "learning_rate": 0.00016452060331051822, "loss": 0.5656, "step": 1992 }, { "epoch": 0.30454215532719564, "grad_norm": 0.3063963055610657, "learning_rate": 0.00016448246048166335, "loss": 0.7863, "step": 1993 }, { "epoch": 0.3046949612255033, "grad_norm": 0.3313276171684265, "learning_rate": 0.0001644443015875447, "loss": 0.6872, "step": 1994 }, { "epoch": 0.30484776712381095, "grad_norm": 0.30340656638145447, "learning_rate": 0.0001644061266376693, "loss": 0.6541, "step": 1995 }, { "epoch": 0.30500057302211864, "grad_norm": 0.31530138850212097, "learning_rate": 0.00016436793564154808, "loss": 0.6445, "step": 1996 }, { "epoch": 0.3051533789204263, "grad_norm": 0.2796996533870697, "learning_rate": 0.00016432972860869603, "loss": 0.7765, "step": 1997 }, { "epoch": 0.305306184818734, "grad_norm": 0.28395867347717285, "learning_rate": 0.0001642915055486321, "loss": 0.8339, "step": 1998 }, { "epoch": 0.3054589907170417, "grad_norm": 0.32714176177978516, "learning_rate": 0.0001642532664708792, "loss": 0.7457, "step": 1999 }, { "epoch": 0.3056117966153494, "grad_norm": 0.2937332093715668, "learning_rate": 0.00016421501138496431, "loss": 0.6448, "step": 2000 }, { "epoch": 0.305764602513657, "grad_norm": 0.3177519142627716, "learning_rate": 0.00016417674030041841, "loss": 0.5321, "step": 2001 }, { "epoch": 0.3059174084119647, "grad_norm": 0.3196076452732086, "learning_rate": 0.00016413845322677637, "loss": 0.6613, "step": 2002 }, { "epoch": 0.30607021431027237, "grad_norm": 0.34329482913017273, "learning_rate": 0.00016410015017357708, "loss": 0.6171, "step": 2003 }, { "epoch": 0.30622302020858005, "grad_norm": 0.33029940724372864, "learning_rate": 0.0001640618311503635, "loss": 0.5726, "step": 2004 }, { "epoch": 0.30637582610688774, "grad_norm": 0.2705060839653015, "learning_rate": 0.0001640234961666824, "loss": 0.8022, "step": 2005 }, { "epoch": 0.3065286320051954, "grad_norm": 0.28415077924728394, "learning_rate": 0.00016398514523208467, "loss": 0.7025, "step": 2006 }, { "epoch": 0.30668143790350305, "grad_norm": 0.2916298508644104, "learning_rate": 0.0001639467783561251, "loss": 0.7154, "step": 2007 }, { "epoch": 0.30683424380181074, "grad_norm": 0.2760631740093231, "learning_rate": 0.0001639083955483625, "loss": 0.677, "step": 2008 }, { "epoch": 0.3069870497001184, "grad_norm": 0.2400038093328476, "learning_rate": 0.00016386999681835963, "loss": 0.7028, "step": 2009 }, { "epoch": 0.3071398555984261, "grad_norm": 0.30769845843315125, "learning_rate": 0.00016383158217568315, "loss": 0.6044, "step": 2010 }, { "epoch": 0.3072926614967338, "grad_norm": 0.23859497904777527, "learning_rate": 0.00016379315162990378, "loss": 0.4944, "step": 2011 }, { "epoch": 0.30744546739504147, "grad_norm": 0.30362075567245483, "learning_rate": 0.00016375470519059624, "loss": 0.8197, "step": 2012 }, { "epoch": 0.3075982732933491, "grad_norm": 0.29339346289634705, "learning_rate": 0.000163716242867339, "loss": 0.768, "step": 2013 }, { "epoch": 0.3077510791916568, "grad_norm": 0.24972614645957947, "learning_rate": 0.00016367776466971477, "loss": 0.7026, "step": 2014 }, { "epoch": 0.30790388508996447, "grad_norm": 0.306267648935318, "learning_rate": 0.00016363927060730995, "loss": 0.6663, "step": 2015 }, { "epoch": 0.30805669098827215, "grad_norm": 0.2774108350276947, "learning_rate": 0.0001636007606897151, "loss": 0.6631, "step": 2016 }, { "epoch": 0.30820949688657984, "grad_norm": 0.2911866307258606, "learning_rate": 0.0001635622349265246, "loss": 0.7182, "step": 2017 }, { "epoch": 0.3083623027848875, "grad_norm": 0.2774654030799866, "learning_rate": 0.00016352369332733679, "loss": 0.646, "step": 2018 }, { "epoch": 0.30851510868319515, "grad_norm": 0.28200235962867737, "learning_rate": 0.00016348513590175404, "loss": 0.7008, "step": 2019 }, { "epoch": 0.30866791458150283, "grad_norm": 0.2759782075881958, "learning_rate": 0.00016344656265938258, "loss": 0.8021, "step": 2020 }, { "epoch": 0.3088207204798105, "grad_norm": 0.24658828973770142, "learning_rate": 0.0001634079736098326, "loss": 0.6682, "step": 2021 }, { "epoch": 0.3089735263781182, "grad_norm": 0.2983681857585907, "learning_rate": 0.00016336936876271832, "loss": 0.7827, "step": 2022 }, { "epoch": 0.3091263322764259, "grad_norm": 0.3705412447452545, "learning_rate": 0.00016333074812765772, "loss": 0.9868, "step": 2023 }, { "epoch": 0.30927913817473357, "grad_norm": 0.2668742835521698, "learning_rate": 0.0001632921117142728, "loss": 0.8599, "step": 2024 }, { "epoch": 0.3094319440730412, "grad_norm": 0.30914178490638733, "learning_rate": 0.0001632534595321896, "loss": 0.9663, "step": 2025 }, { "epoch": 0.3095847499713489, "grad_norm": 0.27188578248023987, "learning_rate": 0.00016321479159103788, "loss": 0.6205, "step": 2026 }, { "epoch": 0.30973755586965657, "grad_norm": 0.26725485920906067, "learning_rate": 0.0001631761079004515, "loss": 0.6375, "step": 2027 }, { "epoch": 0.30989036176796425, "grad_norm": 0.3215772807598114, "learning_rate": 0.00016313740847006812, "loss": 0.8451, "step": 2028 }, { "epoch": 0.31004316766627193, "grad_norm": 0.3371334373950958, "learning_rate": 0.00016309869330952945, "loss": 0.6311, "step": 2029 }, { "epoch": 0.3101959735645796, "grad_norm": 0.29730215668678284, "learning_rate": 0.00016305996242848097, "loss": 0.7364, "step": 2030 }, { "epoch": 0.31034877946288725, "grad_norm": 0.32004204392433167, "learning_rate": 0.0001630212158365722, "loss": 0.7113, "step": 2031 }, { "epoch": 0.31050158536119493, "grad_norm": 0.29394999146461487, "learning_rate": 0.00016298245354345655, "loss": 0.5658, "step": 2032 }, { "epoch": 0.3106543912595026, "grad_norm": 0.3030238747596741, "learning_rate": 0.00016294367555879126, "loss": 0.7351, "step": 2033 }, { "epoch": 0.3108071971578103, "grad_norm": 0.2705308794975281, "learning_rate": 0.00016290488189223758, "loss": 0.6108, "step": 2034 }, { "epoch": 0.310960003056118, "grad_norm": 0.31228870153427124, "learning_rate": 0.00016286607255346062, "loss": 0.7637, "step": 2035 }, { "epoch": 0.31111280895442567, "grad_norm": 0.2760096490383148, "learning_rate": 0.0001628272475521294, "loss": 0.6447, "step": 2036 }, { "epoch": 0.3112656148527333, "grad_norm": 0.2895592451095581, "learning_rate": 0.0001627884068979168, "loss": 0.6692, "step": 2037 }, { "epoch": 0.311418420751041, "grad_norm": 0.2655385434627533, "learning_rate": 0.00016274955060049972, "loss": 0.6578, "step": 2038 }, { "epoch": 0.31157122664934866, "grad_norm": 0.30148744583129883, "learning_rate": 0.00016271067866955883, "loss": 0.5564, "step": 2039 }, { "epoch": 0.31172403254765635, "grad_norm": 0.2806140184402466, "learning_rate": 0.00016267179111477878, "loss": 0.7039, "step": 2040 }, { "epoch": 0.31187683844596403, "grad_norm": 0.5120315551757812, "learning_rate": 0.00016263288794584805, "loss": 0.6463, "step": 2041 }, { "epoch": 0.31202964434427166, "grad_norm": 0.30157095193862915, "learning_rate": 0.00016259396917245902, "loss": 0.782, "step": 2042 }, { "epoch": 0.31218245024257935, "grad_norm": 0.6643047332763672, "learning_rate": 0.00016255503480430803, "loss": 0.7354, "step": 2043 }, { "epoch": 0.31233525614088703, "grad_norm": 0.33008846640586853, "learning_rate": 0.0001625160848510952, "loss": 0.7089, "step": 2044 }, { "epoch": 0.3124880620391947, "grad_norm": 0.3063755929470062, "learning_rate": 0.0001624771193225246, "loss": 0.8467, "step": 2045 }, { "epoch": 0.3126408679375024, "grad_norm": 0.33746209740638733, "learning_rate": 0.00016243813822830417, "loss": 0.7556, "step": 2046 }, { "epoch": 0.3127936738358101, "grad_norm": 0.28747060894966125, "learning_rate": 0.00016239914157814572, "loss": 0.8213, "step": 2047 }, { "epoch": 0.3129464797341177, "grad_norm": 0.292519748210907, "learning_rate": 0.00016236012938176497, "loss": 0.7229, "step": 2048 }, { "epoch": 0.3130992856324254, "grad_norm": 0.3621499836444855, "learning_rate": 0.00016232110164888142, "loss": 0.6529, "step": 2049 }, { "epoch": 0.3132520915307331, "grad_norm": 0.31153249740600586, "learning_rate": 0.00016228205838921854, "loss": 0.9509, "step": 2050 }, { "epoch": 0.31340489742904076, "grad_norm": 0.2779485583305359, "learning_rate": 0.00016224299961250363, "loss": 0.8127, "step": 2051 }, { "epoch": 0.31355770332734845, "grad_norm": 0.3095969259738922, "learning_rate": 0.00016220392532846785, "loss": 0.5948, "step": 2052 }, { "epoch": 0.31371050922565613, "grad_norm": 0.2988138496875763, "learning_rate": 0.00016216483554684622, "loss": 0.6803, "step": 2053 }, { "epoch": 0.31386331512396376, "grad_norm": 0.3316000699996948, "learning_rate": 0.00016212573027737763, "loss": 0.7689, "step": 2054 }, { "epoch": 0.31401612102227144, "grad_norm": 0.32596075534820557, "learning_rate": 0.00016208660952980486, "loss": 0.7207, "step": 2055 }, { "epoch": 0.31416892692057913, "grad_norm": 0.29113471508026123, "learning_rate": 0.0001620474733138745, "loss": 0.7664, "step": 2056 }, { "epoch": 0.3143217328188868, "grad_norm": 0.3138737976551056, "learning_rate": 0.000162008321639337, "loss": 0.6088, "step": 2057 }, { "epoch": 0.3144745387171945, "grad_norm": 0.28373363614082336, "learning_rate": 0.00016196915451594665, "loss": 0.7374, "step": 2058 }, { "epoch": 0.3146273446155022, "grad_norm": 0.29363298416137695, "learning_rate": 0.00016192997195346167, "loss": 0.8168, "step": 2059 }, { "epoch": 0.3147801505138098, "grad_norm": 2.7054712772369385, "learning_rate": 0.000161890773961644, "loss": 0.6765, "step": 2060 }, { "epoch": 0.3149329564121175, "grad_norm": 0.29709509015083313, "learning_rate": 0.00016185156055025955, "loss": 0.6439, "step": 2061 }, { "epoch": 0.3150857623104252, "grad_norm": 0.25600048899650574, "learning_rate": 0.00016181233172907797, "loss": 0.6808, "step": 2062 }, { "epoch": 0.31523856820873286, "grad_norm": 0.5635945796966553, "learning_rate": 0.0001617730875078728, "loss": 0.7865, "step": 2063 }, { "epoch": 0.31539137410704055, "grad_norm": 0.6080973744392395, "learning_rate": 0.00016173382789642145, "loss": 0.7357, "step": 2064 }, { "epoch": 0.31554418000534823, "grad_norm": 0.24305948615074158, "learning_rate": 0.00016169455290450507, "loss": 0.5111, "step": 2065 }, { "epoch": 0.31569698590365586, "grad_norm": 0.3057420551776886, "learning_rate": 0.00016165526254190873, "loss": 0.758, "step": 2066 }, { "epoch": 0.31584979180196354, "grad_norm": 1.9893947839736938, "learning_rate": 0.00016161595681842125, "loss": 0.708, "step": 2067 }, { "epoch": 0.3160025977002712, "grad_norm": 0.29663994908332825, "learning_rate": 0.0001615766357438354, "loss": 0.6464, "step": 2068 }, { "epoch": 0.3161554035985789, "grad_norm": 0.3185891807079315, "learning_rate": 0.00016153729932794756, "loss": 0.8377, "step": 2069 }, { "epoch": 0.3163082094968866, "grad_norm": 0.3387928307056427, "learning_rate": 0.0001614979475805582, "loss": 0.6747, "step": 2070 }, { "epoch": 0.3164610153951943, "grad_norm": 0.40630900859832764, "learning_rate": 0.00016145858051147145, "loss": 0.6742, "step": 2071 }, { "epoch": 0.3166138212935019, "grad_norm": 0.2950742840766907, "learning_rate": 0.0001614191981304952, "loss": 0.6839, "step": 2072 }, { "epoch": 0.3167666271918096, "grad_norm": 0.3646473288536072, "learning_rate": 0.00016137980044744136, "loss": 0.8953, "step": 2073 }, { "epoch": 0.3169194330901173, "grad_norm": 0.647003710269928, "learning_rate": 0.00016134038747212545, "loss": 0.5832, "step": 2074 }, { "epoch": 0.31707223898842496, "grad_norm": 0.44197747111320496, "learning_rate": 0.00016130095921436692, "loss": 0.8293, "step": 2075 }, { "epoch": 0.31722504488673264, "grad_norm": 0.33136236667633057, "learning_rate": 0.00016126151568398897, "loss": 0.7455, "step": 2076 }, { "epoch": 0.3173778507850403, "grad_norm": 0.2798633277416229, "learning_rate": 0.00016122205689081864, "loss": 0.6635, "step": 2077 }, { "epoch": 0.31753065668334796, "grad_norm": 0.34174054861068726, "learning_rate": 0.00016118258284468671, "loss": 0.6709, "step": 2078 }, { "epoch": 0.31768346258165564, "grad_norm": 0.31651896238327026, "learning_rate": 0.0001611430935554279, "loss": 0.8362, "step": 2079 }, { "epoch": 0.3178362684799633, "grad_norm": 0.3442460000514984, "learning_rate": 0.00016110358903288056, "loss": 0.9762, "step": 2080 }, { "epoch": 0.317989074378271, "grad_norm": 0.270297646522522, "learning_rate": 0.00016106406928688693, "loss": 0.5487, "step": 2081 }, { "epoch": 0.3181418802765787, "grad_norm": 0.312498539686203, "learning_rate": 0.000161024534327293, "loss": 0.7125, "step": 2082 }, { "epoch": 0.3182946861748864, "grad_norm": 0.27466461062431335, "learning_rate": 0.00016098498416394864, "loss": 0.7155, "step": 2083 }, { "epoch": 0.318447492073194, "grad_norm": 0.3596421480178833, "learning_rate": 0.0001609454188067074, "loss": 0.6314, "step": 2084 }, { "epoch": 0.3186002979715017, "grad_norm": 0.36655640602111816, "learning_rate": 0.0001609058382654266, "loss": 0.6903, "step": 2085 }, { "epoch": 0.3187531038698094, "grad_norm": 0.37121638655662537, "learning_rate": 0.00016086624254996748, "loss": 0.6563, "step": 2086 }, { "epoch": 0.31890590976811706, "grad_norm": 0.2979934811592102, "learning_rate": 0.000160826631670195, "loss": 0.5967, "step": 2087 }, { "epoch": 0.31905871566642474, "grad_norm": 0.2676079273223877, "learning_rate": 0.00016078700563597776, "loss": 0.4784, "step": 2088 }, { "epoch": 0.3192115215647324, "grad_norm": 0.2784518897533417, "learning_rate": 0.0001607473644571884, "loss": 0.654, "step": 2089 }, { "epoch": 0.31936432746304005, "grad_norm": 0.3202001750469208, "learning_rate": 0.00016070770814370305, "loss": 0.7928, "step": 2090 }, { "epoch": 0.31951713336134774, "grad_norm": 0.39485278725624084, "learning_rate": 0.00016066803670540183, "loss": 0.6701, "step": 2091 }, { "epoch": 0.3196699392596554, "grad_norm": 0.37572166323661804, "learning_rate": 0.00016062835015216855, "loss": 0.7101, "step": 2092 }, { "epoch": 0.3198227451579631, "grad_norm": 0.6303053498268127, "learning_rate": 0.00016058864849389075, "loss": 0.8098, "step": 2093 }, { "epoch": 0.3199755510562708, "grad_norm": 0.3596165180206299, "learning_rate": 0.00016054893174045974, "loss": 0.6311, "step": 2094 }, { "epoch": 0.3201283569545784, "grad_norm": 0.2687673270702362, "learning_rate": 0.00016050919990177068, "loss": 0.626, "step": 2095 }, { "epoch": 0.3202811628528861, "grad_norm": 0.25072038173675537, "learning_rate": 0.0001604694529877224, "loss": 0.768, "step": 2096 }, { "epoch": 0.3204339687511938, "grad_norm": 0.2828698754310608, "learning_rate": 0.0001604296910082175, "loss": 0.6626, "step": 2097 }, { "epoch": 0.32058677464950147, "grad_norm": 0.4138115346431732, "learning_rate": 0.00016038991397316233, "loss": 0.8001, "step": 2098 }, { "epoch": 0.32073958054780916, "grad_norm": 0.29085302352905273, "learning_rate": 0.000160350121892467, "loss": 0.7274, "step": 2099 }, { "epoch": 0.32089238644611684, "grad_norm": 0.2617502808570862, "learning_rate": 0.00016031031477604547, "loss": 0.6377, "step": 2100 }, { "epoch": 0.32104519234442447, "grad_norm": 0.3535154461860657, "learning_rate": 0.0001602704926338152, "loss": 0.9398, "step": 2101 }, { "epoch": 0.32119799824273215, "grad_norm": 0.3721776306629181, "learning_rate": 0.00016023065547569765, "loss": 0.8525, "step": 2102 }, { "epoch": 0.32135080414103984, "grad_norm": 0.27641820907592773, "learning_rate": 0.00016019080331161788, "loss": 0.8148, "step": 2103 }, { "epoch": 0.3215036100393475, "grad_norm": 0.3367394506931305, "learning_rate": 0.00016015093615150472, "loss": 0.7703, "step": 2104 }, { "epoch": 0.3216564159376552, "grad_norm": 0.3287603557109833, "learning_rate": 0.00016011105400529072, "loss": 0.7462, "step": 2105 }, { "epoch": 0.3218092218359629, "grad_norm": 0.31794461607933044, "learning_rate": 0.0001600711568829122, "loss": 0.6779, "step": 2106 }, { "epoch": 0.3219620277342705, "grad_norm": 0.2856120765209198, "learning_rate": 0.0001600312447943092, "loss": 0.5557, "step": 2107 }, { "epoch": 0.3221148336325782, "grad_norm": 0.34538280963897705, "learning_rate": 0.00015999131774942552, "loss": 0.746, "step": 2108 }, { "epoch": 0.3222676395308859, "grad_norm": 0.30335336923599243, "learning_rate": 0.00015995137575820857, "loss": 0.8004, "step": 2109 }, { "epoch": 0.32242044542919357, "grad_norm": 0.31408512592315674, "learning_rate": 0.0001599114188306096, "loss": 0.7996, "step": 2110 }, { "epoch": 0.32257325132750125, "grad_norm": 0.40797099471092224, "learning_rate": 0.00015987144697658353, "loss": 0.599, "step": 2111 }, { "epoch": 0.32272605722580894, "grad_norm": 0.29327741265296936, "learning_rate": 0.00015983146020608904, "loss": 0.5498, "step": 2112 }, { "epoch": 0.32287886312411657, "grad_norm": 0.31773462891578674, "learning_rate": 0.00015979145852908845, "loss": 0.6583, "step": 2113 }, { "epoch": 0.32303166902242425, "grad_norm": 0.2868436574935913, "learning_rate": 0.00015975144195554786, "loss": 0.5934, "step": 2114 }, { "epoch": 0.32318447492073193, "grad_norm": 0.25718802213668823, "learning_rate": 0.0001597114104954371, "loss": 0.7702, "step": 2115 }, { "epoch": 0.3233372808190396, "grad_norm": 0.3285646140575409, "learning_rate": 0.00015967136415872968, "loss": 0.6344, "step": 2116 }, { "epoch": 0.3234900867173473, "grad_norm": 0.342434823513031, "learning_rate": 0.00015963130295540274, "loss": 0.6717, "step": 2117 }, { "epoch": 0.323642892615655, "grad_norm": 0.31285926699638367, "learning_rate": 0.00015959122689543725, "loss": 0.8469, "step": 2118 }, { "epoch": 0.3237956985139626, "grad_norm": 0.3020860552787781, "learning_rate": 0.00015955113598881777, "loss": 0.5288, "step": 2119 }, { "epoch": 0.3239485044122703, "grad_norm": 0.28416410088539124, "learning_rate": 0.00015951103024553268, "loss": 0.6605, "step": 2120 }, { "epoch": 0.324101310310578, "grad_norm": 0.46280670166015625, "learning_rate": 0.00015947090967557393, "loss": 0.6801, "step": 2121 }, { "epoch": 0.32425411620888567, "grad_norm": 0.3016008138656616, "learning_rate": 0.00015943077428893726, "loss": 0.758, "step": 2122 }, { "epoch": 0.32440692210719335, "grad_norm": 0.33130350708961487, "learning_rate": 0.00015939062409562203, "loss": 0.5521, "step": 2123 }, { "epoch": 0.32455972800550104, "grad_norm": 0.2970220744609833, "learning_rate": 0.00015935045910563136, "loss": 0.8987, "step": 2124 }, { "epoch": 0.32471253390380866, "grad_norm": 0.2839277386665344, "learning_rate": 0.000159310279328972, "loss": 0.647, "step": 2125 }, { "epoch": 0.32486533980211635, "grad_norm": 0.7329890131950378, "learning_rate": 0.00015927008477565444, "loss": 0.7763, "step": 2126 }, { "epoch": 0.32501814570042403, "grad_norm": 0.4290359318256378, "learning_rate": 0.00015922987545569274, "loss": 0.7703, "step": 2127 }, { "epoch": 0.3251709515987317, "grad_norm": 0.29252350330352783, "learning_rate": 0.0001591896513791048, "loss": 0.823, "step": 2128 }, { "epoch": 0.3253237574970394, "grad_norm": 0.8785410523414612, "learning_rate": 0.00015914941255591204, "loss": 0.7813, "step": 2129 }, { "epoch": 0.3254765633953471, "grad_norm": 0.29600057005882263, "learning_rate": 0.00015910915899613968, "loss": 0.7444, "step": 2130 }, { "epoch": 0.3256293692936547, "grad_norm": 0.3276137709617615, "learning_rate": 0.0001590688907098165, "loss": 0.6706, "step": 2131 }, { "epoch": 0.3257821751919624, "grad_norm": 0.31205666065216064, "learning_rate": 0.00015902860770697507, "loss": 0.6286, "step": 2132 }, { "epoch": 0.3259349810902701, "grad_norm": 0.27540236711502075, "learning_rate": 0.0001589883099976515, "loss": 0.771, "step": 2133 }, { "epoch": 0.32608778698857777, "grad_norm": 0.2716180086135864, "learning_rate": 0.00015894799759188572, "loss": 0.7021, "step": 2134 }, { "epoch": 0.32624059288688545, "grad_norm": 0.25274068117141724, "learning_rate": 0.00015890767049972114, "loss": 0.6938, "step": 2135 }, { "epoch": 0.32639339878519313, "grad_norm": 0.3524169921875, "learning_rate": 0.0001588673287312049, "loss": 0.7538, "step": 2136 }, { "epoch": 0.32654620468350076, "grad_norm": 0.384371817111969, "learning_rate": 0.00015882697229638787, "loss": 0.5032, "step": 2137 }, { "epoch": 0.32669901058180845, "grad_norm": 0.4573408365249634, "learning_rate": 0.00015878660120532452, "loss": 0.7039, "step": 2138 }, { "epoch": 0.32685181648011613, "grad_norm": 0.30394843220710754, "learning_rate": 0.0001587462154680729, "loss": 0.8154, "step": 2139 }, { "epoch": 0.3270046223784238, "grad_norm": 0.2765500247478485, "learning_rate": 0.00015870581509469487, "loss": 0.7432, "step": 2140 }, { "epoch": 0.3271574282767315, "grad_norm": 0.29486072063446045, "learning_rate": 0.0001586654000952558, "loss": 0.6089, "step": 2141 }, { "epoch": 0.3273102341750392, "grad_norm": 0.25128594040870667, "learning_rate": 0.00015862497047982473, "loss": 0.6048, "step": 2142 }, { "epoch": 0.3274630400733468, "grad_norm": 0.3318636417388916, "learning_rate": 0.0001585845262584744, "loss": 0.8185, "step": 2143 }, { "epoch": 0.3276158459716545, "grad_norm": 0.3293468654155731, "learning_rate": 0.00015854406744128112, "loss": 0.7598, "step": 2144 }, { "epoch": 0.3277686518699622, "grad_norm": 0.312021404504776, "learning_rate": 0.00015850359403832485, "loss": 0.6752, "step": 2145 }, { "epoch": 0.32792145776826986, "grad_norm": 0.46644726395606995, "learning_rate": 0.00015846310605968923, "loss": 0.7358, "step": 2146 }, { "epoch": 0.32807426366657755, "grad_norm": 0.3361137807369232, "learning_rate": 0.0001584226035154615, "loss": 0.7287, "step": 2147 }, { "epoch": 0.32822706956488523, "grad_norm": 0.30001696944236755, "learning_rate": 0.00015838208641573252, "loss": 0.9108, "step": 2148 }, { "epoch": 0.32837987546319286, "grad_norm": 0.2829294502735138, "learning_rate": 0.00015834155477059672, "loss": 0.6461, "step": 2149 }, { "epoch": 0.32853268136150054, "grad_norm": 0.283859521150589, "learning_rate": 0.00015830100859015237, "loss": 0.8114, "step": 2150 }, { "epoch": 0.32868548725980823, "grad_norm": 0.2840181291103363, "learning_rate": 0.0001582604478845011, "loss": 0.7424, "step": 2151 }, { "epoch": 0.3288382931581159, "grad_norm": 0.2813766896724701, "learning_rate": 0.00015821987266374828, "loss": 0.7707, "step": 2152 }, { "epoch": 0.3289910990564236, "grad_norm": 0.3396928608417511, "learning_rate": 0.00015817928293800288, "loss": 0.8722, "step": 2153 }, { "epoch": 0.3291439049547312, "grad_norm": 0.2577609717845917, "learning_rate": 0.00015813867871737752, "loss": 0.662, "step": 2154 }, { "epoch": 0.3292967108530389, "grad_norm": 0.28058573603630066, "learning_rate": 0.0001580980600119884, "loss": 0.6872, "step": 2155 }, { "epoch": 0.3294495167513466, "grad_norm": 0.2761460244655609, "learning_rate": 0.00015805742683195527, "loss": 0.7247, "step": 2156 }, { "epoch": 0.3296023226496543, "grad_norm": 0.8709086179733276, "learning_rate": 0.00015801677918740167, "loss": 0.5721, "step": 2157 }, { "epoch": 0.32975512854796196, "grad_norm": 0.2942737340927124, "learning_rate": 0.00015797611708845449, "loss": 0.5378, "step": 2158 }, { "epoch": 0.32990793444626965, "grad_norm": 0.33184701204299927, "learning_rate": 0.0001579354405452444, "loss": 0.8383, "step": 2159 }, { "epoch": 0.3300607403445773, "grad_norm": 0.31365641951560974, "learning_rate": 0.00015789474956790563, "loss": 0.6231, "step": 2160 }, { "epoch": 0.33021354624288496, "grad_norm": 0.3012298047542572, "learning_rate": 0.00015785404416657602, "loss": 0.76, "step": 2161 }, { "epoch": 0.33036635214119264, "grad_norm": 0.38045307993888855, "learning_rate": 0.00015781332435139693, "loss": 0.9937, "step": 2162 }, { "epoch": 0.3305191580395003, "grad_norm": 0.3367868661880493, "learning_rate": 0.00015777259013251334, "loss": 0.8202, "step": 2163 }, { "epoch": 0.330671963937808, "grad_norm": 0.2767188847064972, "learning_rate": 0.00015773184152007393, "loss": 0.7562, "step": 2164 }, { "epoch": 0.3308247698361157, "grad_norm": 0.296550452709198, "learning_rate": 0.0001576910785242308, "loss": 0.8002, "step": 2165 }, { "epoch": 0.3309775757344233, "grad_norm": 0.288141667842865, "learning_rate": 0.0001576503011551397, "loss": 0.5228, "step": 2166 }, { "epoch": 0.331130381632731, "grad_norm": 0.3159697651863098, "learning_rate": 0.00015760950942296002, "loss": 0.6596, "step": 2167 }, { "epoch": 0.3312831875310387, "grad_norm": 0.42363399267196655, "learning_rate": 0.00015756870333785464, "loss": 0.9706, "step": 2168 }, { "epoch": 0.3314359934293464, "grad_norm": 0.37459617853164673, "learning_rate": 0.00015752788290999013, "loss": 0.648, "step": 2169 }, { "epoch": 0.33158879932765406, "grad_norm": 0.30844661593437195, "learning_rate": 0.00015748704814953643, "loss": 0.7611, "step": 2170 }, { "epoch": 0.33174160522596174, "grad_norm": 0.2618614733219147, "learning_rate": 0.00015744619906666725, "loss": 0.6118, "step": 2171 }, { "epoch": 0.3318944111242694, "grad_norm": 0.6982774138450623, "learning_rate": 0.0001574053356715598, "loss": 0.6533, "step": 2172 }, { "epoch": 0.33204721702257706, "grad_norm": 0.29015034437179565, "learning_rate": 0.00015736445797439488, "loss": 0.6744, "step": 2173 }, { "epoch": 0.33220002292088474, "grad_norm": 0.38911595940589905, "learning_rate": 0.00015732356598535676, "loss": 0.6925, "step": 2174 }, { "epoch": 0.3323528288191924, "grad_norm": 0.4622102677822113, "learning_rate": 0.00015728265971463333, "loss": 0.7888, "step": 2175 }, { "epoch": 0.3325056347175001, "grad_norm": 0.27185773849487305, "learning_rate": 0.00015724173917241614, "loss": 0.5808, "step": 2176 }, { "epoch": 0.3326584406158078, "grad_norm": 0.3064304292201996, "learning_rate": 0.00015720080436890007, "loss": 0.8677, "step": 2177 }, { "epoch": 0.3328112465141154, "grad_norm": 0.2787809669971466, "learning_rate": 0.00015715985531428379, "loss": 0.7143, "step": 2178 }, { "epoch": 0.3329640524124231, "grad_norm": 0.48594728112220764, "learning_rate": 0.00015711889201876935, "loss": 0.773, "step": 2179 }, { "epoch": 0.3331168583107308, "grad_norm": 0.3239424228668213, "learning_rate": 0.00015707791449256247, "loss": 0.5903, "step": 2180 }, { "epoch": 0.3332696642090385, "grad_norm": 0.26795390248298645, "learning_rate": 0.0001570369227458723, "loss": 0.7426, "step": 2181 }, { "epoch": 0.33342247010734616, "grad_norm": 0.34255295991897583, "learning_rate": 0.0001569959167889116, "loss": 0.7992, "step": 2182 }, { "epoch": 0.33342247010734616, "eval_loss": 0.7136940956115723, "eval_runtime": 1441.2558, "eval_samples_per_second": 7.738, "eval_steps_per_second": 3.869, "step": 2182 }, { "epoch": 0.33357527600565384, "grad_norm": 0.2938944697380066, "learning_rate": 0.00015695489663189666, "loss": 0.6712, "step": 2183 }, { "epoch": 0.33372808190396147, "grad_norm": 0.28934624791145325, "learning_rate": 0.00015691386228504733, "loss": 0.797, "step": 2184 }, { "epoch": 0.33388088780226916, "grad_norm": 0.2854679226875305, "learning_rate": 0.00015687281375858695, "loss": 0.6246, "step": 2185 }, { "epoch": 0.33403369370057684, "grad_norm": 0.3314021825790405, "learning_rate": 0.00015683175106274242, "loss": 0.5735, "step": 2186 }, { "epoch": 0.3341864995988845, "grad_norm": 0.2750674784183502, "learning_rate": 0.00015679067420774423, "loss": 0.6508, "step": 2187 }, { "epoch": 0.3343393054971922, "grad_norm": 0.3193671405315399, "learning_rate": 0.00015674958320382624, "loss": 0.5197, "step": 2188 }, { "epoch": 0.3344921113954999, "grad_norm": 0.3214784264564514, "learning_rate": 0.00015670847806122597, "loss": 0.5785, "step": 2189 }, { "epoch": 0.3346449172938075, "grad_norm": 0.27920016646385193, "learning_rate": 0.0001566673587901844, "loss": 0.5692, "step": 2190 }, { "epoch": 0.3347977231921152, "grad_norm": 0.43938395380973816, "learning_rate": 0.00015662622540094608, "loss": 0.7549, "step": 2191 }, { "epoch": 0.3349505290904229, "grad_norm": 0.30526235699653625, "learning_rate": 0.00015658507790375904, "loss": 0.94, "step": 2192 }, { "epoch": 0.3351033349887306, "grad_norm": 0.33049049973487854, "learning_rate": 0.0001565439163088748, "loss": 0.7732, "step": 2193 }, { "epoch": 0.33525614088703826, "grad_norm": 0.29554682970046997, "learning_rate": 0.00015650274062654847, "loss": 0.6675, "step": 2194 }, { "epoch": 0.33540894678534594, "grad_norm": 0.2941046357154846, "learning_rate": 0.0001564615508670386, "loss": 0.7829, "step": 2195 }, { "epoch": 0.33556175268365357, "grad_norm": 0.3211367726325989, "learning_rate": 0.00015642034704060732, "loss": 0.5786, "step": 2196 }, { "epoch": 0.33571455858196125, "grad_norm": 0.27026689052581787, "learning_rate": 0.00015637912915752016, "loss": 0.6511, "step": 2197 }, { "epoch": 0.33586736448026894, "grad_norm": 0.31031954288482666, "learning_rate": 0.00015633789722804622, "loss": 0.7701, "step": 2198 }, { "epoch": 0.3360201703785766, "grad_norm": 0.342227578163147, "learning_rate": 0.00015629665126245813, "loss": 0.6661, "step": 2199 }, { "epoch": 0.3361729762768843, "grad_norm": 0.3071631193161011, "learning_rate": 0.0001562553912710319, "loss": 0.6731, "step": 2200 }, { "epoch": 0.336325782175192, "grad_norm": 0.26992887258529663, "learning_rate": 0.00015621411726404717, "loss": 0.7173, "step": 2201 }, { "epoch": 0.3364785880734996, "grad_norm": 0.3526805639266968, "learning_rate": 0.00015617282925178705, "loss": 0.6753, "step": 2202 }, { "epoch": 0.3366313939718073, "grad_norm": 0.30212274193763733, "learning_rate": 0.00015613152724453799, "loss": 0.715, "step": 2203 }, { "epoch": 0.336784199870115, "grad_norm": 0.34919580817222595, "learning_rate": 0.0001560902112525901, "loss": 0.6164, "step": 2204 }, { "epoch": 0.33693700576842267, "grad_norm": 0.2764431834220886, "learning_rate": 0.00015604888128623693, "loss": 0.6118, "step": 2205 }, { "epoch": 0.33708981166673035, "grad_norm": 0.3970886766910553, "learning_rate": 0.0001560075373557755, "loss": 0.6037, "step": 2206 }, { "epoch": 0.337242617565038, "grad_norm": 0.29963481426239014, "learning_rate": 0.00015596617947150624, "loss": 0.5707, "step": 2207 }, { "epoch": 0.33739542346334567, "grad_norm": 0.3079460561275482, "learning_rate": 0.0001559248076437332, "loss": 0.8306, "step": 2208 }, { "epoch": 0.33754822936165335, "grad_norm": 0.3257281482219696, "learning_rate": 0.00015588342188276375, "loss": 0.5394, "step": 2209 }, { "epoch": 0.33770103525996104, "grad_norm": 0.4615156650543213, "learning_rate": 0.00015584202219890884, "loss": 0.7179, "step": 2210 }, { "epoch": 0.3378538411582687, "grad_norm": 0.33638259768486023, "learning_rate": 0.00015580060860248286, "loss": 0.6865, "step": 2211 }, { "epoch": 0.3380066470565764, "grad_norm": 0.3506909906864166, "learning_rate": 0.00015575918110380364, "loss": 0.6989, "step": 2212 }, { "epoch": 0.33815945295488403, "grad_norm": 0.3745541572570801, "learning_rate": 0.00015571773971319251, "loss": 0.8131, "step": 2213 }, { "epoch": 0.3383122588531917, "grad_norm": 0.31607136130332947, "learning_rate": 0.0001556762844409742, "loss": 0.8365, "step": 2214 }, { "epoch": 0.3384650647514994, "grad_norm": 0.33056318759918213, "learning_rate": 0.00015563481529747705, "loss": 0.5826, "step": 2215 }, { "epoch": 0.3386178706498071, "grad_norm": 0.3306300938129425, "learning_rate": 0.00015559333229303262, "loss": 0.7303, "step": 2216 }, { "epoch": 0.33877067654811477, "grad_norm": 0.24888025224208832, "learning_rate": 0.00015555183543797618, "loss": 0.5677, "step": 2217 }, { "epoch": 0.33892348244642245, "grad_norm": 0.3338901400566101, "learning_rate": 0.0001555103247426462, "loss": 0.6068, "step": 2218 }, { "epoch": 0.3390762883447301, "grad_norm": 0.26496437191963196, "learning_rate": 0.00015546880021738478, "loss": 0.6084, "step": 2219 }, { "epoch": 0.33922909424303777, "grad_norm": 0.3822322189807892, "learning_rate": 0.00015542726187253744, "loss": 0.7601, "step": 2220 }, { "epoch": 0.33938190014134545, "grad_norm": 0.3385266661643982, "learning_rate": 0.00015538570971845305, "loss": 0.5632, "step": 2221 }, { "epoch": 0.33953470603965313, "grad_norm": 0.2914586365222931, "learning_rate": 0.00015534414376548402, "loss": 0.7443, "step": 2222 }, { "epoch": 0.3396875119379608, "grad_norm": 0.2860872745513916, "learning_rate": 0.0001553025640239861, "loss": 0.6005, "step": 2223 }, { "epoch": 0.3398403178362685, "grad_norm": 0.2960110604763031, "learning_rate": 0.00015526097050431865, "loss": 0.7422, "step": 2224 }, { "epoch": 0.33999312373457613, "grad_norm": 0.2951801121234894, "learning_rate": 0.0001552193632168442, "loss": 0.7805, "step": 2225 }, { "epoch": 0.3401459296328838, "grad_norm": 0.5373976230621338, "learning_rate": 0.00015517774217192897, "loss": 0.7439, "step": 2226 }, { "epoch": 0.3402987355311915, "grad_norm": 0.296344131231308, "learning_rate": 0.00015513610737994245, "loss": 0.6432, "step": 2227 }, { "epoch": 0.3404515414294992, "grad_norm": 0.26670217514038086, "learning_rate": 0.0001550944588512576, "loss": 0.6878, "step": 2228 }, { "epoch": 0.34060434732780687, "grad_norm": 0.3236304223537445, "learning_rate": 0.0001550527965962508, "loss": 0.5546, "step": 2229 }, { "epoch": 0.34075715322611455, "grad_norm": 0.3119784891605377, "learning_rate": 0.00015501112062530186, "loss": 0.6956, "step": 2230 }, { "epoch": 0.3409099591244222, "grad_norm": 0.47150805592536926, "learning_rate": 0.00015496943094879398, "loss": 0.785, "step": 2231 }, { "epoch": 0.34106276502272986, "grad_norm": 0.6498871445655823, "learning_rate": 0.0001549277275771138, "loss": 0.6983, "step": 2232 }, { "epoch": 0.34121557092103755, "grad_norm": 0.33664408326148987, "learning_rate": 0.0001548860105206514, "loss": 0.5466, "step": 2233 }, { "epoch": 0.34136837681934523, "grad_norm": 0.2958558201789856, "learning_rate": 0.00015484427978980017, "loss": 0.804, "step": 2234 }, { "epoch": 0.3415211827176529, "grad_norm": 0.2821539044380188, "learning_rate": 0.00015480253539495707, "loss": 0.6465, "step": 2235 }, { "epoch": 0.3416739886159606, "grad_norm": 0.30043548345565796, "learning_rate": 0.00015476077734652224, "loss": 0.6388, "step": 2236 }, { "epoch": 0.34182679451426823, "grad_norm": 0.3065933287143707, "learning_rate": 0.0001547190056548994, "loss": 0.6553, "step": 2237 }, { "epoch": 0.3419796004125759, "grad_norm": 0.29310041666030884, "learning_rate": 0.00015467722033049567, "loss": 0.7219, "step": 2238 }, { "epoch": 0.3421324063108836, "grad_norm": 0.3400419354438782, "learning_rate": 0.00015463542138372148, "loss": 0.7735, "step": 2239 }, { "epoch": 0.3422852122091913, "grad_norm": 0.33613109588623047, "learning_rate": 0.00015459360882499063, "loss": 0.7178, "step": 2240 }, { "epoch": 0.34243801810749896, "grad_norm": 0.26561689376831055, "learning_rate": 0.00015455178266472045, "loss": 0.4622, "step": 2241 }, { "epoch": 0.34259082400580665, "grad_norm": 0.3775576055049896, "learning_rate": 0.00015450994291333153, "loss": 0.7419, "step": 2242 }, { "epoch": 0.3427436299041143, "grad_norm": 3.781869649887085, "learning_rate": 0.00015446808958124785, "loss": 0.9276, "step": 2243 }, { "epoch": 0.34289643580242196, "grad_norm": 0.389053612947464, "learning_rate": 0.00015442622267889693, "loss": 0.8774, "step": 2244 }, { "epoch": 0.34304924170072965, "grad_norm": 0.2652193307876587, "learning_rate": 0.0001543843422167095, "loss": 0.737, "step": 2245 }, { "epoch": 0.34320204759903733, "grad_norm": 0.3126509487628937, "learning_rate": 0.00015434244820511966, "loss": 0.683, "step": 2246 }, { "epoch": 0.343354853497345, "grad_norm": 0.30898094177246094, "learning_rate": 0.00015430054065456507, "loss": 0.7826, "step": 2247 }, { "epoch": 0.3435076593956527, "grad_norm": 0.2741771340370178, "learning_rate": 0.00015425861957548656, "loss": 0.7594, "step": 2248 }, { "epoch": 0.3436604652939603, "grad_norm": 0.3694680333137512, "learning_rate": 0.00015421668497832847, "loss": 0.6474, "step": 2249 }, { "epoch": 0.343813271192268, "grad_norm": 0.36894744634628296, "learning_rate": 0.0001541747368735384, "loss": 0.6786, "step": 2250 }, { "epoch": 0.3439660770905757, "grad_norm": 0.3785475790500641, "learning_rate": 0.00015413277527156742, "loss": 0.4514, "step": 2251 }, { "epoch": 0.3441188829888834, "grad_norm": 0.3092028498649597, "learning_rate": 0.00015409080018286987, "loss": 0.7509, "step": 2252 }, { "epoch": 0.34427168888719106, "grad_norm": 0.31305885314941406, "learning_rate": 0.00015404881161790353, "loss": 0.6581, "step": 2253 }, { "epoch": 0.34442449478549875, "grad_norm": 0.2979021668434143, "learning_rate": 0.00015400680958712942, "loss": 0.5952, "step": 2254 }, { "epoch": 0.3445773006838064, "grad_norm": 0.3245038390159607, "learning_rate": 0.00015396479410101208, "loss": 0.6446, "step": 2255 }, { "epoch": 0.34473010658211406, "grad_norm": 0.35698649287223816, "learning_rate": 0.0001539227651700193, "loss": 0.8561, "step": 2256 }, { "epoch": 0.34488291248042174, "grad_norm": 0.25988495349884033, "learning_rate": 0.00015388072280462218, "loss": 0.537, "step": 2257 }, { "epoch": 0.34503571837872943, "grad_norm": 0.2652510702610016, "learning_rate": 0.0001538386670152953, "loss": 0.6016, "step": 2258 }, { "epoch": 0.3451885242770371, "grad_norm": 0.38364800810813904, "learning_rate": 0.00015379659781251644, "loss": 0.601, "step": 2259 }, { "epoch": 0.34534133017534474, "grad_norm": 0.29123881459236145, "learning_rate": 0.00015375451520676685, "loss": 0.6864, "step": 2260 }, { "epoch": 0.3454941360736524, "grad_norm": 0.37606048583984375, "learning_rate": 0.000153712419208531, "loss": 0.7216, "step": 2261 }, { "epoch": 0.3456469419719601, "grad_norm": 0.30718401074409485, "learning_rate": 0.00015367030982829676, "loss": 0.7234, "step": 2262 }, { "epoch": 0.3457997478702678, "grad_norm": 0.34343576431274414, "learning_rate": 0.00015362818707655536, "loss": 0.7448, "step": 2263 }, { "epoch": 0.3459525537685755, "grad_norm": 0.30725371837615967, "learning_rate": 0.0001535860509638013, "loss": 0.7892, "step": 2264 }, { "epoch": 0.34610535966688316, "grad_norm": 0.28746816515922546, "learning_rate": 0.00015354390150053253, "loss": 0.6234, "step": 2265 }, { "epoch": 0.3462581655651908, "grad_norm": 0.35895246267318726, "learning_rate": 0.0001535017386972501, "loss": 0.7443, "step": 2266 }, { "epoch": 0.3464109714634985, "grad_norm": 0.2841184139251709, "learning_rate": 0.00015345956256445858, "loss": 0.6936, "step": 2267 }, { "epoch": 0.34656377736180616, "grad_norm": 0.2917341887950897, "learning_rate": 0.00015341737311266583, "loss": 0.7372, "step": 2268 }, { "epoch": 0.34671658326011384, "grad_norm": 0.3071459233760834, "learning_rate": 0.00015337517035238294, "loss": 0.6283, "step": 2269 }, { "epoch": 0.3468693891584215, "grad_norm": 0.2792901396751404, "learning_rate": 0.0001533329542941244, "loss": 0.5536, "step": 2270 }, { "epoch": 0.3470221950567292, "grad_norm": 0.2752489447593689, "learning_rate": 0.00015329072494840804, "loss": 0.7074, "step": 2271 }, { "epoch": 0.34717500095503684, "grad_norm": 0.28680381178855896, "learning_rate": 0.00015324848232575484, "loss": 0.7837, "step": 2272 }, { "epoch": 0.3473278068533445, "grad_norm": 0.31360378861427307, "learning_rate": 0.00015320622643668927, "loss": 0.7676, "step": 2273 }, { "epoch": 0.3474806127516522, "grad_norm": 0.29546040296554565, "learning_rate": 0.00015316395729173899, "loss": 0.606, "step": 2274 }, { "epoch": 0.3476334186499599, "grad_norm": 1.059844732284546, "learning_rate": 0.00015312167490143502, "loss": 0.5151, "step": 2275 }, { "epoch": 0.3477862245482676, "grad_norm": 0.29025423526763916, "learning_rate": 0.0001530793792763117, "loss": 0.5859, "step": 2276 }, { "epoch": 0.34793903044657526, "grad_norm": 0.33331283926963806, "learning_rate": 0.0001530370704269066, "loss": 0.6959, "step": 2277 }, { "epoch": 0.3480918363448829, "grad_norm": 0.29462912678718567, "learning_rate": 0.00015299474836376055, "loss": 0.643, "step": 2278 }, { "epoch": 0.34824464224319057, "grad_norm": 0.28086116909980774, "learning_rate": 0.00015295241309741783, "loss": 0.6262, "step": 2279 }, { "epoch": 0.34839744814149826, "grad_norm": 0.3096199333667755, "learning_rate": 0.00015291006463842588, "loss": 0.7098, "step": 2280 }, { "epoch": 0.34855025403980594, "grad_norm": 0.29386383295059204, "learning_rate": 0.00015286770299733547, "loss": 0.5968, "step": 2281 }, { "epoch": 0.3487030599381136, "grad_norm": 0.27785131335258484, "learning_rate": 0.00015282532818470065, "loss": 0.6851, "step": 2282 }, { "epoch": 0.3488558658364213, "grad_norm": 0.2330974042415619, "learning_rate": 0.0001527829402110787, "loss": 0.6592, "step": 2283 }, { "epoch": 0.34900867173472894, "grad_norm": 0.291621595621109, "learning_rate": 0.00015274053908703034, "loss": 0.7363, "step": 2284 }, { "epoch": 0.3491614776330366, "grad_norm": 0.29270750284194946, "learning_rate": 0.0001526981248231193, "loss": 0.5755, "step": 2285 }, { "epoch": 0.3493142835313443, "grad_norm": 0.31941109895706177, "learning_rate": 0.00015265569742991292, "loss": 0.5933, "step": 2286 }, { "epoch": 0.349467089429652, "grad_norm": 0.3711247146129608, "learning_rate": 0.00015261325691798145, "loss": 0.833, "step": 2287 }, { "epoch": 0.3496198953279597, "grad_norm": 0.28365087509155273, "learning_rate": 0.0001525708032978987, "loss": 0.8233, "step": 2288 }, { "epoch": 0.34977270122626736, "grad_norm": 0.33713802695274353, "learning_rate": 0.00015252833658024157, "loss": 0.698, "step": 2289 }, { "epoch": 0.349925507124575, "grad_norm": 0.3051641583442688, "learning_rate": 0.00015248585677559034, "loss": 0.6146, "step": 2290 }, { "epoch": 0.35007831302288267, "grad_norm": 0.29534676671028137, "learning_rate": 0.0001524433638945285, "loss": 0.6897, "step": 2291 }, { "epoch": 0.35023111892119035, "grad_norm": 0.27716103196144104, "learning_rate": 0.00015240085794764272, "loss": 0.7955, "step": 2292 }, { "epoch": 0.35038392481949804, "grad_norm": 0.295163631439209, "learning_rate": 0.00015235833894552308, "loss": 0.6941, "step": 2293 }, { "epoch": 0.3505367307178057, "grad_norm": 0.35691267251968384, "learning_rate": 0.00015231580689876277, "loss": 0.6965, "step": 2294 }, { "epoch": 0.3506895366161134, "grad_norm": 0.3154979348182678, "learning_rate": 0.00015227326181795837, "loss": 0.7597, "step": 2295 }, { "epoch": 0.35084234251442104, "grad_norm": 0.2859799563884735, "learning_rate": 0.00015223070371370954, "loss": 0.6982, "step": 2296 }, { "epoch": 0.3509951484127287, "grad_norm": 0.35966408252716064, "learning_rate": 0.00015218813259661933, "loss": 0.8101, "step": 2297 }, { "epoch": 0.3511479543110364, "grad_norm": 0.6553919315338135, "learning_rate": 0.00015214554847729395, "loss": 0.7671, "step": 2298 }, { "epoch": 0.3513007602093441, "grad_norm": 0.38289788365364075, "learning_rate": 0.00015210295136634293, "loss": 0.5688, "step": 2299 }, { "epoch": 0.35145356610765177, "grad_norm": 0.4104590117931366, "learning_rate": 0.0001520603412743789, "loss": 0.5505, "step": 2300 }, { "epoch": 0.35160637200595946, "grad_norm": 0.3637326955795288, "learning_rate": 0.00015201771821201789, "loss": 0.6424, "step": 2301 }, { "epoch": 0.3517591779042671, "grad_norm": 0.29642170667648315, "learning_rate": 0.000151975082189879, "loss": 0.792, "step": 2302 }, { "epoch": 0.35191198380257477, "grad_norm": 0.3389260172843933, "learning_rate": 0.00015193243321858467, "loss": 0.7985, "step": 2303 }, { "epoch": 0.35206478970088245, "grad_norm": 0.41423532366752625, "learning_rate": 0.00015188977130876056, "loss": 0.682, "step": 2304 }, { "epoch": 0.35221759559919014, "grad_norm": 0.2945079207420349, "learning_rate": 0.0001518470964710355, "loss": 0.8196, "step": 2305 }, { "epoch": 0.3523704014974978, "grad_norm": 0.2931058704853058, "learning_rate": 0.00015180440871604155, "loss": 0.8806, "step": 2306 }, { "epoch": 0.3525232073958055, "grad_norm": 0.2553795874118805, "learning_rate": 0.00015176170805441408, "loss": 0.7261, "step": 2307 }, { "epoch": 0.35267601329411313, "grad_norm": 0.35140493512153625, "learning_rate": 0.0001517189944967915, "loss": 0.6785, "step": 2308 }, { "epoch": 0.3528288191924208, "grad_norm": 0.2723594009876251, "learning_rate": 0.0001516762680538156, "loss": 0.7115, "step": 2309 }, { "epoch": 0.3529816250907285, "grad_norm": 0.4120732247829437, "learning_rate": 0.00015163352873613127, "loss": 0.5396, "step": 2310 }, { "epoch": 0.3531344309890362, "grad_norm": 0.30499234795570374, "learning_rate": 0.00015159077655438674, "loss": 0.787, "step": 2311 }, { "epoch": 0.35328723688734387, "grad_norm": 0.3186348080635071, "learning_rate": 0.00015154801151923323, "loss": 0.6939, "step": 2312 }, { "epoch": 0.35344004278565155, "grad_norm": 0.44435304403305054, "learning_rate": 0.0001515052336413254, "loss": 0.8076, "step": 2313 }, { "epoch": 0.3535928486839592, "grad_norm": 0.26792144775390625, "learning_rate": 0.00015146244293132096, "loss": 0.5881, "step": 2314 }, { "epoch": 0.35374565458226687, "grad_norm": 0.2927224636077881, "learning_rate": 0.00015141963939988083, "loss": 0.6064, "step": 2315 }, { "epoch": 0.35389846048057455, "grad_norm": 0.29608336091041565, "learning_rate": 0.0001513768230576692, "loss": 0.7699, "step": 2316 }, { "epoch": 0.35405126637888223, "grad_norm": 0.30591922998428345, "learning_rate": 0.0001513339939153533, "loss": 0.6701, "step": 2317 }, { "epoch": 0.3542040722771899, "grad_norm": 0.26143383979797363, "learning_rate": 0.0001512911519836038, "loss": 0.5759, "step": 2318 }, { "epoch": 0.35435687817549755, "grad_norm": 0.34693998098373413, "learning_rate": 0.0001512482972730943, "loss": 0.6425, "step": 2319 }, { "epoch": 0.35450968407380523, "grad_norm": 0.2774498462677002, "learning_rate": 0.00015120542979450173, "loss": 0.7096, "step": 2320 }, { "epoch": 0.3546624899721129, "grad_norm": 0.9198269844055176, "learning_rate": 0.0001511625495585062, "loss": 0.9403, "step": 2321 }, { "epoch": 0.3548152958704206, "grad_norm": 0.30706116557121277, "learning_rate": 0.00015111965657579085, "loss": 0.6938, "step": 2322 }, { "epoch": 0.3549681017687283, "grad_norm": 0.3365491032600403, "learning_rate": 0.00015107675085704222, "loss": 0.5908, "step": 2323 }, { "epoch": 0.35512090766703597, "grad_norm": 0.2673099637031555, "learning_rate": 0.00015103383241294984, "loss": 0.7071, "step": 2324 }, { "epoch": 0.3552737135653436, "grad_norm": 0.2802966833114624, "learning_rate": 0.0001509909012542065, "loss": 0.7405, "step": 2325 }, { "epoch": 0.3554265194636513, "grad_norm": 0.2657721936702728, "learning_rate": 0.0001509479573915082, "loss": 0.6928, "step": 2326 }, { "epoch": 0.35557932536195896, "grad_norm": 0.31786054372787476, "learning_rate": 0.00015090500083555394, "loss": 0.735, "step": 2327 }, { "epoch": 0.35573213126026665, "grad_norm": 0.31089332699775696, "learning_rate": 0.000150862031597046, "loss": 0.683, "step": 2328 }, { "epoch": 0.35588493715857433, "grad_norm": 0.310997873544693, "learning_rate": 0.0001508190496866899, "loss": 0.7928, "step": 2329 }, { "epoch": 0.356037743056882, "grad_norm": 0.39357268810272217, "learning_rate": 0.00015077605511519415, "loss": 0.8346, "step": 2330 }, { "epoch": 0.35619054895518965, "grad_norm": 0.3538849353790283, "learning_rate": 0.00015073304789327044, "loss": 0.801, "step": 2331 }, { "epoch": 0.35634335485349733, "grad_norm": 0.25808286666870117, "learning_rate": 0.00015069002803163377, "loss": 0.7358, "step": 2332 }, { "epoch": 0.356496160751805, "grad_norm": 0.27462631464004517, "learning_rate": 0.0001506469955410021, "loss": 0.6066, "step": 2333 }, { "epoch": 0.3566489666501127, "grad_norm": 0.2881491482257843, "learning_rate": 0.00015060395043209663, "loss": 0.8394, "step": 2334 }, { "epoch": 0.3568017725484204, "grad_norm": 0.2899307906627655, "learning_rate": 0.0001505608927156417, "loss": 0.5998, "step": 2335 }, { "epoch": 0.35695457844672807, "grad_norm": 0.3605771064758301, "learning_rate": 0.00015051782240236476, "loss": 0.6971, "step": 2336 }, { "epoch": 0.3571073843450357, "grad_norm": 0.27477413415908813, "learning_rate": 0.00015047473950299643, "loss": 0.7071, "step": 2337 }, { "epoch": 0.3572601902433434, "grad_norm": 0.2961339056491852, "learning_rate": 0.00015043164402827043, "loss": 0.7441, "step": 2338 }, { "epoch": 0.35741299614165106, "grad_norm": 0.30659833550453186, "learning_rate": 0.0001503885359889237, "loss": 0.7664, "step": 2339 }, { "epoch": 0.35756580203995875, "grad_norm": 0.2779198884963989, "learning_rate": 0.00015034541539569616, "loss": 0.7272, "step": 2340 }, { "epoch": 0.35771860793826643, "grad_norm": 0.3521401882171631, "learning_rate": 0.00015030228225933106, "loss": 0.6322, "step": 2341 }, { "epoch": 0.3578714138365741, "grad_norm": 0.39226970076560974, "learning_rate": 0.0001502591365905745, "loss": 0.6431, "step": 2342 }, { "epoch": 0.35802421973488174, "grad_norm": 0.2492583841085434, "learning_rate": 0.000150215978400176, "loss": 0.6294, "step": 2343 }, { "epoch": 0.3581770256331894, "grad_norm": 0.2733481526374817, "learning_rate": 0.00015017280769888793, "loss": 0.5777, "step": 2344 }, { "epoch": 0.3583298315314971, "grad_norm": 0.2837771773338318, "learning_rate": 0.00015012962449746607, "loss": 0.5669, "step": 2345 }, { "epoch": 0.3584826374298048, "grad_norm": 0.2990538477897644, "learning_rate": 0.00015008642880666903, "loss": 0.7183, "step": 2346 }, { "epoch": 0.3586354433281125, "grad_norm": 0.39534905552864075, "learning_rate": 0.00015004322063725872, "loss": 0.6699, "step": 2347 }, { "epoch": 0.35878824922642016, "grad_norm": 0.2837047874927521, "learning_rate": 0.00015000000000000001, "loss": 0.7628, "step": 2348 }, { "epoch": 0.3589410551247278, "grad_norm": 0.3078756630420685, "learning_rate": 0.00014995676690566105, "loss": 0.6729, "step": 2349 }, { "epoch": 0.3590938610230355, "grad_norm": 0.31207966804504395, "learning_rate": 0.00014991352136501296, "loss": 0.6307, "step": 2350 }, { "epoch": 0.35924666692134316, "grad_norm": 0.29956740140914917, "learning_rate": 0.00014987026338882998, "loss": 0.6225, "step": 2351 }, { "epoch": 0.35939947281965084, "grad_norm": 0.23339635133743286, "learning_rate": 0.00014982699298788954, "loss": 0.6805, "step": 2352 }, { "epoch": 0.35955227871795853, "grad_norm": 0.35019242763519287, "learning_rate": 0.000149783710172972, "loss": 0.772, "step": 2353 }, { "epoch": 0.3597050846162662, "grad_norm": 0.4311259388923645, "learning_rate": 0.00014974041495486104, "loss": 0.4946, "step": 2354 }, { "epoch": 0.35985789051457384, "grad_norm": 0.27752676606178284, "learning_rate": 0.0001496971073443432, "loss": 0.7505, "step": 2355 }, { "epoch": 0.3600106964128815, "grad_norm": 0.34155577421188354, "learning_rate": 0.00014965378735220822, "loss": 0.7861, "step": 2356 }, { "epoch": 0.3601635023111892, "grad_norm": 0.2626522481441498, "learning_rate": 0.00014961045498924894, "loss": 0.712, "step": 2357 }, { "epoch": 0.3603163082094969, "grad_norm": 0.2956133186817169, "learning_rate": 0.00014956711026626124, "loss": 0.6818, "step": 2358 }, { "epoch": 0.3604691141078046, "grad_norm": 0.29100513458251953, "learning_rate": 0.0001495237531940441, "loss": 0.6162, "step": 2359 }, { "epoch": 0.36062192000611226, "grad_norm": 0.3451087474822998, "learning_rate": 0.00014948038378339955, "loss": 0.8069, "step": 2360 }, { "epoch": 0.3607747259044199, "grad_norm": 0.2580629885196686, "learning_rate": 0.00014943700204513274, "loss": 0.5483, "step": 2361 }, { "epoch": 0.3609275318027276, "grad_norm": 0.2776690125465393, "learning_rate": 0.00014939360799005183, "loss": 0.7614, "step": 2362 }, { "epoch": 0.36108033770103526, "grad_norm": 0.28003740310668945, "learning_rate": 0.00014935020162896816, "loss": 0.5608, "step": 2363 }, { "epoch": 0.36123314359934294, "grad_norm": 0.28120556473731995, "learning_rate": 0.000149306782972696, "loss": 0.5789, "step": 2364 }, { "epoch": 0.3613859494976506, "grad_norm": 0.32332703471183777, "learning_rate": 0.00014926335203205272, "loss": 0.5761, "step": 2365 }, { "epoch": 0.3615387553959583, "grad_norm": 0.2898085117340088, "learning_rate": 0.00014921990881785886, "loss": 0.7513, "step": 2366 }, { "epoch": 0.36169156129426594, "grad_norm": 0.2950339913368225, "learning_rate": 0.00014917645334093784, "loss": 0.6948, "step": 2367 }, { "epoch": 0.3618443671925736, "grad_norm": 0.34204477071762085, "learning_rate": 0.0001491329856121163, "loss": 0.624, "step": 2368 }, { "epoch": 0.3619971730908813, "grad_norm": 0.3127589523792267, "learning_rate": 0.00014908950564222382, "loss": 0.7177, "step": 2369 }, { "epoch": 0.362149978989189, "grad_norm": 0.28709182143211365, "learning_rate": 0.00014904601344209307, "loss": 0.5862, "step": 2370 }, { "epoch": 0.3623027848874967, "grad_norm": 0.25311172008514404, "learning_rate": 0.00014900250902255977, "loss": 0.8151, "step": 2371 }, { "epoch": 0.3624555907858043, "grad_norm": 0.3411361277103424, "learning_rate": 0.0001489589923944627, "loss": 0.7777, "step": 2372 }, { "epoch": 0.362608396684112, "grad_norm": 0.28155237436294556, "learning_rate": 0.00014891546356864363, "loss": 0.6464, "step": 2373 }, { "epoch": 0.3627612025824197, "grad_norm": 0.28000929951667786, "learning_rate": 0.00014887192255594745, "loss": 0.662, "step": 2374 }, { "epoch": 0.36291400848072736, "grad_norm": 0.27866485714912415, "learning_rate": 0.00014882836936722197, "loss": 0.6344, "step": 2375 }, { "epoch": 0.36306681437903504, "grad_norm": 0.3239542543888092, "learning_rate": 0.00014878480401331817, "loss": 0.8088, "step": 2376 }, { "epoch": 0.3632196202773427, "grad_norm": 0.3022734820842743, "learning_rate": 0.00014874122650508994, "loss": 0.6214, "step": 2377 }, { "epoch": 0.36337242617565035, "grad_norm": 2.2979094982147217, "learning_rate": 0.00014869763685339434, "loss": 0.6594, "step": 2378 }, { "epoch": 0.36352523207395804, "grad_norm": 0.29689502716064453, "learning_rate": 0.0001486540350690912, "loss": 0.5897, "step": 2379 }, { "epoch": 0.3636780379722657, "grad_norm": 0.32437190413475037, "learning_rate": 0.00014861042116304368, "loss": 0.819, "step": 2380 }, { "epoch": 0.3638308438705734, "grad_norm": 0.3406039774417877, "learning_rate": 0.00014856679514611777, "loss": 0.8232, "step": 2381 }, { "epoch": 0.3639836497688811, "grad_norm": 0.7940521836280823, "learning_rate": 0.00014852315702918256, "loss": 0.7804, "step": 2382 }, { "epoch": 0.3641364556671888, "grad_norm": 0.2833361029624939, "learning_rate": 0.00014847950682311004, "loss": 0.6998, "step": 2383 }, { "epoch": 0.3642892615654964, "grad_norm": 0.36385026574134827, "learning_rate": 0.00014843584453877538, "loss": 0.7231, "step": 2384 }, { "epoch": 0.3644420674638041, "grad_norm": 0.33126354217529297, "learning_rate": 0.00014839217018705662, "loss": 0.7127, "step": 2385 }, { "epoch": 0.36459487336211177, "grad_norm": 0.2855713665485382, "learning_rate": 0.0001483484837788349, "loss": 0.7587, "step": 2386 }, { "epoch": 0.36474767926041946, "grad_norm": 0.2814899682998657, "learning_rate": 0.00014830478532499428, "loss": 0.6957, "step": 2387 }, { "epoch": 0.36490048515872714, "grad_norm": 0.6373705267906189, "learning_rate": 0.00014826107483642185, "loss": 0.6314, "step": 2388 }, { "epoch": 0.3650532910570348, "grad_norm": 0.2719639837741852, "learning_rate": 0.00014821735232400777, "loss": 0.6713, "step": 2389 }, { "epoch": 0.36520609695534245, "grad_norm": 0.2806015610694885, "learning_rate": 0.00014817361779864507, "loss": 0.742, "step": 2390 }, { "epoch": 0.36535890285365014, "grad_norm": 0.3191283345222473, "learning_rate": 0.00014812987127122993, "loss": 0.6505, "step": 2391 }, { "epoch": 0.3655117087519578, "grad_norm": 0.2744157612323761, "learning_rate": 0.00014808611275266134, "loss": 0.465, "step": 2392 }, { "epoch": 0.3656645146502655, "grad_norm": 0.33585116267204285, "learning_rate": 0.00014804234225384143, "loss": 0.6132, "step": 2393 }, { "epoch": 0.3658173205485732, "grad_norm": 0.26743748784065247, "learning_rate": 0.0001479985597856752, "loss": 0.7128, "step": 2394 }, { "epoch": 0.3659701264468809, "grad_norm": 0.2847437262535095, "learning_rate": 0.00014795476535907074, "loss": 0.7707, "step": 2395 }, { "epoch": 0.3661229323451885, "grad_norm": 0.2703080177307129, "learning_rate": 0.000147910958984939, "loss": 0.713, "step": 2396 }, { "epoch": 0.3662757382434962, "grad_norm": 0.3985111713409424, "learning_rate": 0.000147867140674194, "loss": 0.5579, "step": 2397 }, { "epoch": 0.36642854414180387, "grad_norm": 0.3106270730495453, "learning_rate": 0.00014782331043775276, "loss": 0.6585, "step": 2398 }, { "epoch": 0.36658135004011155, "grad_norm": 0.3780193030834198, "learning_rate": 0.00014777946828653513, "loss": 0.733, "step": 2399 }, { "epoch": 0.36673415593841924, "grad_norm": 0.3120858371257782, "learning_rate": 0.00014773561423146408, "loss": 0.7741, "step": 2400 }, { "epoch": 0.3668869618367269, "grad_norm": 0.27893051505088806, "learning_rate": 0.00014769174828346542, "loss": 0.7162, "step": 2401 }, { "epoch": 0.36703976773503455, "grad_norm": 0.29121726751327515, "learning_rate": 0.00014764787045346803, "loss": 0.5927, "step": 2402 }, { "epoch": 0.36719257363334223, "grad_norm": 0.28169146180152893, "learning_rate": 0.00014760398075240366, "loss": 0.682, "step": 2403 }, { "epoch": 0.3673453795316499, "grad_norm": 0.3464924693107605, "learning_rate": 0.0001475600791912071, "loss": 0.5718, "step": 2404 }, { "epoch": 0.3674981854299576, "grad_norm": 0.25900718569755554, "learning_rate": 0.00014751616578081604, "loss": 0.6681, "step": 2405 }, { "epoch": 0.3676509913282653, "grad_norm": 0.29786524176597595, "learning_rate": 0.0001474722405321711, "loss": 0.8521, "step": 2406 }, { "epoch": 0.36780379722657297, "grad_norm": 0.36379602551460266, "learning_rate": 0.00014742830345621598, "loss": 0.6777, "step": 2407 }, { "epoch": 0.3679566031248806, "grad_norm": 0.30311641097068787, "learning_rate": 0.00014738435456389717, "loss": 0.747, "step": 2408 }, { "epoch": 0.3681094090231883, "grad_norm": 0.2607172429561615, "learning_rate": 0.00014734039386616417, "loss": 0.5609, "step": 2409 }, { "epoch": 0.36826221492149597, "grad_norm": 0.5097734332084656, "learning_rate": 0.00014729642137396943, "loss": 0.8335, "step": 2410 }, { "epoch": 0.36841502081980365, "grad_norm": 0.32967936992645264, "learning_rate": 0.00014725243709826828, "loss": 0.6682, "step": 2411 }, { "epoch": 0.36856782671811134, "grad_norm": 0.2863999605178833, "learning_rate": 0.00014720844105001912, "loss": 0.7139, "step": 2412 }, { "epoch": 0.368720632616419, "grad_norm": 0.2852937579154968, "learning_rate": 0.00014716443324018315, "loss": 0.605, "step": 2413 }, { "epoch": 0.36887343851472665, "grad_norm": 0.40641558170318604, "learning_rate": 0.00014712041367972452, "loss": 0.686, "step": 2414 }, { "epoch": 0.36902624441303433, "grad_norm": 0.25617754459381104, "learning_rate": 0.00014707638237961037, "loss": 0.7407, "step": 2415 }, { "epoch": 0.369179050311342, "grad_norm": 0.3151395320892334, "learning_rate": 0.00014703233935081073, "loss": 0.6683, "step": 2416 }, { "epoch": 0.3693318562096497, "grad_norm": 0.2913879454135895, "learning_rate": 0.00014698828460429854, "loss": 0.7352, "step": 2417 }, { "epoch": 0.3694846621079574, "grad_norm": 0.2934713363647461, "learning_rate": 0.0001469442181510497, "loss": 0.7851, "step": 2418 }, { "epoch": 0.36963746800626507, "grad_norm": 0.3047449290752411, "learning_rate": 0.00014690014000204294, "loss": 0.6604, "step": 2419 }, { "epoch": 0.3697902739045727, "grad_norm": 0.2977202832698822, "learning_rate": 0.00014685605016825996, "loss": 0.7277, "step": 2420 }, { "epoch": 0.3699430798028804, "grad_norm": 0.2821477949619293, "learning_rate": 0.00014681194866068544, "loss": 0.7307, "step": 2421 }, { "epoch": 0.37009588570118807, "grad_norm": 0.28574398159980774, "learning_rate": 0.00014676783549030686, "loss": 0.6274, "step": 2422 }, { "epoch": 0.37024869159949575, "grad_norm": 0.30137869715690613, "learning_rate": 0.00014672371066811463, "loss": 0.6889, "step": 2423 }, { "epoch": 0.37040149749780343, "grad_norm": 0.3153139054775238, "learning_rate": 0.00014667957420510215, "loss": 0.6823, "step": 2424 }, { "epoch": 0.37055430339611106, "grad_norm": 0.32339897751808167, "learning_rate": 0.00014663542611226553, "loss": 0.7572, "step": 2425 }, { "epoch": 0.37070710929441875, "grad_norm": 0.2944089472293854, "learning_rate": 0.000146591266400604, "loss": 0.7334, "step": 2426 }, { "epoch": 0.37085991519272643, "grad_norm": 0.4568473994731903, "learning_rate": 0.0001465470950811195, "loss": 0.8559, "step": 2427 }, { "epoch": 0.3710127210910341, "grad_norm": 0.2831132411956787, "learning_rate": 0.00014650291216481706, "loss": 0.7136, "step": 2428 }, { "epoch": 0.3711655269893418, "grad_norm": 0.30619436502456665, "learning_rate": 0.00014645871766270436, "loss": 0.7136, "step": 2429 }, { "epoch": 0.3713183328876495, "grad_norm": 0.27592119574546814, "learning_rate": 0.00014641451158579216, "loss": 0.683, "step": 2430 }, { "epoch": 0.3714711387859571, "grad_norm": 0.27662381529808044, "learning_rate": 0.000146370293945094, "loss": 0.5909, "step": 2431 }, { "epoch": 0.3716239446842648, "grad_norm": 0.27695780992507935, "learning_rate": 0.00014632606475162635, "loss": 0.5979, "step": 2432 }, { "epoch": 0.3717767505825725, "grad_norm": 0.2685675024986267, "learning_rate": 0.00014628182401640858, "loss": 0.7144, "step": 2433 }, { "epoch": 0.37192955648088016, "grad_norm": 0.326612263917923, "learning_rate": 0.0001462375717504628, "loss": 0.7619, "step": 2434 }, { "epoch": 0.37208236237918785, "grad_norm": 0.2743641436100006, "learning_rate": 0.0001461933079648142, "loss": 0.4816, "step": 2435 }, { "epoch": 0.37223516827749553, "grad_norm": 0.2942219376564026, "learning_rate": 0.0001461490326704906, "loss": 0.8433, "step": 2436 }, { "epoch": 0.37238797417580316, "grad_norm": 0.25034305453300476, "learning_rate": 0.00014610474587852296, "loss": 0.6961, "step": 2437 }, { "epoch": 0.37254078007411084, "grad_norm": 0.2891073524951935, "learning_rate": 0.0001460604475999449, "loss": 0.5937, "step": 2438 }, { "epoch": 0.37269358597241853, "grad_norm": 0.2591763138771057, "learning_rate": 0.00014601613784579295, "loss": 0.6111, "step": 2439 }, { "epoch": 0.3728463918707262, "grad_norm": 0.3589370846748352, "learning_rate": 0.00014597181662710652, "loss": 0.7989, "step": 2440 }, { "epoch": 0.3729991977690339, "grad_norm": 0.257616251707077, "learning_rate": 0.00014592748395492788, "loss": 0.7384, "step": 2441 }, { "epoch": 0.3731520036673416, "grad_norm": 0.366580605506897, "learning_rate": 0.00014588313984030212, "loss": 0.6911, "step": 2442 }, { "epoch": 0.3733048095656492, "grad_norm": 0.3071226477622986, "learning_rate": 0.00014583878429427725, "loss": 0.5344, "step": 2443 }, { "epoch": 0.3734576154639569, "grad_norm": 0.36921221017837524, "learning_rate": 0.00014579441732790404, "loss": 0.5783, "step": 2444 }, { "epoch": 0.3736104213622646, "grad_norm": 0.30017930269241333, "learning_rate": 0.00014575003895223615, "loss": 0.8363, "step": 2445 }, { "epoch": 0.37376322726057226, "grad_norm": 0.3353256583213806, "learning_rate": 0.0001457056491783301, "loss": 0.5413, "step": 2446 }, { "epoch": 0.37391603315887995, "grad_norm": 0.28771746158599854, "learning_rate": 0.00014566124801724522, "loss": 0.7268, "step": 2447 }, { "epoch": 0.37406883905718763, "grad_norm": 0.2777288854122162, "learning_rate": 0.00014561683548004373, "loss": 0.8383, "step": 2448 }, { "epoch": 0.37422164495549526, "grad_norm": 0.35012948513031006, "learning_rate": 0.00014557241157779055, "loss": 0.7641, "step": 2449 }, { "epoch": 0.37437445085380294, "grad_norm": 0.312569797039032, "learning_rate": 0.0001455279763215536, "loss": 0.6564, "step": 2450 }, { "epoch": 0.3745272567521106, "grad_norm": 0.2877102494239807, "learning_rate": 0.00014548352972240354, "loss": 0.6939, "step": 2451 }, { "epoch": 0.3746800626504183, "grad_norm": 0.5239971876144409, "learning_rate": 0.0001454390717914138, "loss": 0.6307, "step": 2452 }, { "epoch": 0.374832868548726, "grad_norm": 0.3368930518627167, "learning_rate": 0.00014539460253966077, "loss": 0.6324, "step": 2453 }, { "epoch": 0.3749856744470337, "grad_norm": 0.30349984765052795, "learning_rate": 0.00014535012197822357, "loss": 0.7975, "step": 2454 }, { "epoch": 0.3751384803453413, "grad_norm": 0.2840270400047302, "learning_rate": 0.00014530563011818417, "loss": 0.5472, "step": 2455 }, { "epoch": 0.375291286243649, "grad_norm": 0.28692367672920227, "learning_rate": 0.00014526112697062733, "loss": 0.8516, "step": 2456 }, { "epoch": 0.3754440921419567, "grad_norm": 0.29820212721824646, "learning_rate": 0.00014521661254664062, "loss": 0.5865, "step": 2457 }, { "epoch": 0.37559689804026436, "grad_norm": 0.2936681807041168, "learning_rate": 0.00014517208685731447, "loss": 0.6314, "step": 2458 }, { "epoch": 0.37574970393857204, "grad_norm": 0.3220421075820923, "learning_rate": 0.00014512754991374206, "loss": 0.7181, "step": 2459 }, { "epoch": 0.37590250983687973, "grad_norm": 0.29429811239242554, "learning_rate": 0.0001450830017270194, "loss": 0.6019, "step": 2460 }, { "epoch": 0.37605531573518736, "grad_norm": 0.49563896656036377, "learning_rate": 0.0001450384423082453, "loss": 0.8089, "step": 2461 }, { "epoch": 0.37620812163349504, "grad_norm": 0.4126056134700775, "learning_rate": 0.00014499387166852135, "loss": 0.7697, "step": 2462 }, { "epoch": 0.3763609275318027, "grad_norm": 0.3450013995170593, "learning_rate": 0.00014494928981895197, "loss": 0.7991, "step": 2463 }, { "epoch": 0.3765137334301104, "grad_norm": 0.3362366557121277, "learning_rate": 0.00014490469677064436, "loss": 0.9246, "step": 2464 }, { "epoch": 0.3766665393284181, "grad_norm": 0.31218796968460083, "learning_rate": 0.00014486009253470846, "loss": 0.8765, "step": 2465 }, { "epoch": 0.3768193452267258, "grad_norm": 0.3747103810310364, "learning_rate": 0.0001448154771222571, "loss": 0.7145, "step": 2466 }, { "epoch": 0.3769721511250334, "grad_norm": 0.348871648311615, "learning_rate": 0.0001447708505444058, "loss": 0.7798, "step": 2467 }, { "epoch": 0.3771249570233411, "grad_norm": 0.35312315821647644, "learning_rate": 0.00014472621281227293, "loss": 0.5461, "step": 2468 }, { "epoch": 0.3772777629216488, "grad_norm": 0.3236096203327179, "learning_rate": 0.00014468156393697954, "loss": 0.7983, "step": 2469 }, { "epoch": 0.37743056881995646, "grad_norm": 0.23714995384216309, "learning_rate": 0.00014463690392964957, "loss": 0.5793, "step": 2470 }, { "epoch": 0.37758337471826414, "grad_norm": 0.38550207018852234, "learning_rate": 0.0001445922328014097, "loss": 0.695, "step": 2471 }, { "epoch": 0.3777361806165718, "grad_norm": 0.2918228805065155, "learning_rate": 0.00014454755056338934, "loss": 0.7962, "step": 2472 }, { "epoch": 0.37788898651487945, "grad_norm": 0.2856360673904419, "learning_rate": 0.00014450285722672067, "loss": 0.7473, "step": 2473 }, { "epoch": 0.37804179241318714, "grad_norm": 0.33044031262397766, "learning_rate": 0.00014445815280253875, "loss": 0.5781, "step": 2474 }, { "epoch": 0.3781945983114948, "grad_norm": 0.4528699815273285, "learning_rate": 0.00014441343730198117, "loss": 0.9506, "step": 2475 }, { "epoch": 0.3783474042098025, "grad_norm": 0.4538821578025818, "learning_rate": 0.0001443687107361886, "loss": 0.7509, "step": 2476 }, { "epoch": 0.3785002101081102, "grad_norm": 0.5137097239494324, "learning_rate": 0.0001443239731163041, "loss": 0.7929, "step": 2477 }, { "epoch": 0.3786530160064179, "grad_norm": 0.29435819387435913, "learning_rate": 0.0001442792244534738, "loss": 0.7049, "step": 2478 }, { "epoch": 0.3788058219047255, "grad_norm": 0.30987152457237244, "learning_rate": 0.00014423446475884643, "loss": 0.7649, "step": 2479 }, { "epoch": 0.3789586278030332, "grad_norm": 0.3604254424571991, "learning_rate": 0.00014418969404357345, "loss": 0.6638, "step": 2480 }, { "epoch": 0.37911143370134087, "grad_norm": 0.32214394211769104, "learning_rate": 0.00014414491231880917, "loss": 0.6358, "step": 2481 }, { "epoch": 0.37926423959964856, "grad_norm": 0.24983897805213928, "learning_rate": 0.00014410011959571054, "loss": 0.7039, "step": 2482 }, { "epoch": 0.37941704549795624, "grad_norm": 0.31551504135131836, "learning_rate": 0.00014405531588543733, "loss": 0.7776, "step": 2483 }, { "epoch": 0.37956985139626387, "grad_norm": 0.3642079830169678, "learning_rate": 0.00014401050119915192, "loss": 0.7002, "step": 2484 }, { "epoch": 0.37972265729457155, "grad_norm": 0.29362720251083374, "learning_rate": 0.00014396567554801962, "loss": 0.7925, "step": 2485 }, { "epoch": 0.37987546319287924, "grad_norm": 0.3027987480163574, "learning_rate": 0.00014392083894320827, "loss": 0.915, "step": 2486 }, { "epoch": 0.3800282690911869, "grad_norm": 0.38472673296928406, "learning_rate": 0.0001438759913958886, "loss": 0.8979, "step": 2487 }, { "epoch": 0.3801810749894946, "grad_norm": 0.27983352541923523, "learning_rate": 0.000143831132917234, "loss": 0.7819, "step": 2488 }, { "epoch": 0.3803338808878023, "grad_norm": 0.3205126225948334, "learning_rate": 0.00014378626351842054, "loss": 0.6158, "step": 2489 }, { "epoch": 0.3804866867861099, "grad_norm": 0.30116376280784607, "learning_rate": 0.0001437413832106271, "loss": 0.7862, "step": 2490 }, { "epoch": 0.3806394926844176, "grad_norm": 0.3567577004432678, "learning_rate": 0.00014369649200503517, "loss": 0.6213, "step": 2491 }, { "epoch": 0.3807922985827253, "grad_norm": 0.2745025157928467, "learning_rate": 0.00014365158991282907, "loss": 0.8277, "step": 2492 }, { "epoch": 0.38094510448103297, "grad_norm": 0.2893485128879547, "learning_rate": 0.00014360667694519576, "loss": 0.5813, "step": 2493 }, { "epoch": 0.38109791037934065, "grad_norm": 0.3255918323993683, "learning_rate": 0.00014356175311332496, "loss": 0.853, "step": 2494 }, { "epoch": 0.38125071627764834, "grad_norm": 0.3026112914085388, "learning_rate": 0.00014351681842840903, "loss": 0.5956, "step": 2495 }, { "epoch": 0.38140352217595597, "grad_norm": 0.3224642872810364, "learning_rate": 0.00014347187290164308, "loss": 1.1075, "step": 2496 }, { "epoch": 0.38155632807426365, "grad_norm": 0.4730568528175354, "learning_rate": 0.00014342691654422492, "loss": 0.8043, "step": 2497 }, { "epoch": 0.38170913397257134, "grad_norm": 0.2610538601875305, "learning_rate": 0.000143381949367355, "loss": 0.7726, "step": 2498 }, { "epoch": 0.381861939870879, "grad_norm": 0.27993085980415344, "learning_rate": 0.0001433369713822366, "loss": 0.7121, "step": 2499 }, { "epoch": 0.3820147457691867, "grad_norm": 0.3264187276363373, "learning_rate": 0.00014329198260007553, "loss": 0.6973, "step": 2500 }, { "epoch": 0.3821675516674944, "grad_norm": 0.3121355473995209, "learning_rate": 0.00014324698303208038, "loss": 0.6586, "step": 2501 }, { "epoch": 0.382320357565802, "grad_norm": 0.2439948469400406, "learning_rate": 0.0001432019726894625, "loss": 0.4669, "step": 2502 }, { "epoch": 0.3824731634641097, "grad_norm": 0.32409751415252686, "learning_rate": 0.00014315695158343572, "loss": 0.8436, "step": 2503 }, { "epoch": 0.3826259693624174, "grad_norm": 0.3331731855869293, "learning_rate": 0.00014311191972521674, "loss": 0.6083, "step": 2504 }, { "epoch": 0.38277877526072507, "grad_norm": 0.2964318096637726, "learning_rate": 0.00014306687712602485, "loss": 0.7832, "step": 2505 }, { "epoch": 0.38293158115903275, "grad_norm": 0.36654728651046753, "learning_rate": 0.00014302182379708205, "loss": 0.686, "step": 2506 }, { "epoch": 0.38308438705734044, "grad_norm": 0.46433189511299133, "learning_rate": 0.00014297675974961295, "loss": 0.7919, "step": 2507 }, { "epoch": 0.38323719295564806, "grad_norm": 0.29593682289123535, "learning_rate": 0.00014293168499484495, "loss": 0.7321, "step": 2508 }, { "epoch": 0.38338999885395575, "grad_norm": 0.27927494049072266, "learning_rate": 0.000142886599544008, "loss": 0.6574, "step": 2509 }, { "epoch": 0.38354280475226343, "grad_norm": 0.2708612382411957, "learning_rate": 0.00014284150340833476, "loss": 0.6859, "step": 2510 }, { "epoch": 0.3836956106505711, "grad_norm": 0.29946985840797424, "learning_rate": 0.00014279639659906058, "loss": 0.7239, "step": 2511 }, { "epoch": 0.3838484165488788, "grad_norm": 0.35676175355911255, "learning_rate": 0.00014275127912742345, "loss": 0.7373, "step": 2512 }, { "epoch": 0.3840012224471865, "grad_norm": 0.35043084621429443, "learning_rate": 0.00014270615100466397, "loss": 0.6002, "step": 2513 }, { "epoch": 0.3841540283454941, "grad_norm": 0.28143876791000366, "learning_rate": 0.00014266101224202546, "loss": 0.8477, "step": 2514 }, { "epoch": 0.3843068342438018, "grad_norm": 0.2760816514492035, "learning_rate": 0.00014261586285075386, "loss": 0.719, "step": 2515 }, { "epoch": 0.3844596401421095, "grad_norm": 0.27988770604133606, "learning_rate": 0.00014257070284209774, "loss": 0.6344, "step": 2516 }, { "epoch": 0.38461244604041717, "grad_norm": 0.25354310870170593, "learning_rate": 0.00014252553222730838, "loss": 0.7014, "step": 2517 }, { "epoch": 0.38476525193872485, "grad_norm": 0.2850781977176666, "learning_rate": 0.00014248035101763963, "loss": 0.604, "step": 2518 }, { "epoch": 0.38491805783703253, "grad_norm": 0.332959920167923, "learning_rate": 0.000142435159224348, "loss": 0.7286, "step": 2519 }, { "epoch": 0.38507086373534016, "grad_norm": 0.29361769556999207, "learning_rate": 0.00014238995685869268, "loss": 0.7916, "step": 2520 }, { "epoch": 0.38522366963364785, "grad_norm": 0.2901209592819214, "learning_rate": 0.00014234474393193543, "loss": 0.6919, "step": 2521 }, { "epoch": 0.38537647553195553, "grad_norm": 0.2989867329597473, "learning_rate": 0.0001422995204553407, "loss": 0.858, "step": 2522 }, { "epoch": 0.3855292814302632, "grad_norm": 0.2621612250804901, "learning_rate": 0.00014225428644017548, "loss": 0.7048, "step": 2523 }, { "epoch": 0.3856820873285709, "grad_norm": 0.35946473479270935, "learning_rate": 0.00014220904189770952, "loss": 0.8626, "step": 2524 }, { "epoch": 0.3858348932268786, "grad_norm": 0.31629034876823425, "learning_rate": 0.00014216378683921504, "loss": 0.758, "step": 2525 }, { "epoch": 0.3859876991251862, "grad_norm": 0.32585909962654114, "learning_rate": 0.00014211852127596705, "loss": 0.6084, "step": 2526 }, { "epoch": 0.3861405050234939, "grad_norm": 0.32691988348960876, "learning_rate": 0.00014207324521924304, "loss": 0.7124, "step": 2527 }, { "epoch": 0.3862933109218016, "grad_norm": 0.36711400747299194, "learning_rate": 0.00014202795868032312, "loss": 0.6328, "step": 2528 }, { "epoch": 0.38644611682010926, "grad_norm": 0.29490792751312256, "learning_rate": 0.00014198266167049012, "loss": 0.8087, "step": 2529 }, { "epoch": 0.38659892271841695, "grad_norm": 0.3001713752746582, "learning_rate": 0.00014193735420102934, "loss": 0.579, "step": 2530 }, { "epoch": 0.38675172861672463, "grad_norm": 0.2902267575263977, "learning_rate": 0.00014189203628322885, "loss": 0.6978, "step": 2531 }, { "epoch": 0.38690453451503226, "grad_norm": 0.34467917680740356, "learning_rate": 0.0001418467079283791, "loss": 0.7732, "step": 2532 }, { "epoch": 0.38705734041333995, "grad_norm": 0.30216652154922485, "learning_rate": 0.0001418013691477734, "loss": 0.661, "step": 2533 }, { "epoch": 0.38721014631164763, "grad_norm": 0.309682697057724, "learning_rate": 0.00014175601995270747, "loss": 0.8284, "step": 2534 }, { "epoch": 0.3873629522099553, "grad_norm": 0.40115198493003845, "learning_rate": 0.00014171066035447965, "loss": 0.7166, "step": 2535 }, { "epoch": 0.387515758108263, "grad_norm": 0.27580732107162476, "learning_rate": 0.00014166529036439094, "loss": 0.7984, "step": 2536 }, { "epoch": 0.3876685640065706, "grad_norm": 0.30666035413742065, "learning_rate": 0.00014161990999374488, "loss": 0.6308, "step": 2537 }, { "epoch": 0.3878213699048783, "grad_norm": 0.3423399031162262, "learning_rate": 0.00014157451925384763, "loss": 0.6894, "step": 2538 }, { "epoch": 0.387974175803186, "grad_norm": 0.3036220073699951, "learning_rate": 0.00014152911815600784, "loss": 0.8044, "step": 2539 }, { "epoch": 0.3881269817014937, "grad_norm": 0.35811495780944824, "learning_rate": 0.00014148370671153692, "loss": 0.7668, "step": 2540 }, { "epoch": 0.38827978759980136, "grad_norm": 0.29517245292663574, "learning_rate": 0.00014143828493174866, "loss": 0.7531, "step": 2541 }, { "epoch": 0.38843259349810905, "grad_norm": 0.30313557386398315, "learning_rate": 0.0001413928528279596, "loss": 0.7122, "step": 2542 }, { "epoch": 0.3885853993964167, "grad_norm": 0.4556387662887573, "learning_rate": 0.0001413474104114887, "loss": 0.8926, "step": 2543 }, { "epoch": 0.38873820529472436, "grad_norm": 0.30476585030555725, "learning_rate": 0.00014130195769365757, "loss": 0.8802, "step": 2544 }, { "epoch": 0.38889101119303204, "grad_norm": 0.3249836564064026, "learning_rate": 0.00014125649468579038, "loss": 0.8169, "step": 2545 }, { "epoch": 0.3890438170913397, "grad_norm": 0.30261462926864624, "learning_rate": 0.00014121102139921386, "loss": 0.9638, "step": 2546 }, { "epoch": 0.3891966229896474, "grad_norm": 0.27610981464385986, "learning_rate": 0.0001411655378452573, "loss": 0.6815, "step": 2547 }, { "epoch": 0.3893494288879551, "grad_norm": 0.2926682233810425, "learning_rate": 0.00014112004403525253, "loss": 0.5433, "step": 2548 }, { "epoch": 0.3895022347862627, "grad_norm": 0.4395153820514679, "learning_rate": 0.00014107453998053396, "loss": 0.7815, "step": 2549 }, { "epoch": 0.3896550406845704, "grad_norm": 0.287105530500412, "learning_rate": 0.00014102902569243855, "loss": 0.6523, "step": 2550 }, { "epoch": 0.3898078465828781, "grad_norm": 0.34142303466796875, "learning_rate": 0.0001409835011823058, "loss": 0.7796, "step": 2551 }, { "epoch": 0.3899606524811858, "grad_norm": 0.26499852538108826, "learning_rate": 0.0001409379664614777, "loss": 0.7323, "step": 2552 }, { "epoch": 0.39011345837949346, "grad_norm": 0.29665425419807434, "learning_rate": 0.00014089242154129898, "loss": 0.4781, "step": 2553 }, { "epoch": 0.39026626427780114, "grad_norm": 0.271915078163147, "learning_rate": 0.00014084686643311666, "loss": 0.6668, "step": 2554 }, { "epoch": 0.3904190701761088, "grad_norm": 0.2694081962108612, "learning_rate": 0.00014080130114828046, "loss": 0.7001, "step": 2555 }, { "epoch": 0.39057187607441646, "grad_norm": 0.3542138338088989, "learning_rate": 0.00014075572569814256, "loss": 0.701, "step": 2556 }, { "epoch": 0.39072468197272414, "grad_norm": 0.31724610924720764, "learning_rate": 0.0001407101400940577, "loss": 0.9051, "step": 2557 }, { "epoch": 0.3908774878710318, "grad_norm": 0.48276618123054504, "learning_rate": 0.00014066454434738318, "loss": 0.7013, "step": 2558 }, { "epoch": 0.3910302937693395, "grad_norm": 0.34542426466941833, "learning_rate": 0.0001406189384694788, "loss": 0.7639, "step": 2559 }, { "epoch": 0.3911830996676472, "grad_norm": 0.3011816143989563, "learning_rate": 0.00014057332247170685, "loss": 0.6921, "step": 2560 }, { "epoch": 0.3913359055659548, "grad_norm": 0.3123289942741394, "learning_rate": 0.0001405276963654322, "loss": 0.7067, "step": 2561 }, { "epoch": 0.3914887114642625, "grad_norm": 0.3149774670600891, "learning_rate": 0.0001404820601620222, "loss": 0.7665, "step": 2562 }, { "epoch": 0.3916415173625702, "grad_norm": 0.2675241231918335, "learning_rate": 0.0001404364138728467, "loss": 0.7803, "step": 2563 }, { "epoch": 0.3917943232608779, "grad_norm": 0.3044669032096863, "learning_rate": 0.00014039075750927813, "loss": 0.7445, "step": 2564 }, { "epoch": 0.39194712915918556, "grad_norm": 0.27285170555114746, "learning_rate": 0.00014034509108269138, "loss": 0.7312, "step": 2565 }, { "epoch": 0.39209993505749324, "grad_norm": 0.2783736288547516, "learning_rate": 0.00014029941460446389, "loss": 0.84, "step": 2566 }, { "epoch": 0.39225274095580087, "grad_norm": 0.3714994192123413, "learning_rate": 0.00014025372808597548, "loss": 0.6991, "step": 2567 }, { "epoch": 0.39240554685410856, "grad_norm": 0.28046417236328125, "learning_rate": 0.00014020803153860865, "loss": 0.6944, "step": 2568 }, { "epoch": 0.39255835275241624, "grad_norm": 0.28387904167175293, "learning_rate": 0.00014016232497374823, "loss": 0.7067, "step": 2569 }, { "epoch": 0.3927111586507239, "grad_norm": 0.3740023970603943, "learning_rate": 0.00014011660840278174, "loss": 0.7416, "step": 2570 }, { "epoch": 0.3928639645490316, "grad_norm": 0.5043659806251526, "learning_rate": 0.00014007088183709895, "loss": 0.7961, "step": 2571 }, { "epoch": 0.3930167704473393, "grad_norm": 0.3045665919780731, "learning_rate": 0.00014002514528809235, "loss": 0.6823, "step": 2572 }, { "epoch": 0.3931695763456469, "grad_norm": 0.23655778169631958, "learning_rate": 0.0001399793987671568, "loss": 0.651, "step": 2573 }, { "epoch": 0.3933223822439546, "grad_norm": 0.362617552280426, "learning_rate": 0.0001399336422856896, "loss": 0.6424, "step": 2574 }, { "epoch": 0.3934751881422623, "grad_norm": 0.2810218632221222, "learning_rate": 0.0001398878758550907, "loss": 0.5367, "step": 2575 }, { "epoch": 0.39362799404057, "grad_norm": 0.28525862097740173, "learning_rate": 0.00013984209948676233, "loss": 0.6672, "step": 2576 }, { "epoch": 0.39378079993887766, "grad_norm": 0.5437533259391785, "learning_rate": 0.00013979631319210932, "loss": 0.6273, "step": 2577 }, { "epoch": 0.39393360583718534, "grad_norm": 0.29662612080574036, "learning_rate": 0.0001397505169825389, "loss": 0.6011, "step": 2578 }, { "epoch": 0.39408641173549297, "grad_norm": 0.274076908826828, "learning_rate": 0.00013970471086946091, "loss": 0.6626, "step": 2579 }, { "epoch": 0.39423921763380065, "grad_norm": 0.26067155599594116, "learning_rate": 0.00013965889486428743, "loss": 0.666, "step": 2580 }, { "epoch": 0.39439202353210834, "grad_norm": 0.335151731967926, "learning_rate": 0.00013961306897843328, "loss": 0.7958, "step": 2581 }, { "epoch": 0.394544829430416, "grad_norm": 0.29889029264450073, "learning_rate": 0.00013956723322331544, "loss": 0.6133, "step": 2582 }, { "epoch": 0.3946976353287237, "grad_norm": 0.28185123205184937, "learning_rate": 0.00013952138761035363, "loss": 0.6197, "step": 2583 }, { "epoch": 0.3948504412270314, "grad_norm": 0.2703631520271301, "learning_rate": 0.00013947553215096982, "loss": 0.7928, "step": 2584 }, { "epoch": 0.395003247125339, "grad_norm": 0.3054632544517517, "learning_rate": 0.00013942966685658855, "loss": 0.6414, "step": 2585 }, { "epoch": 0.3951560530236467, "grad_norm": 0.28667205572128296, "learning_rate": 0.00013938379173863679, "loss": 0.6163, "step": 2586 }, { "epoch": 0.3953088589219544, "grad_norm": 0.31871435046195984, "learning_rate": 0.00013933790680854387, "loss": 0.7422, "step": 2587 }, { "epoch": 0.39546166482026207, "grad_norm": 0.2837061285972595, "learning_rate": 0.0001392920120777417, "loss": 0.7655, "step": 2588 }, { "epoch": 0.39561447071856976, "grad_norm": 0.2924594581127167, "learning_rate": 0.00013924610755766456, "loss": 0.5866, "step": 2589 }, { "epoch": 0.39576727661687744, "grad_norm": 0.27115708589553833, "learning_rate": 0.00013920019325974916, "loss": 0.9004, "step": 2590 }, { "epoch": 0.39592008251518507, "grad_norm": 0.3006618916988373, "learning_rate": 0.00013915426919543466, "loss": 0.8016, "step": 2591 }, { "epoch": 0.39607288841349275, "grad_norm": 0.3554551601409912, "learning_rate": 0.00013910833537616264, "loss": 0.5658, "step": 2592 }, { "epoch": 0.39622569431180044, "grad_norm": 0.35639873147010803, "learning_rate": 0.00013906239181337717, "loss": 0.5948, "step": 2593 }, { "epoch": 0.3963785002101081, "grad_norm": 0.2902330160140991, "learning_rate": 0.0001390164385185247, "loss": 0.6889, "step": 2594 }, { "epoch": 0.3965313061084158, "grad_norm": 0.3317681550979614, "learning_rate": 0.00013897047550305404, "loss": 0.6601, "step": 2595 }, { "epoch": 0.39668411200672343, "grad_norm": 0.3392220139503479, "learning_rate": 0.00013892450277841654, "loss": 0.7833, "step": 2596 }, { "epoch": 0.3968369179050311, "grad_norm": 0.30571088194847107, "learning_rate": 0.00013887852035606596, "loss": 0.5213, "step": 2597 }, { "epoch": 0.3969897238033388, "grad_norm": 0.3831685781478882, "learning_rate": 0.00013883252824745834, "loss": 0.7385, "step": 2598 }, { "epoch": 0.3971425297016465, "grad_norm": 0.5502047538757324, "learning_rate": 0.0001387865264640523, "loss": 0.6916, "step": 2599 }, { "epoch": 0.39729533559995417, "grad_norm": 0.2634164094924927, "learning_rate": 0.0001387405150173088, "loss": 0.6008, "step": 2600 }, { "epoch": 0.39744814149826185, "grad_norm": 0.4813648760318756, "learning_rate": 0.00013869449391869113, "loss": 0.8057, "step": 2601 }, { "epoch": 0.3976009473965695, "grad_norm": 0.26484498381614685, "learning_rate": 0.00013864846317966515, "loss": 0.647, "step": 2602 }, { "epoch": 0.39775375329487717, "grad_norm": 0.2711394131183624, "learning_rate": 0.00013860242281169897, "loss": 0.647, "step": 2603 }, { "epoch": 0.39790655919318485, "grad_norm": 0.4527345597743988, "learning_rate": 0.00013855637282626318, "loss": 0.804, "step": 2604 }, { "epoch": 0.39805936509149253, "grad_norm": 0.3270074427127838, "learning_rate": 0.00013851031323483076, "loss": 0.7399, "step": 2605 }, { "epoch": 0.3982121709898002, "grad_norm": 0.34323227405548096, "learning_rate": 0.0001384642440488771, "loss": 0.6582, "step": 2606 }, { "epoch": 0.3983649768881079, "grad_norm": 0.2863471210002899, "learning_rate": 0.00013841816527987986, "loss": 0.4844, "step": 2607 }, { "epoch": 0.39851778278641553, "grad_norm": 0.3035363256931305, "learning_rate": 0.00013837207693931925, "loss": 0.748, "step": 2608 }, { "epoch": 0.3986705886847232, "grad_norm": 0.31653252243995667, "learning_rate": 0.00013832597903867775, "loss": 0.7071, "step": 2609 }, { "epoch": 0.3988233945830309, "grad_norm": 0.30605781078338623, "learning_rate": 0.00013827987158944035, "loss": 0.6058, "step": 2610 }, { "epoch": 0.3989762004813386, "grad_norm": 0.37771061062812805, "learning_rate": 0.00013823375460309423, "loss": 0.6425, "step": 2611 }, { "epoch": 0.39912900637964627, "grad_norm": 0.26124832034111023, "learning_rate": 0.0001381876280911291, "loss": 0.6056, "step": 2612 }, { "epoch": 0.39928181227795395, "grad_norm": 0.30022165179252625, "learning_rate": 0.000138141492065037, "loss": 0.6605, "step": 2613 }, { "epoch": 0.3994346181762616, "grad_norm": 0.34445032477378845, "learning_rate": 0.00013809534653631237, "loss": 0.6027, "step": 2614 }, { "epoch": 0.39958742407456926, "grad_norm": 0.27877411246299744, "learning_rate": 0.00013804919151645182, "loss": 0.805, "step": 2615 }, { "epoch": 0.39974022997287695, "grad_norm": 0.2860463559627533, "learning_rate": 0.00013800302701695469, "loss": 0.6378, "step": 2616 }, { "epoch": 0.39989303587118463, "grad_norm": 0.3459800183773041, "learning_rate": 0.00013795685304932232, "loss": 0.5781, "step": 2617 }, { "epoch": 0.4000458417694923, "grad_norm": 3.20552659034729, "learning_rate": 0.00013791066962505868, "loss": 0.6375, "step": 2618 }, { "epoch": 0.4001986476678, "grad_norm": 0.30947524309158325, "learning_rate": 0.0001378644767556699, "loss": 0.842, "step": 2619 }, { "epoch": 0.40035145356610763, "grad_norm": 1.742050290107727, "learning_rate": 0.0001378182744526646, "loss": 0.8278, "step": 2620 }, { "epoch": 0.4005042594644153, "grad_norm": 0.2930509150028229, "learning_rate": 0.0001377720627275537, "loss": 0.7768, "step": 2621 }, { "epoch": 0.400657065362723, "grad_norm": 0.3572491407394409, "learning_rate": 0.00013772584159185038, "loss": 0.7051, "step": 2622 }, { "epoch": 0.4008098712610307, "grad_norm": 0.2924429178237915, "learning_rate": 0.00013767961105707035, "loss": 0.6823, "step": 2623 }, { "epoch": 0.40096267715933837, "grad_norm": 0.3206632733345032, "learning_rate": 0.0001376333711347315, "loss": 0.6994, "step": 2624 }, { "epoch": 0.40111548305764605, "grad_norm": 0.27822092175483704, "learning_rate": 0.00013758712183635415, "loss": 0.7541, "step": 2625 }, { "epoch": 0.4012682889559537, "grad_norm": 0.2822110056877136, "learning_rate": 0.0001375408631734609, "loss": 0.7284, "step": 2626 }, { "epoch": 0.40142109485426136, "grad_norm": 0.26143571734428406, "learning_rate": 0.00013749459515757673, "loss": 0.7453, "step": 2627 }, { "epoch": 0.40157390075256905, "grad_norm": 0.27988147735595703, "learning_rate": 0.0001374483178002289, "loss": 0.8043, "step": 2628 }, { "epoch": 0.40172670665087673, "grad_norm": 0.31378600001335144, "learning_rate": 0.00013740203111294703, "loss": 0.6827, "step": 2629 }, { "epoch": 0.4018795125491844, "grad_norm": 0.30002671480178833, "learning_rate": 0.0001373557351072631, "loss": 0.818, "step": 2630 }, { "epoch": 0.4020323184474921, "grad_norm": 0.3368836045265198, "learning_rate": 0.0001373094297947113, "loss": 0.682, "step": 2631 }, { "epoch": 0.4021851243457997, "grad_norm": 0.3038204610347748, "learning_rate": 0.00013726311518682827, "loss": 0.5353, "step": 2632 }, { "epoch": 0.4023379302441074, "grad_norm": 0.31147778034210205, "learning_rate": 0.0001372167912951529, "loss": 0.7181, "step": 2633 }, { "epoch": 0.4024907361424151, "grad_norm": 0.34573498368263245, "learning_rate": 0.00013717045813122639, "loss": 0.649, "step": 2634 }, { "epoch": 0.4026435420407228, "grad_norm": 0.29101598262786865, "learning_rate": 0.00013712411570659223, "loss": 0.745, "step": 2635 }, { "epoch": 0.40279634793903046, "grad_norm": 0.4098125398159027, "learning_rate": 0.00013707776403279627, "loss": 0.4228, "step": 2636 }, { "epoch": 0.40294915383733815, "grad_norm": 0.29835259914398193, "learning_rate": 0.00013703140312138666, "loss": 0.75, "step": 2637 }, { "epoch": 0.4031019597356458, "grad_norm": 0.2908041477203369, "learning_rate": 0.00013698503298391384, "loss": 0.5537, "step": 2638 }, { "epoch": 0.40325476563395346, "grad_norm": 0.34891489148139954, "learning_rate": 0.00013693865363193045, "loss": 0.7482, "step": 2639 }, { "epoch": 0.40340757153226114, "grad_norm": 0.3375150263309479, "learning_rate": 0.0001368922650769916, "loss": 0.65, "step": 2640 }, { "epoch": 0.40356037743056883, "grad_norm": 0.25705015659332275, "learning_rate": 0.00013684586733065464, "loss": 0.5924, "step": 2641 }, { "epoch": 0.4037131833288765, "grad_norm": 0.4616255760192871, "learning_rate": 0.00013679946040447906, "loss": 0.8208, "step": 2642 }, { "epoch": 0.4038659892271842, "grad_norm": 0.332537978887558, "learning_rate": 0.00013675304431002688, "loss": 0.7843, "step": 2643 }, { "epoch": 0.4040187951254918, "grad_norm": 0.28260141611099243, "learning_rate": 0.00013670661905886217, "loss": 0.6487, "step": 2644 }, { "epoch": 0.4041716010237995, "grad_norm": 0.2809610366821289, "learning_rate": 0.00013666018466255148, "loss": 0.6374, "step": 2645 }, { "epoch": 0.4043244069221072, "grad_norm": 0.26803719997406006, "learning_rate": 0.0001366137411326635, "loss": 0.5907, "step": 2646 }, { "epoch": 0.4044772128204149, "grad_norm": 0.3342551589012146, "learning_rate": 0.00013656728848076928, "loss": 0.7485, "step": 2647 }, { "epoch": 0.40463001871872256, "grad_norm": 0.26108020544052124, "learning_rate": 0.00013652082671844205, "loss": 0.569, "step": 2648 }, { "epoch": 0.4047828246170302, "grad_norm": 0.2628275454044342, "learning_rate": 0.00013647435585725746, "loss": 0.5711, "step": 2649 }, { "epoch": 0.4049356305153379, "grad_norm": 0.42528602480888367, "learning_rate": 0.00013642787590879325, "loss": 0.8466, "step": 2650 }, { "epoch": 0.40508843641364556, "grad_norm": 0.29589298367500305, "learning_rate": 0.00013638138688462957, "loss": 0.6615, "step": 2651 }, { "epoch": 0.40524124231195324, "grad_norm": 0.2670883238315582, "learning_rate": 0.0001363348887963487, "loss": 0.5713, "step": 2652 }, { "epoch": 0.4053940482102609, "grad_norm": 0.27636924386024475, "learning_rate": 0.00013628838165553533, "loss": 0.8193, "step": 2653 }, { "epoch": 0.4055468541085686, "grad_norm": 0.3108629584312439, "learning_rate": 0.00013624186547377628, "loss": 0.6917, "step": 2654 }, { "epoch": 0.40569966000687624, "grad_norm": 0.32495611906051636, "learning_rate": 0.00013619534026266064, "loss": 0.8104, "step": 2655 }, { "epoch": 0.4058524659051839, "grad_norm": 0.4790588617324829, "learning_rate": 0.00013614880603377979, "loss": 0.6834, "step": 2656 }, { "epoch": 0.4060052718034916, "grad_norm": 0.2443542331457138, "learning_rate": 0.0001361022627987274, "loss": 0.5103, "step": 2657 }, { "epoch": 0.4061580777017993, "grad_norm": 0.3075079321861267, "learning_rate": 0.0001360557105690993, "loss": 0.5277, "step": 2658 }, { "epoch": 0.406310883600107, "grad_norm": 0.2426033914089203, "learning_rate": 0.00013600914935649354, "loss": 0.6479, "step": 2659 }, { "epoch": 0.40646368949841466, "grad_norm": 0.3688972592353821, "learning_rate": 0.0001359625791725105, "loss": 0.6978, "step": 2660 }, { "epoch": 0.4066164953967223, "grad_norm": 0.31058305501937866, "learning_rate": 0.00013591600002875272, "loss": 0.5861, "step": 2661 }, { "epoch": 0.40676930129503, "grad_norm": 0.3048050105571747, "learning_rate": 0.00013586941193682506, "loss": 0.5262, "step": 2662 }, { "epoch": 0.40692210719333766, "grad_norm": 0.25697362422943115, "learning_rate": 0.00013582281490833446, "loss": 0.6828, "step": 2663 }, { "epoch": 0.40707491309164534, "grad_norm": 0.28246739506721497, "learning_rate": 0.00013577620895489028, "loss": 0.6223, "step": 2664 }, { "epoch": 0.407227718989953, "grad_norm": 0.3349422216415405, "learning_rate": 0.0001357295940881039, "loss": 0.5698, "step": 2665 }, { "epoch": 0.4073805248882607, "grad_norm": 0.3754185140132904, "learning_rate": 0.00013568297031958912, "loss": 0.6353, "step": 2666 }, { "epoch": 0.40753333078656834, "grad_norm": 0.26379403471946716, "learning_rate": 0.00013563633766096179, "loss": 0.6373, "step": 2667 }, { "epoch": 0.407686136684876, "grad_norm": 0.388322651386261, "learning_rate": 0.00013558969612384008, "loss": 0.709, "step": 2668 }, { "epoch": 0.4078389425831837, "grad_norm": 0.28406521677970886, "learning_rate": 0.00013554304571984437, "loss": 0.7033, "step": 2669 }, { "epoch": 0.4079917484814914, "grad_norm": 0.293530136346817, "learning_rate": 0.00013549638646059712, "loss": 0.6015, "step": 2670 }, { "epoch": 0.4081445543797991, "grad_norm": 0.6281304359436035, "learning_rate": 0.0001354497183577232, "loss": 0.5931, "step": 2671 }, { "epoch": 0.40829736027810676, "grad_norm": 0.2629290521144867, "learning_rate": 0.00013540304142284945, "loss": 0.6174, "step": 2672 }, { "epoch": 0.4084501661764144, "grad_norm": 0.7760477662086487, "learning_rate": 0.00013535635566760517, "loss": 0.5549, "step": 2673 }, { "epoch": 0.40860297207472207, "grad_norm": 0.2590596377849579, "learning_rate": 0.00013530966110362165, "loss": 0.6117, "step": 2674 }, { "epoch": 0.40875577797302975, "grad_norm": 0.24774251878261566, "learning_rate": 0.00013526295774253248, "loss": 0.4853, "step": 2675 }, { "epoch": 0.40890858387133744, "grad_norm": 0.5391387939453125, "learning_rate": 0.00013521624559597337, "loss": 0.7386, "step": 2676 }, { "epoch": 0.4090613897696451, "grad_norm": 0.2838054299354553, "learning_rate": 0.0001351695246755823, "loss": 0.7681, "step": 2677 }, { "epoch": 0.4092141956679528, "grad_norm": 0.28494569659233093, "learning_rate": 0.00013512279499299935, "loss": 0.7326, "step": 2678 }, { "epoch": 0.40936700156626044, "grad_norm": 0.27946600317955017, "learning_rate": 0.0001350760565598669, "loss": 0.7292, "step": 2679 }, { "epoch": 0.4095198074645681, "grad_norm": 0.3265629708766937, "learning_rate": 0.00013502930938782937, "loss": 0.7616, "step": 2680 }, { "epoch": 0.4096726133628758, "grad_norm": 0.3024129867553711, "learning_rate": 0.00013498255348853342, "loss": 0.7034, "step": 2681 }, { "epoch": 0.4098254192611835, "grad_norm": 0.33738934993743896, "learning_rate": 0.00013493578887362797, "loss": 0.8399, "step": 2682 }, { "epoch": 0.40997822515949117, "grad_norm": 0.28026413917541504, "learning_rate": 0.00013488901555476395, "loss": 0.6149, "step": 2683 }, { "epoch": 0.41013103105779886, "grad_norm": 0.2940625548362732, "learning_rate": 0.0001348422335435946, "loss": 0.583, "step": 2684 }, { "epoch": 0.4102838369561065, "grad_norm": 0.30337145924568176, "learning_rate": 0.00013479544285177524, "loss": 0.6673, "step": 2685 }, { "epoch": 0.41043664285441417, "grad_norm": 0.27657046914100647, "learning_rate": 0.00013474864349096333, "loss": 0.652, "step": 2686 }, { "epoch": 0.41058944875272185, "grad_norm": 0.2847646474838257, "learning_rate": 0.00013470183547281862, "loss": 0.8841, "step": 2687 }, { "epoch": 0.41074225465102954, "grad_norm": 0.28591471910476685, "learning_rate": 0.0001346550188090029, "loss": 0.7805, "step": 2688 }, { "epoch": 0.4108950605493372, "grad_norm": 0.2631331980228424, "learning_rate": 0.00013460819351118013, "loss": 0.5712, "step": 2689 }, { "epoch": 0.4110478664476449, "grad_norm": 0.2960440218448639, "learning_rate": 0.00013456135959101644, "loss": 0.5894, "step": 2690 }, { "epoch": 0.41120067234595253, "grad_norm": 0.2985890507698059, "learning_rate": 0.00013451451706018017, "loss": 0.7551, "step": 2691 }, { "epoch": 0.4113534782442602, "grad_norm": 0.3147627115249634, "learning_rate": 0.00013446766593034167, "loss": 0.6941, "step": 2692 }, { "epoch": 0.4115062841425679, "grad_norm": 0.3025978207588196, "learning_rate": 0.00013442080621317354, "loss": 0.8067, "step": 2693 }, { "epoch": 0.4116590900408756, "grad_norm": 0.4465163052082062, "learning_rate": 0.00013437393792035046, "loss": 0.7604, "step": 2694 }, { "epoch": 0.41181189593918327, "grad_norm": 0.27825966477394104, "learning_rate": 0.00013432706106354932, "loss": 0.5674, "step": 2695 }, { "epoch": 0.41196470183749095, "grad_norm": 0.3376471698284149, "learning_rate": 0.00013428017565444904, "loss": 0.4975, "step": 2696 }, { "epoch": 0.4121175077357986, "grad_norm": 0.3866771459579468, "learning_rate": 0.00013423328170473076, "loss": 0.8205, "step": 2697 }, { "epoch": 0.41227031363410627, "grad_norm": 0.30912765860557556, "learning_rate": 0.0001341863792260777, "loss": 0.6016, "step": 2698 }, { "epoch": 0.41242311953241395, "grad_norm": 0.3049004077911377, "learning_rate": 0.00013413946823017528, "loss": 0.7691, "step": 2699 }, { "epoch": 0.41257592543072164, "grad_norm": 0.2947705090045929, "learning_rate": 0.00013409254872871084, "loss": 0.7282, "step": 2700 }, { "epoch": 0.4127287313290293, "grad_norm": 0.31972742080688477, "learning_rate": 0.00013404562073337413, "loss": 0.606, "step": 2701 }, { "epoch": 0.41288153722733695, "grad_norm": 0.33832690119743347, "learning_rate": 0.00013399868425585676, "loss": 0.6662, "step": 2702 }, { "epoch": 0.41303434312564463, "grad_norm": 0.36278048157691956, "learning_rate": 0.00013395173930785261, "loss": 0.807, "step": 2703 }, { "epoch": 0.4131871490239523, "grad_norm": 0.28764086961746216, "learning_rate": 0.00013390478590105762, "loss": 0.6506, "step": 2704 }, { "epoch": 0.41333995492226, "grad_norm": 0.2584592401981354, "learning_rate": 0.00013385782404716983, "loss": 0.4927, "step": 2705 }, { "epoch": 0.4134927608205677, "grad_norm": 0.32915428280830383, "learning_rate": 0.00013381085375788939, "loss": 0.6524, "step": 2706 }, { "epoch": 0.41364556671887537, "grad_norm": 0.29130539298057556, "learning_rate": 0.00013376387504491854, "loss": 0.5596, "step": 2707 }, { "epoch": 0.413798372617183, "grad_norm": 0.293236643075943, "learning_rate": 0.00013371688791996168, "loss": 0.7917, "step": 2708 }, { "epoch": 0.4139511785154907, "grad_norm": 0.34430694580078125, "learning_rate": 0.00013366989239472517, "loss": 0.6377, "step": 2709 }, { "epoch": 0.41410398441379836, "grad_norm": 0.3663583993911743, "learning_rate": 0.00013362288848091765, "loss": 0.7631, "step": 2710 }, { "epoch": 0.41425679031210605, "grad_norm": 0.41205117106437683, "learning_rate": 0.00013357587619024965, "loss": 0.8005, "step": 2711 }, { "epoch": 0.41440959621041373, "grad_norm": 0.3227595388889313, "learning_rate": 0.00013352885553443399, "loss": 0.614, "step": 2712 }, { "epoch": 0.4145624021087214, "grad_norm": 0.3226085305213928, "learning_rate": 0.0001334818265251854, "loss": 0.679, "step": 2713 }, { "epoch": 0.41471520800702905, "grad_norm": 0.3061399459838867, "learning_rate": 0.00013343478917422077, "loss": 0.6796, "step": 2714 }, { "epoch": 0.41486801390533673, "grad_norm": 0.3472737669944763, "learning_rate": 0.00013338774349325912, "loss": 0.8371, "step": 2715 }, { "epoch": 0.4150208198036444, "grad_norm": 0.33723995089530945, "learning_rate": 0.00013334068949402141, "loss": 0.7688, "step": 2716 }, { "epoch": 0.4151736257019521, "grad_norm": 0.24828742444515228, "learning_rate": 0.0001332936271882308, "loss": 0.8392, "step": 2717 }, { "epoch": 0.4153264316002598, "grad_norm": 0.36368826031684875, "learning_rate": 0.00013324655658761246, "loss": 0.5834, "step": 2718 }, { "epoch": 0.41547923749856747, "grad_norm": 0.27683743834495544, "learning_rate": 0.00013319947770389364, "loss": 0.6469, "step": 2719 }, { "epoch": 0.4156320433968751, "grad_norm": 0.31676921248435974, "learning_rate": 0.00013315239054880354, "loss": 0.7522, "step": 2720 }, { "epoch": 0.4157848492951828, "grad_norm": 0.3366953134536743, "learning_rate": 0.00013310529513407374, "loss": 0.7079, "step": 2721 }, { "epoch": 0.41593765519349046, "grad_norm": 0.36535075306892395, "learning_rate": 0.00013305819147143747, "loss": 0.8439, "step": 2722 }, { "epoch": 0.41609046109179815, "grad_norm": 0.3025410771369934, "learning_rate": 0.00013301107957263035, "loss": 0.6305, "step": 2723 }, { "epoch": 0.41624326699010583, "grad_norm": 0.32860392332077026, "learning_rate": 0.00013296395944938983, "loss": 0.6886, "step": 2724 }, { "epoch": 0.4163960728884135, "grad_norm": 0.6233853697776794, "learning_rate": 0.00013291683111345552, "loss": 0.6825, "step": 2725 }, { "epoch": 0.41654887878672114, "grad_norm": 0.26388707756996155, "learning_rate": 0.00013286969457656906, "loss": 0.848, "step": 2726 }, { "epoch": 0.41670168468502883, "grad_norm": 0.30272993445396423, "learning_rate": 0.0001328225498504741, "loss": 0.5376, "step": 2727 }, { "epoch": 0.4168544905833365, "grad_norm": 0.283623605966568, "learning_rate": 0.00013277539694691635, "loss": 0.6954, "step": 2728 }, { "epoch": 0.4170072964816442, "grad_norm": 0.3102206885814667, "learning_rate": 0.0001327282358776436, "loss": 0.6589, "step": 2729 }, { "epoch": 0.4171601023799519, "grad_norm": 0.31666800379753113, "learning_rate": 0.0001326810666544056, "loss": 0.7268, "step": 2730 }, { "epoch": 0.41731290827825956, "grad_norm": 0.3100956380367279, "learning_rate": 0.0001326338892889542, "loss": 0.6853, "step": 2731 }, { "epoch": 0.4174657141765672, "grad_norm": 0.2755642533302307, "learning_rate": 0.00013258670379304318, "loss": 0.6981, "step": 2732 }, { "epoch": 0.4176185200748749, "grad_norm": 0.3609178960323334, "learning_rate": 0.0001325395101784285, "loss": 0.6268, "step": 2733 }, { "epoch": 0.41777132597318256, "grad_norm": 0.30639350414276123, "learning_rate": 0.00013249230845686796, "loss": 0.6244, "step": 2734 }, { "epoch": 0.41792413187149025, "grad_norm": 0.3014542758464813, "learning_rate": 0.00013244509864012154, "loss": 0.7873, "step": 2735 }, { "epoch": 0.41807693776979793, "grad_norm": 0.31064677238464355, "learning_rate": 0.00013239788073995113, "loss": 0.8028, "step": 2736 }, { "epoch": 0.4182297436681056, "grad_norm": 0.36065420508384705, "learning_rate": 0.0001323506547681207, "loss": 0.738, "step": 2737 }, { "epoch": 0.41838254956641324, "grad_norm": 0.8031928539276123, "learning_rate": 0.0001323034207363962, "loss": 0.7281, "step": 2738 }, { "epoch": 0.4185353554647209, "grad_norm": 0.3118455708026886, "learning_rate": 0.0001322561786565456, "loss": 0.7196, "step": 2739 }, { "epoch": 0.4186881613630286, "grad_norm": 0.7187873721122742, "learning_rate": 0.0001322089285403388, "loss": 0.6527, "step": 2740 }, { "epoch": 0.4188409672613363, "grad_norm": 0.31127819418907166, "learning_rate": 0.00013216167039954786, "loss": 0.7949, "step": 2741 }, { "epoch": 0.418993773159644, "grad_norm": 0.2882727086544037, "learning_rate": 0.0001321144042459467, "loss": 0.7491, "step": 2742 }, { "epoch": 0.41914657905795166, "grad_norm": 0.2354152351617813, "learning_rate": 0.00013206713009131132, "loss": 0.6037, "step": 2743 }, { "epoch": 0.4192993849562593, "grad_norm": 0.32294926047325134, "learning_rate": 0.00013201984794741965, "loss": 0.5798, "step": 2744 }, { "epoch": 0.419452190854567, "grad_norm": 0.3169757127761841, "learning_rate": 0.00013197255782605163, "loss": 0.6941, "step": 2745 }, { "epoch": 0.41960499675287466, "grad_norm": 0.28408145904541016, "learning_rate": 0.0001319252597389892, "loss": 0.719, "step": 2746 }, { "epoch": 0.41975780265118234, "grad_norm": 0.3238529860973358, "learning_rate": 0.00013187795369801634, "loss": 0.7321, "step": 2747 }, { "epoch": 0.41991060854949, "grad_norm": 0.29003897309303284, "learning_rate": 0.00013183063971491889, "loss": 0.7617, "step": 2748 }, { "epoch": 0.4200634144477977, "grad_norm": 0.3293977677822113, "learning_rate": 0.00013178331780148474, "loss": 0.8128, "step": 2749 }, { "epoch": 0.42021622034610534, "grad_norm": 0.5713270902633667, "learning_rate": 0.00013173598796950375, "loss": 0.6511, "step": 2750 }, { "epoch": 0.420369026244413, "grad_norm": 0.39325082302093506, "learning_rate": 0.00013168865023076778, "loss": 0.773, "step": 2751 }, { "epoch": 0.4205218321427207, "grad_norm": 0.31554874777793884, "learning_rate": 0.00013164130459707057, "loss": 0.6337, "step": 2752 }, { "epoch": 0.4206746380410284, "grad_norm": 0.5043233036994934, "learning_rate": 0.00013159395108020797, "loss": 0.9153, "step": 2753 }, { "epoch": 0.4208274439393361, "grad_norm": 0.3268618583679199, "learning_rate": 0.00013154658969197767, "loss": 1.1463, "step": 2754 }, { "epoch": 0.42098024983764376, "grad_norm": 0.25591176748275757, "learning_rate": 0.0001314992204441793, "loss": 0.6545, "step": 2755 }, { "epoch": 0.4211330557359514, "grad_norm": 0.31433895230293274, "learning_rate": 0.00013145184334861462, "loss": 0.7142, "step": 2756 }, { "epoch": 0.4212858616342591, "grad_norm": 0.33507999777793884, "learning_rate": 0.00013140445841708715, "loss": 0.8627, "step": 2757 }, { "epoch": 0.42143866753256676, "grad_norm": 0.2889297604560852, "learning_rate": 0.0001313570656614025, "loss": 0.6666, "step": 2758 }, { "epoch": 0.42159147343087444, "grad_norm": 0.34413883090019226, "learning_rate": 0.00013130966509336816, "loss": 0.7164, "step": 2759 }, { "epoch": 0.4217442793291821, "grad_norm": 0.302048921585083, "learning_rate": 0.0001312622567247936, "loss": 0.524, "step": 2760 }, { "epoch": 0.42189708522748975, "grad_norm": 0.30027899146080017, "learning_rate": 0.00013121484056749017, "loss": 0.6239, "step": 2761 }, { "epoch": 0.42204989112579744, "grad_norm": 0.29137787222862244, "learning_rate": 0.00013116741663327124, "loss": 0.6321, "step": 2762 }, { "epoch": 0.4222026970241051, "grad_norm": 0.2735944986343384, "learning_rate": 0.0001311199849339521, "loss": 0.7902, "step": 2763 }, { "epoch": 0.4223555029224128, "grad_norm": 0.2861863970756531, "learning_rate": 0.0001310725454813499, "loss": 0.5792, "step": 2764 }, { "epoch": 0.4225083088207205, "grad_norm": 0.39001768827438354, "learning_rate": 0.00013102509828728388, "loss": 0.7467, "step": 2765 }, { "epoch": 0.4226611147190282, "grad_norm": 0.28480300307273865, "learning_rate": 0.00013097764336357504, "loss": 0.7347, "step": 2766 }, { "epoch": 0.4228139206173358, "grad_norm": 0.3337126076221466, "learning_rate": 0.0001309301807220464, "loss": 0.7443, "step": 2767 }, { "epoch": 0.4229667265156435, "grad_norm": 0.27228620648384094, "learning_rate": 0.0001308827103745228, "loss": 0.7698, "step": 2768 }, { "epoch": 0.42311953241395117, "grad_norm": 0.288928747177124, "learning_rate": 0.00013083523233283124, "loss": 0.6021, "step": 2769 }, { "epoch": 0.42327233831225886, "grad_norm": 0.3781624138355255, "learning_rate": 0.00013078774660880033, "loss": 0.7343, "step": 2770 }, { "epoch": 0.42342514421056654, "grad_norm": 0.354174941778183, "learning_rate": 0.0001307402532142608, "loss": 0.585, "step": 2771 }, { "epoch": 0.4235779501088742, "grad_norm": 0.31593069434165955, "learning_rate": 0.00013069275216104521, "loss": 0.8165, "step": 2772 }, { "epoch": 0.42373075600718185, "grad_norm": 0.4373694956302643, "learning_rate": 0.00013064524346098808, "loss": 0.6946, "step": 2773 }, { "epoch": 0.42388356190548954, "grad_norm": 0.3665942847728729, "learning_rate": 0.00013059772712592578, "loss": 0.7237, "step": 2774 }, { "epoch": 0.4240363678037972, "grad_norm": 0.3371705412864685, "learning_rate": 0.0001305502031676966, "loss": 1.1382, "step": 2775 }, { "epoch": 0.4241891737021049, "grad_norm": 0.2612996995449066, "learning_rate": 0.0001305026715981408, "loss": 0.7192, "step": 2776 }, { "epoch": 0.4243419796004126, "grad_norm": 0.26436302065849304, "learning_rate": 0.00013045513242910032, "loss": 0.7453, "step": 2777 }, { "epoch": 0.4244947854987203, "grad_norm": 0.3628009855747223, "learning_rate": 0.00013040758567241933, "loss": 0.5983, "step": 2778 }, { "epoch": 0.4246475913970279, "grad_norm": 0.32881370186805725, "learning_rate": 0.0001303600313399436, "loss": 0.7943, "step": 2779 }, { "epoch": 0.4248003972953356, "grad_norm": 0.30668795108795166, "learning_rate": 0.0001303124694435209, "loss": 0.6752, "step": 2780 }, { "epoch": 0.42495320319364327, "grad_norm": 0.3583773672580719, "learning_rate": 0.00013026489999500086, "loss": 0.6052, "step": 2781 }, { "epoch": 0.42510600909195095, "grad_norm": 0.3048308491706848, "learning_rate": 0.00013021732300623508, "loss": 0.5376, "step": 2782 }, { "epoch": 0.42525881499025864, "grad_norm": 0.3251284658908844, "learning_rate": 0.0001301697384890769, "loss": 0.5928, "step": 2783 }, { "epoch": 0.4254116208885663, "grad_norm": 0.3076835870742798, "learning_rate": 0.00013012214645538163, "loss": 0.605, "step": 2784 }, { "epoch": 0.42556442678687395, "grad_norm": 0.24366116523742676, "learning_rate": 0.00013007454691700644, "loss": 0.7375, "step": 2785 }, { "epoch": 0.42571723268518163, "grad_norm": 0.3150011897087097, "learning_rate": 0.00013002693988581034, "loss": 0.7008, "step": 2786 }, { "epoch": 0.4258700385834893, "grad_norm": 0.26339590549468994, "learning_rate": 0.0001299793253736542, "loss": 0.5094, "step": 2787 }, { "epoch": 0.426022844481797, "grad_norm": 0.31079646944999695, "learning_rate": 0.00012993170339240082, "loss": 0.5418, "step": 2788 }, { "epoch": 0.4261756503801047, "grad_norm": 0.3271988332271576, "learning_rate": 0.00012988407395391477, "loss": 0.7808, "step": 2789 }, { "epoch": 0.42632845627841237, "grad_norm": 2.406322956085205, "learning_rate": 0.00012983643707006258, "loss": 0.6183, "step": 2790 }, { "epoch": 0.42648126217672, "grad_norm": 0.28407543897628784, "learning_rate": 0.00012978879275271253, "loss": 0.5404, "step": 2791 }, { "epoch": 0.4266340680750277, "grad_norm": 0.2678498923778534, "learning_rate": 0.0001297411410137348, "loss": 0.8858, "step": 2792 }, { "epoch": 0.42678687397333537, "grad_norm": 0.3212999999523163, "learning_rate": 0.00012969348186500147, "loss": 0.6627, "step": 2793 }, { "epoch": 0.42693967987164305, "grad_norm": 0.2900780737400055, "learning_rate": 0.00012964581531838636, "loss": 0.5738, "step": 2794 }, { "epoch": 0.42709248576995074, "grad_norm": 0.3582835793495178, "learning_rate": 0.00012959814138576524, "loss": 0.6817, "step": 2795 }, { "epoch": 0.4272452916682584, "grad_norm": 0.5339453220367432, "learning_rate": 0.00012955046007901563, "loss": 0.5825, "step": 2796 }, { "epoch": 0.42739809756656605, "grad_norm": 0.3053556978702545, "learning_rate": 0.00012950277141001695, "loss": 0.9986, "step": 2797 }, { "epoch": 0.42755090346487373, "grad_norm": 0.29578697681427, "learning_rate": 0.00012945507539065046, "loss": 0.7364, "step": 2798 }, { "epoch": 0.4277037093631814, "grad_norm": 0.2910451292991638, "learning_rate": 0.00012940737203279916, "loss": 0.7474, "step": 2799 }, { "epoch": 0.4278565152614891, "grad_norm": 0.32356956601142883, "learning_rate": 0.00012935966134834797, "loss": 0.7036, "step": 2800 }, { "epoch": 0.4280093211597968, "grad_norm": 0.3562777042388916, "learning_rate": 0.0001293119433491836, "loss": 0.6056, "step": 2801 }, { "epoch": 0.42816212705810447, "grad_norm": 0.2624085247516632, "learning_rate": 0.0001292642180471946, "loss": 0.6888, "step": 2802 }, { "epoch": 0.4283149329564121, "grad_norm": 0.307565838098526, "learning_rate": 0.00012921648545427135, "loss": 0.6338, "step": 2803 }, { "epoch": 0.4284677388547198, "grad_norm": 0.334005743265152, "learning_rate": 0.00012916874558230597, "loss": 0.5713, "step": 2804 }, { "epoch": 0.42862054475302747, "grad_norm": 0.2838148772716522, "learning_rate": 0.00012912099844319247, "loss": 0.5971, "step": 2805 }, { "epoch": 0.42877335065133515, "grad_norm": 0.3633905053138733, "learning_rate": 0.0001290732440488267, "loss": 0.7187, "step": 2806 }, { "epoch": 0.42892615654964283, "grad_norm": 0.2693686783313751, "learning_rate": 0.00012902548241110618, "loss": 0.6844, "step": 2807 }, { "epoch": 0.4290789624479505, "grad_norm": 0.6584002375602722, "learning_rate": 0.00012897771354193038, "loss": 0.6379, "step": 2808 }, { "epoch": 0.42923176834625815, "grad_norm": 0.29742875695228577, "learning_rate": 0.00012892993745320053, "loss": 0.783, "step": 2809 }, { "epoch": 0.42938457424456583, "grad_norm": 0.3576945662498474, "learning_rate": 0.00012888215415681956, "loss": 0.6983, "step": 2810 }, { "epoch": 0.4295373801428735, "grad_norm": 0.3622451424598694, "learning_rate": 0.00012883436366469236, "loss": 0.6491, "step": 2811 }, { "epoch": 0.4296901860411812, "grad_norm": 0.2713620662689209, "learning_rate": 0.00012878656598872546, "loss": 0.7308, "step": 2812 }, { "epoch": 0.4298429919394889, "grad_norm": 0.273732453584671, "learning_rate": 0.00012873876114082733, "loss": 0.6912, "step": 2813 }, { "epoch": 0.4299957978377965, "grad_norm": 0.2648273706436157, "learning_rate": 0.00012869094913290805, "loss": 0.7097, "step": 2814 }, { "epoch": 0.4301486037361042, "grad_norm": 0.32749706506729126, "learning_rate": 0.0001286431299768797, "loss": 0.7119, "step": 2815 }, { "epoch": 0.4303014096344119, "grad_norm": 0.4028230309486389, "learning_rate": 0.00012859530368465586, "loss": 0.6675, "step": 2816 }, { "epoch": 0.43045421553271956, "grad_norm": 0.27643126249313354, "learning_rate": 0.0001285474702681522, "loss": 0.7513, "step": 2817 }, { "epoch": 0.43060702143102725, "grad_norm": 0.2783336341381073, "learning_rate": 0.00012849962973928596, "loss": 0.6643, "step": 2818 }, { "epoch": 0.43075982732933493, "grad_norm": 0.3845579922199249, "learning_rate": 0.00012845178210997622, "loss": 0.5968, "step": 2819 }, { "epoch": 0.43091263322764256, "grad_norm": 0.26863181591033936, "learning_rate": 0.00012840392739214376, "loss": 0.7512, "step": 2820 }, { "epoch": 0.43106543912595024, "grad_norm": 0.3777031898498535, "learning_rate": 0.00012835606559771123, "loss": 0.6785, "step": 2821 }, { "epoch": 0.43121824502425793, "grad_norm": 0.44814273715019226, "learning_rate": 0.000128308196738603, "loss": 0.8311, "step": 2822 }, { "epoch": 0.4313710509225656, "grad_norm": 0.3343289792537689, "learning_rate": 0.00012826032082674516, "loss": 0.7952, "step": 2823 }, { "epoch": 0.4315238568208733, "grad_norm": 0.25728681683540344, "learning_rate": 0.00012821243787406562, "loss": 0.6728, "step": 2824 }, { "epoch": 0.431676662719181, "grad_norm": 0.35816818475723267, "learning_rate": 0.000128164547892494, "loss": 0.6913, "step": 2825 }, { "epoch": 0.4318294686174886, "grad_norm": 0.8182726502418518, "learning_rate": 0.0001281166508939617, "loss": 0.602, "step": 2826 }, { "epoch": 0.4319822745157963, "grad_norm": 0.2920895218849182, "learning_rate": 0.00012806874689040186, "loss": 0.72, "step": 2827 }, { "epoch": 0.432135080414104, "grad_norm": 0.35942891240119934, "learning_rate": 0.0001280208358937493, "loss": 0.7262, "step": 2828 }, { "epoch": 0.43228788631241166, "grad_norm": 0.2746555805206299, "learning_rate": 0.00012797291791594073, "loss": 0.6001, "step": 2829 }, { "epoch": 0.43244069221071935, "grad_norm": 0.32052844762802124, "learning_rate": 0.00012792499296891447, "loss": 0.6371, "step": 2830 }, { "epoch": 0.43259349810902703, "grad_norm": 0.30219176411628723, "learning_rate": 0.00012787706106461063, "loss": 0.8482, "step": 2831 }, { "epoch": 0.43274630400733466, "grad_norm": 0.30528518557548523, "learning_rate": 0.000127829122214971, "loss": 0.7601, "step": 2832 }, { "epoch": 0.43289910990564234, "grad_norm": 0.26077762246131897, "learning_rate": 0.0001277811764319392, "loss": 0.7087, "step": 2833 }, { "epoch": 0.43305191580395, "grad_norm": 0.36096397042274475, "learning_rate": 0.00012773322372746049, "loss": 0.9584, "step": 2834 }, { "epoch": 0.4332047217022577, "grad_norm": 0.29656782746315, "learning_rate": 0.00012768526411348187, "loss": 0.5632, "step": 2835 }, { "epoch": 0.4333575276005654, "grad_norm": 0.29737043380737305, "learning_rate": 0.0001276372976019521, "loss": 0.7245, "step": 2836 }, { "epoch": 0.4335103334988731, "grad_norm": 0.3119308650493622, "learning_rate": 0.00012758932420482163, "loss": 0.739, "step": 2837 }, { "epoch": 0.4336631393971807, "grad_norm": 0.35479971766471863, "learning_rate": 0.00012754134393404265, "loss": 0.753, "step": 2838 }, { "epoch": 0.4338159452954884, "grad_norm": 0.291146457195282, "learning_rate": 0.000127493356801569, "loss": 0.7229, "step": 2839 }, { "epoch": 0.4339687511937961, "grad_norm": 0.28508853912353516, "learning_rate": 0.00012744536281935628, "loss": 0.7438, "step": 2840 }, { "epoch": 0.43412155709210376, "grad_norm": 0.3319436311721802, "learning_rate": 0.00012739736199936182, "loss": 0.7025, "step": 2841 }, { "epoch": 0.43427436299041144, "grad_norm": 0.33289408683776855, "learning_rate": 0.00012734935435354457, "loss": 0.688, "step": 2842 }, { "epoch": 0.43442716888871913, "grad_norm": 0.32559987902641296, "learning_rate": 0.00012730133989386524, "loss": 0.8483, "step": 2843 }, { "epoch": 0.43457997478702676, "grad_norm": 0.2847137749195099, "learning_rate": 0.0001272533186322863, "loss": 0.7268, "step": 2844 }, { "epoch": 0.43473278068533444, "grad_norm": 0.35314276814460754, "learning_rate": 0.00012720529058077176, "loss": 0.7862, "step": 2845 }, { "epoch": 0.4348855865836421, "grad_norm": 0.2800363302230835, "learning_rate": 0.00012715725575128745, "loss": 0.6973, "step": 2846 }, { "epoch": 0.4350383924819498, "grad_norm": 0.8330638408660889, "learning_rate": 0.00012710921415580085, "loss": 0.9033, "step": 2847 }, { "epoch": 0.4351911983802575, "grad_norm": 0.3423483073711395, "learning_rate": 0.00012706116580628112, "loss": 0.6541, "step": 2848 }, { "epoch": 0.4353440042785652, "grad_norm": 0.3231146037578583, "learning_rate": 0.00012701311071469903, "loss": 0.5636, "step": 2849 }, { "epoch": 0.4354968101768728, "grad_norm": 0.5048816204071045, "learning_rate": 0.0001269650488930272, "loss": 0.8825, "step": 2850 }, { "epoch": 0.4356496160751805, "grad_norm": 0.2932036221027374, "learning_rate": 0.00012691698035323978, "loss": 0.7126, "step": 2851 }, { "epoch": 0.4358024219734882, "grad_norm": 0.5563439130783081, "learning_rate": 0.00012686890510731267, "loss": 0.6056, "step": 2852 }, { "epoch": 0.43595522787179586, "grad_norm": 0.28055623173713684, "learning_rate": 0.00012682082316722336, "loss": 0.823, "step": 2853 }, { "epoch": 0.43610803377010354, "grad_norm": 0.28064948320388794, "learning_rate": 0.00012677273454495113, "loss": 0.6092, "step": 2854 }, { "epoch": 0.4362608396684112, "grad_norm": 0.3126406669616699, "learning_rate": 0.0001267246392524768, "loss": 0.7116, "step": 2855 }, { "epoch": 0.43641364556671886, "grad_norm": 0.31279459595680237, "learning_rate": 0.00012667653730178292, "loss": 0.7781, "step": 2856 }, { "epoch": 0.43656645146502654, "grad_norm": 0.2848126292228699, "learning_rate": 0.00012662842870485376, "loss": 0.586, "step": 2857 }, { "epoch": 0.4367192573633342, "grad_norm": 0.28570377826690674, "learning_rate": 0.00012658031347367505, "loss": 0.5861, "step": 2858 }, { "epoch": 0.4368720632616419, "grad_norm": 0.2820393145084381, "learning_rate": 0.0001265321916202344, "loss": 0.659, "step": 2859 }, { "epoch": 0.4370248691599496, "grad_norm": 0.27809035778045654, "learning_rate": 0.0001264840631565209, "loss": 0.573, "step": 2860 }, { "epoch": 0.4371776750582573, "grad_norm": 0.38270965218544006, "learning_rate": 0.00012643592809452543, "loss": 0.7039, "step": 2861 }, { "epoch": 0.4373304809565649, "grad_norm": 0.32795193791389465, "learning_rate": 0.00012638778644624032, "loss": 0.7147, "step": 2862 }, { "epoch": 0.4374832868548726, "grad_norm": 0.32430192828178406, "learning_rate": 0.00012633963822365976, "loss": 0.9189, "step": 2863 }, { "epoch": 0.4376360927531803, "grad_norm": 0.2981487214565277, "learning_rate": 0.00012629148343877943, "loss": 0.6675, "step": 2864 }, { "epoch": 0.43778889865148796, "grad_norm": 0.30008915066719055, "learning_rate": 0.0001262433221035967, "loss": 0.7412, "step": 2865 }, { "epoch": 0.43794170454979564, "grad_norm": 0.31011244654655457, "learning_rate": 0.00012619515423011057, "loss": 0.8016, "step": 2866 }, { "epoch": 0.43809451044810327, "grad_norm": 0.2737204432487488, "learning_rate": 0.00012614697983032164, "loss": 0.5848, "step": 2867 }, { "epoch": 0.43824731634641095, "grad_norm": 0.2696418762207031, "learning_rate": 0.00012609879891623216, "loss": 0.5662, "step": 2868 }, { "epoch": 0.43840012224471864, "grad_norm": 0.31181618571281433, "learning_rate": 0.000126050611499846, "loss": 0.6401, "step": 2869 }, { "epoch": 0.4385529281430263, "grad_norm": 0.3828161060810089, "learning_rate": 0.0001260024175931687, "loss": 0.6144, "step": 2870 }, { "epoch": 0.438705734041334, "grad_norm": 0.26452359557151794, "learning_rate": 0.0001259542172082073, "loss": 0.624, "step": 2871 }, { "epoch": 0.4388585399396417, "grad_norm": 0.776643693447113, "learning_rate": 0.00012590601035697055, "loss": 0.7748, "step": 2872 }, { "epoch": 0.4390113458379493, "grad_norm": 0.4832134246826172, "learning_rate": 0.0001258577970514688, "loss": 0.6519, "step": 2873 }, { "epoch": 0.439164151736257, "grad_norm": 0.305779367685318, "learning_rate": 0.00012580957730371395, "loss": 0.8206, "step": 2874 }, { "epoch": 0.4393169576345647, "grad_norm": 0.3510475754737854, "learning_rate": 0.00012576135112571957, "loss": 0.7114, "step": 2875 }, { "epoch": 0.43946976353287237, "grad_norm": 0.4784543514251709, "learning_rate": 0.0001257131185295008, "loss": 0.8677, "step": 2876 }, { "epoch": 0.43962256943118005, "grad_norm": 0.2720498740673065, "learning_rate": 0.0001256648795270744, "loss": 0.796, "step": 2877 }, { "epoch": 0.43977537532948774, "grad_norm": 0.31961312890052795, "learning_rate": 0.0001256166341304587, "loss": 0.6601, "step": 2878 }, { "epoch": 0.43992818122779537, "grad_norm": 0.2913792133331299, "learning_rate": 0.00012556838235167365, "loss": 0.6879, "step": 2879 }, { "epoch": 0.44008098712610305, "grad_norm": 0.2850216329097748, "learning_rate": 0.00012552012420274076, "loss": 0.6935, "step": 2880 }, { "epoch": 0.44023379302441074, "grad_norm": 0.2468993067741394, "learning_rate": 0.00012547185969568312, "loss": 0.7689, "step": 2881 }, { "epoch": 0.4403865989227184, "grad_norm": 0.3027266561985016, "learning_rate": 0.00012542358884252546, "loss": 0.6852, "step": 2882 }, { "epoch": 0.4405394048210261, "grad_norm": 0.30375269055366516, "learning_rate": 0.00012537531165529407, "loss": 0.5691, "step": 2883 }, { "epoch": 0.4406922107193338, "grad_norm": 0.28959324955940247, "learning_rate": 0.0001253270281460168, "loss": 0.6491, "step": 2884 }, { "epoch": 0.4408450166176414, "grad_norm": 0.32386699318885803, "learning_rate": 0.00012527873832672305, "loss": 0.6175, "step": 2885 }, { "epoch": 0.4409978225159491, "grad_norm": 0.34364500641822815, "learning_rate": 0.00012523044220944383, "loss": 0.6779, "step": 2886 }, { "epoch": 0.4411506284142568, "grad_norm": 0.37659937143325806, "learning_rate": 0.00012518213980621177, "loss": 0.6467, "step": 2887 }, { "epoch": 0.44130343431256447, "grad_norm": 0.34209194779396057, "learning_rate": 0.00012513383112906093, "loss": 0.5235, "step": 2888 }, { "epoch": 0.44145624021087215, "grad_norm": 0.3095417320728302, "learning_rate": 0.00012508551619002701, "loss": 0.6551, "step": 2889 }, { "epoch": 0.44160904610917984, "grad_norm": 0.29345428943634033, "learning_rate": 0.00012503719500114735, "loss": 0.6686, "step": 2890 }, { "epoch": 0.44176185200748747, "grad_norm": 0.2837190330028534, "learning_rate": 0.0001249888675744607, "loss": 0.7411, "step": 2891 }, { "epoch": 0.44191465790579515, "grad_norm": 0.26552635431289673, "learning_rate": 0.0001249405339220075, "loss": 0.5733, "step": 2892 }, { "epoch": 0.44206746380410283, "grad_norm": 0.3099066913127899, "learning_rate": 0.0001248921940558296, "loss": 0.6688, "step": 2893 }, { "epoch": 0.4422202697024105, "grad_norm": 0.35833939909935, "learning_rate": 0.00012484384798797048, "loss": 0.7451, "step": 2894 }, { "epoch": 0.4423730756007182, "grad_norm": 0.2927980422973633, "learning_rate": 0.00012479549573047522, "loss": 0.5564, "step": 2895 }, { "epoch": 0.4425258814990259, "grad_norm": 0.4316510260105133, "learning_rate": 0.00012474713729539034, "loss": 0.5236, "step": 2896 }, { "epoch": 0.4426786873973335, "grad_norm": 0.2684415280818939, "learning_rate": 0.00012469877269476388, "loss": 0.697, "step": 2897 }, { "epoch": 0.4428314932956412, "grad_norm": 0.31690576672554016, "learning_rate": 0.00012465040194064558, "loss": 0.7508, "step": 2898 }, { "epoch": 0.4429842991939489, "grad_norm": 0.32044708728790283, "learning_rate": 0.00012460202504508653, "loss": 0.8633, "step": 2899 }, { "epoch": 0.44313710509225657, "grad_norm": 0.31981486082077026, "learning_rate": 0.0001245536420201395, "loss": 0.6791, "step": 2900 }, { "epoch": 0.44328991099056425, "grad_norm": 0.32003486156463623, "learning_rate": 0.00012450525287785861, "loss": 0.6707, "step": 2901 }, { "epoch": 0.44344271688887194, "grad_norm": 0.3154270052909851, "learning_rate": 0.0001244568576302997, "loss": 0.6449, "step": 2902 }, { "epoch": 0.44359552278717956, "grad_norm": 0.3915046453475952, "learning_rate": 0.00012440845628952004, "loss": 0.7843, "step": 2903 }, { "epoch": 0.44374832868548725, "grad_norm": 0.3003976345062256, "learning_rate": 0.00012436004886757831, "loss": 0.705, "step": 2904 }, { "epoch": 0.44390113458379493, "grad_norm": 0.2850950062274933, "learning_rate": 0.00012431163537653496, "loss": 0.6981, "step": 2905 }, { "epoch": 0.4440539404821026, "grad_norm": 0.38096436858177185, "learning_rate": 0.00012426321582845168, "loss": 0.7158, "step": 2906 }, { "epoch": 0.4442067463804103, "grad_norm": 0.3688443899154663, "learning_rate": 0.00012421479023539192, "loss": 0.699, "step": 2907 }, { "epoch": 0.444359552278718, "grad_norm": 0.3340109884738922, "learning_rate": 0.00012416635860942033, "loss": 0.8428, "step": 2908 }, { "epoch": 0.4445123581770256, "grad_norm": 0.2632228136062622, "learning_rate": 0.00012411792096260347, "loss": 0.6115, "step": 2909 }, { "epoch": 0.4446651640753333, "grad_norm": 0.26376640796661377, "learning_rate": 0.00012406947730700895, "loss": 0.7574, "step": 2910 }, { "epoch": 0.444817969973641, "grad_norm": 0.31339865922927856, "learning_rate": 0.00012402102765470628, "loss": 0.5751, "step": 2911 }, { "epoch": 0.44497077587194866, "grad_norm": 0.32284119725227356, "learning_rate": 0.0001239725720177662, "loss": 0.6088, "step": 2912 }, { "epoch": 0.44512358177025635, "grad_norm": 0.301904559135437, "learning_rate": 0.00012392411040826099, "loss": 0.6401, "step": 2913 }, { "epoch": 0.44527638766856403, "grad_norm": 0.30717435479164124, "learning_rate": 0.00012387564283826451, "loss": 0.6669, "step": 2914 }, { "epoch": 0.44542919356687166, "grad_norm": 0.3378068804740906, "learning_rate": 0.00012382716931985202, "loss": 0.6117, "step": 2915 }, { "epoch": 0.44558199946517935, "grad_norm": 0.3473984897136688, "learning_rate": 0.00012377868986510035, "loss": 0.9922, "step": 2916 }, { "epoch": 0.44573480536348703, "grad_norm": 0.3443201184272766, "learning_rate": 0.00012373020448608766, "loss": 0.7179, "step": 2917 }, { "epoch": 0.4458876112617947, "grad_norm": 0.3572174608707428, "learning_rate": 0.00012368171319489376, "loss": 0.6572, "step": 2918 }, { "epoch": 0.4460404171601024, "grad_norm": 0.23893767595291138, "learning_rate": 0.00012363321600359977, "loss": 0.452, "step": 2919 }, { "epoch": 0.4461932230584101, "grad_norm": 0.3510747253894806, "learning_rate": 0.00012358471292428844, "loss": 0.7885, "step": 2920 }, { "epoch": 0.4463460289567177, "grad_norm": 0.2646324634552002, "learning_rate": 0.00012353620396904382, "loss": 0.5921, "step": 2921 }, { "epoch": 0.4464988348550254, "grad_norm": 0.2995966970920563, "learning_rate": 0.00012348768914995157, "loss": 0.6149, "step": 2922 }, { "epoch": 0.4466516407533331, "grad_norm": 0.27304011583328247, "learning_rate": 0.0001234391684790987, "loss": 0.8127, "step": 2923 }, { "epoch": 0.44680444665164076, "grad_norm": 0.301516056060791, "learning_rate": 0.00012339064196857378, "loss": 0.6597, "step": 2924 }, { "epoch": 0.44695725254994845, "grad_norm": 0.4759582579135895, "learning_rate": 0.00012334210963046679, "loss": 0.753, "step": 2925 }, { "epoch": 0.4471100584482561, "grad_norm": 0.39895206689834595, "learning_rate": 0.0001232935714768691, "loss": 0.6516, "step": 2926 }, { "epoch": 0.44726286434656376, "grad_norm": 0.30540645122528076, "learning_rate": 0.0001232450275198736, "loss": 0.7916, "step": 2927 }, { "epoch": 0.44741567024487144, "grad_norm": 0.3424038887023926, "learning_rate": 0.0001231964777715746, "loss": 0.6381, "step": 2928 }, { "epoch": 0.44756847614317913, "grad_norm": 0.37364235520362854, "learning_rate": 0.00012314792224406792, "loss": 0.7826, "step": 2929 }, { "epoch": 0.4477212820414868, "grad_norm": 0.2799992561340332, "learning_rate": 0.00012309936094945072, "loss": 0.6587, "step": 2930 }, { "epoch": 0.4478740879397945, "grad_norm": 0.306768000125885, "learning_rate": 0.00012305079389982162, "loss": 0.7384, "step": 2931 }, { "epoch": 0.4480268938381021, "grad_norm": 0.3117838501930237, "learning_rate": 0.0001230022211072807, "loss": 0.8259, "step": 2932 }, { "epoch": 0.4481796997364098, "grad_norm": 0.34458303451538086, "learning_rate": 0.0001229536425839295, "loss": 0.7534, "step": 2933 }, { "epoch": 0.4483325056347175, "grad_norm": 0.30791348218917847, "learning_rate": 0.00012290505834187094, "loss": 0.8358, "step": 2934 }, { "epoch": 0.4484853115330252, "grad_norm": 0.327889621257782, "learning_rate": 0.00012285646839320935, "loss": 0.7923, "step": 2935 }, { "epoch": 0.44863811743133286, "grad_norm": 0.2725731432437897, "learning_rate": 0.0001228078727500505, "loss": 0.7498, "step": 2936 }, { "epoch": 0.44879092332964055, "grad_norm": 0.4868723154067993, "learning_rate": 0.00012275927142450164, "loss": 0.5499, "step": 2937 }, { "epoch": 0.4489437292279482, "grad_norm": 0.33403563499450684, "learning_rate": 0.00012271066442867137, "loss": 0.7104, "step": 2938 }, { "epoch": 0.44909653512625586, "grad_norm": 0.323974609375, "learning_rate": 0.00012266205177466965, "loss": 0.6424, "step": 2939 }, { "epoch": 0.44924934102456354, "grad_norm": 0.33368954062461853, "learning_rate": 0.00012261343347460797, "loss": 0.6989, "step": 2940 }, { "epoch": 0.4494021469228712, "grad_norm": 0.22383123636245728, "learning_rate": 0.0001225648095405992, "loss": 0.5866, "step": 2941 }, { "epoch": 0.4495549528211789, "grad_norm": 0.3466974198818207, "learning_rate": 0.00012251617998475752, "loss": 0.7301, "step": 2942 }, { "epoch": 0.4497077587194866, "grad_norm": 0.3107375204563141, "learning_rate": 0.0001224675448191986, "loss": 0.8147, "step": 2943 }, { "epoch": 0.4498605646177942, "grad_norm": 0.32883167266845703, "learning_rate": 0.0001224189040560395, "loss": 0.8292, "step": 2944 }, { "epoch": 0.4500133705161019, "grad_norm": 0.2791670262813568, "learning_rate": 0.00012237025770739862, "loss": 0.6703, "step": 2945 }, { "epoch": 0.4501661764144096, "grad_norm": 0.27861130237579346, "learning_rate": 0.00012232160578539586, "loss": 0.65, "step": 2946 }, { "epoch": 0.4503189823127173, "grad_norm": 15.20201301574707, "learning_rate": 0.00012227294830215234, "loss": 0.8696, "step": 2947 }, { "epoch": 0.45047178821102496, "grad_norm": 0.31514841318130493, "learning_rate": 0.00012222428526979074, "loss": 0.634, "step": 2948 }, { "epoch": 0.45062459410933264, "grad_norm": 0.3868434727191925, "learning_rate": 0.000122175616700435, "loss": 0.7436, "step": 2949 }, { "epoch": 0.45077740000764027, "grad_norm": 0.2911074459552765, "learning_rate": 0.00012212694260621052, "loss": 0.6778, "step": 2950 }, { "epoch": 0.45093020590594796, "grad_norm": 0.3343454599380493, "learning_rate": 0.00012207826299924407, "loss": 0.6356, "step": 2951 }, { "epoch": 0.45108301180425564, "grad_norm": 0.2641962766647339, "learning_rate": 0.00012202957789166365, "loss": 0.8245, "step": 2952 }, { "epoch": 0.4512358177025633, "grad_norm": 0.41699346899986267, "learning_rate": 0.00012198088729559889, "loss": 0.5049, "step": 2953 }, { "epoch": 0.451388623600871, "grad_norm": 0.2433166354894638, "learning_rate": 0.00012193219122318052, "loss": 0.567, "step": 2954 }, { "epoch": 0.4515414294991787, "grad_norm": 0.37394678592681885, "learning_rate": 0.00012188348968654084, "loss": 0.6925, "step": 2955 }, { "epoch": 0.4516942353974863, "grad_norm": 0.6773134469985962, "learning_rate": 0.00012183478269781337, "loss": 0.7749, "step": 2956 }, { "epoch": 0.451847041295794, "grad_norm": 0.3066105544567108, "learning_rate": 0.00012178607026913311, "loss": 0.6992, "step": 2957 }, { "epoch": 0.4519998471941017, "grad_norm": 0.33363470435142517, "learning_rate": 0.00012173735241263631, "loss": 0.667, "step": 2958 }, { "epoch": 0.4521526530924094, "grad_norm": 0.3259199261665344, "learning_rate": 0.00012168862914046063, "loss": 0.82, "step": 2959 }, { "epoch": 0.45230545899071706, "grad_norm": 0.25989770889282227, "learning_rate": 0.00012163990046474505, "loss": 0.7487, "step": 2960 }, { "epoch": 0.45245826488902474, "grad_norm": 0.2744223475456238, "learning_rate": 0.00012159116639762991, "loss": 0.6466, "step": 2961 }, { "epoch": 0.45261107078733237, "grad_norm": 0.2962299585342407, "learning_rate": 0.00012154242695125692, "loss": 0.634, "step": 2962 }, { "epoch": 0.45276387668564005, "grad_norm": 0.319975346326828, "learning_rate": 0.00012149368213776906, "loss": 0.6443, "step": 2963 }, { "epoch": 0.45291668258394774, "grad_norm": 0.2526867687702179, "learning_rate": 0.00012144493196931078, "loss": 0.6078, "step": 2964 }, { "epoch": 0.4530694884822554, "grad_norm": 0.35642507672309875, "learning_rate": 0.00012139617645802763, "loss": 0.5997, "step": 2965 }, { "epoch": 0.4532222943805631, "grad_norm": 0.28118640184402466, "learning_rate": 0.00012134741561606679, "loss": 0.7705, "step": 2966 }, { "epoch": 0.4533751002788708, "grad_norm": 0.3002743721008301, "learning_rate": 0.00012129864945557652, "loss": 0.6621, "step": 2967 }, { "epoch": 0.4535279061771784, "grad_norm": 0.33882054686546326, "learning_rate": 0.00012124987798870652, "loss": 0.8184, "step": 2968 }, { "epoch": 0.4536807120754861, "grad_norm": 0.31006500124931335, "learning_rate": 0.00012120110122760779, "loss": 0.6977, "step": 2969 }, { "epoch": 0.4538335179737938, "grad_norm": 0.3728959858417511, "learning_rate": 0.00012115231918443268, "loss": 0.6769, "step": 2970 }, { "epoch": 0.45398632387210147, "grad_norm": 0.3111363649368286, "learning_rate": 0.00012110353187133478, "loss": 0.6327, "step": 2971 }, { "epoch": 0.45413912977040916, "grad_norm": 0.27471086382865906, "learning_rate": 0.00012105473930046907, "loss": 0.6579, "step": 2972 }, { "epoch": 0.45429193566871684, "grad_norm": 0.29122394323349, "learning_rate": 0.0001210059414839918, "loss": 0.6247, "step": 2973 }, { "epoch": 0.45444474156702447, "grad_norm": 0.3433492183685303, "learning_rate": 0.00012095713843406056, "loss": 0.7394, "step": 2974 }, { "epoch": 0.45459754746533215, "grad_norm": 0.23486945033073425, "learning_rate": 0.00012090833016283415, "loss": 0.5011, "step": 2975 }, { "epoch": 0.45475035336363984, "grad_norm": 0.2754330635070801, "learning_rate": 0.00012085951668247284, "loss": 0.5579, "step": 2976 }, { "epoch": 0.4549031592619475, "grad_norm": 0.37102657556533813, "learning_rate": 0.00012081069800513803, "loss": 0.6467, "step": 2977 }, { "epoch": 0.4550559651602552, "grad_norm": 0.7223601937294006, "learning_rate": 0.00012076187414299249, "loss": 0.6745, "step": 2978 }, { "epoch": 0.45520877105856283, "grad_norm": 0.24973377585411072, "learning_rate": 0.00012071304510820029, "loss": 0.6539, "step": 2979 }, { "epoch": 0.4553615769568705, "grad_norm": 0.2775816023349762, "learning_rate": 0.0001206642109129268, "loss": 0.6408, "step": 2980 }, { "epoch": 0.4555143828551782, "grad_norm": 0.29855409264564514, "learning_rate": 0.0001206153715693386, "loss": 0.5669, "step": 2981 }, { "epoch": 0.4556671887534859, "grad_norm": 0.31334125995635986, "learning_rate": 0.00012056652708960361, "loss": 0.6411, "step": 2982 }, { "epoch": 0.45581999465179357, "grad_norm": 0.2566351294517517, "learning_rate": 0.00012051767748589106, "loss": 0.6787, "step": 2983 }, { "epoch": 0.45597280055010125, "grad_norm": 0.3497639000415802, "learning_rate": 0.00012046882277037136, "loss": 0.6258, "step": 2984 }, { "epoch": 0.4561256064484089, "grad_norm": 0.3289467990398407, "learning_rate": 0.00012041996295521634, "loss": 0.6685, "step": 2985 }, { "epoch": 0.45627841234671657, "grad_norm": 0.32322293519973755, "learning_rate": 0.00012037109805259892, "loss": 0.7568, "step": 2986 }, { "epoch": 0.45643121824502425, "grad_norm": 0.30762824416160583, "learning_rate": 0.00012032222807469344, "loss": 0.8101, "step": 2987 }, { "epoch": 0.45658402414333193, "grad_norm": 0.2983434796333313, "learning_rate": 0.00012027335303367542, "loss": 0.669, "step": 2988 }, { "epoch": 0.4567368300416396, "grad_norm": 0.5077597498893738, "learning_rate": 0.00012022447294172165, "loss": 0.5633, "step": 2989 }, { "epoch": 0.4568896359399473, "grad_norm": 0.24513135850429535, "learning_rate": 0.00012017558781101026, "loss": 0.6378, "step": 2990 }, { "epoch": 0.45704244183825493, "grad_norm": 0.3104904890060425, "learning_rate": 0.00012012669765372049, "loss": 0.7319, "step": 2991 }, { "epoch": 0.4571952477365626, "grad_norm": 0.2522958815097809, "learning_rate": 0.00012007780248203297, "loss": 0.699, "step": 2992 }, { "epoch": 0.4573480536348703, "grad_norm": 0.2799461781978607, "learning_rate": 0.00012002890230812947, "loss": 0.7926, "step": 2993 }, { "epoch": 0.457500859533178, "grad_norm": 0.2929769456386566, "learning_rate": 0.00011997999714419313, "loss": 0.7925, "step": 2994 }, { "epoch": 0.45765366543148567, "grad_norm": 0.37641918659210205, "learning_rate": 0.00011993108700240815, "loss": 0.7682, "step": 2995 }, { "epoch": 0.45780647132979335, "grad_norm": 0.3405778408050537, "learning_rate": 0.00011988217189496022, "loss": 0.5922, "step": 2996 }, { "epoch": 0.457959277228101, "grad_norm": 0.27888843417167664, "learning_rate": 0.00011983325183403604, "loss": 0.7494, "step": 2997 }, { "epoch": 0.45811208312640866, "grad_norm": 0.30546584725379944, "learning_rate": 0.00011978432683182364, "loss": 0.7148, "step": 2998 }, { "epoch": 0.45826488902471635, "grad_norm": 0.6508386731147766, "learning_rate": 0.0001197353969005123, "loss": 0.9219, "step": 2999 }, { "epoch": 0.45841769492302403, "grad_norm": 0.2737182080745697, "learning_rate": 0.00011968646205229244, "loss": 0.4652, "step": 3000 }, { "epoch": 0.4585705008213317, "grad_norm": 0.26674118638038635, "learning_rate": 0.00011963752229935587, "loss": 0.7178, "step": 3001 }, { "epoch": 0.4587233067196394, "grad_norm": 0.29134851694107056, "learning_rate": 0.00011958857765389541, "loss": 0.6097, "step": 3002 }, { "epoch": 0.45887611261794703, "grad_norm": 0.2613201141357422, "learning_rate": 0.00011953962812810531, "loss": 0.7444, "step": 3003 }, { "epoch": 0.4590289185162547, "grad_norm": 0.3279878497123718, "learning_rate": 0.00011949067373418084, "loss": 0.6885, "step": 3004 }, { "epoch": 0.4591817244145624, "grad_norm": 0.2864905595779419, "learning_rate": 0.00011944171448431864, "loss": 0.579, "step": 3005 }, { "epoch": 0.4593345303128701, "grad_norm": 0.3064310550689697, "learning_rate": 0.0001193927503907165, "loss": 0.5701, "step": 3006 }, { "epoch": 0.45948733621117777, "grad_norm": 0.265474796295166, "learning_rate": 0.00011934378146557335, "loss": 0.6268, "step": 3007 }, { "epoch": 0.45964014210948545, "grad_norm": 0.2856680750846863, "learning_rate": 0.00011929480772108941, "loss": 0.6023, "step": 3008 }, { "epoch": 0.4597929480077931, "grad_norm": 0.29818516969680786, "learning_rate": 0.00011924582916946612, "loss": 0.7667, "step": 3009 }, { "epoch": 0.45994575390610076, "grad_norm": 0.46206653118133545, "learning_rate": 0.00011919684582290605, "loss": 0.6517, "step": 3010 }, { "epoch": 0.46009855980440845, "grad_norm": 0.3467860221862793, "learning_rate": 0.00011914785769361294, "loss": 0.6512, "step": 3011 }, { "epoch": 0.46025136570271613, "grad_norm": 0.40472298860549927, "learning_rate": 0.00011909886479379189, "loss": 0.5255, "step": 3012 }, { "epoch": 0.4604041716010238, "grad_norm": 0.33601143956184387, "learning_rate": 0.00011904986713564896, "loss": 0.8582, "step": 3013 }, { "epoch": 0.4605569774993315, "grad_norm": 0.31958696246147156, "learning_rate": 0.00011900086473139153, "loss": 0.901, "step": 3014 }, { "epoch": 0.4607097833976391, "grad_norm": 0.2809063494205475, "learning_rate": 0.00011895185759322818, "loss": 0.8309, "step": 3015 }, { "epoch": 0.4608625892959468, "grad_norm": 0.27857983112335205, "learning_rate": 0.00011890284573336856, "loss": 0.5825, "step": 3016 }, { "epoch": 0.4610153951942545, "grad_norm": 0.29882699251174927, "learning_rate": 0.00011885382916402364, "loss": 0.8242, "step": 3017 }, { "epoch": 0.4611682010925622, "grad_norm": 0.2936548590660095, "learning_rate": 0.00011880480789740542, "loss": 0.8594, "step": 3018 }, { "epoch": 0.46132100699086986, "grad_norm": 0.361464262008667, "learning_rate": 0.00011875578194572719, "loss": 0.5966, "step": 3019 }, { "epoch": 0.46147381288917755, "grad_norm": 0.3106854259967804, "learning_rate": 0.0001187067513212033, "loss": 0.7327, "step": 3020 }, { "epoch": 0.4616266187874852, "grad_norm": 0.3830045759677887, "learning_rate": 0.00011865771603604935, "loss": 0.6991, "step": 3021 }, { "epoch": 0.46177942468579286, "grad_norm": 0.29338714480400085, "learning_rate": 0.00011860867610248208, "loss": 0.7067, "step": 3022 }, { "epoch": 0.46193223058410054, "grad_norm": 0.28551679849624634, "learning_rate": 0.00011855963153271936, "loss": 0.7352, "step": 3023 }, { "epoch": 0.46208503648240823, "grad_norm": 0.2840779423713684, "learning_rate": 0.00011851058233898025, "loss": 0.7279, "step": 3024 }, { "epoch": 0.4622378423807159, "grad_norm": 0.26828092336654663, "learning_rate": 0.00011846152853348491, "loss": 0.7248, "step": 3025 }, { "epoch": 0.4623906482790236, "grad_norm": 0.2917962074279785, "learning_rate": 0.00011841247012845471, "loss": 0.8556, "step": 3026 }, { "epoch": 0.4625434541773312, "grad_norm": 0.33142760396003723, "learning_rate": 0.00011836340713611216, "loss": 0.59, "step": 3027 }, { "epoch": 0.4626962600756389, "grad_norm": 0.5676470994949341, "learning_rate": 0.00011831433956868085, "loss": 0.6251, "step": 3028 }, { "epoch": 0.4628490659739466, "grad_norm": 0.36629360914230347, "learning_rate": 0.0001182652674383856, "loss": 0.7498, "step": 3029 }, { "epoch": 0.4630018718722543, "grad_norm": 0.292192667722702, "learning_rate": 0.00011821619075745225, "loss": 0.7018, "step": 3030 }, { "epoch": 0.46315467777056196, "grad_norm": 0.32250627875328064, "learning_rate": 0.00011816710953810788, "loss": 0.6218, "step": 3031 }, { "epoch": 0.4633074836688696, "grad_norm": 0.2832304835319519, "learning_rate": 0.0001181180237925807, "loss": 0.6173, "step": 3032 }, { "epoch": 0.4634602895671773, "grad_norm": 0.3310091197490692, "learning_rate": 0.00011806893353309995, "loss": 0.4714, "step": 3033 }, { "epoch": 0.46361309546548496, "grad_norm": 0.2954336702823639, "learning_rate": 0.0001180198387718961, "loss": 0.7133, "step": 3034 }, { "epoch": 0.46376590136379264, "grad_norm": 0.31061121821403503, "learning_rate": 0.0001179707395212007, "loss": 0.6204, "step": 3035 }, { "epoch": 0.4639187072621003, "grad_norm": 0.25961393117904663, "learning_rate": 0.0001179216357932464, "loss": 0.5827, "step": 3036 }, { "epoch": 0.464071513160408, "grad_norm": 0.3093631863594055, "learning_rate": 0.00011787252760026694, "loss": 0.6789, "step": 3037 }, { "epoch": 0.46422431905871564, "grad_norm": 0.35962679982185364, "learning_rate": 0.00011782341495449732, "loss": 0.7595, "step": 3038 }, { "epoch": 0.4643771249570233, "grad_norm": 0.44419047236442566, "learning_rate": 0.0001177742978681734, "loss": 0.6952, "step": 3039 }, { "epoch": 0.464529930855331, "grad_norm": 0.382176011800766, "learning_rate": 0.00011772517635353242, "loss": 0.7884, "step": 3040 }, { "epoch": 0.4646827367536387, "grad_norm": 0.302168071269989, "learning_rate": 0.00011767605042281251, "loss": 0.7756, "step": 3041 }, { "epoch": 0.4648355426519464, "grad_norm": 0.33565452694892883, "learning_rate": 0.00011762692008825304, "loss": 0.8042, "step": 3042 }, { "epoch": 0.46498834855025406, "grad_norm": 0.33202725648880005, "learning_rate": 0.00011757778536209438, "loss": 0.7221, "step": 3043 }, { "epoch": 0.4651411544485617, "grad_norm": 0.3008812963962555, "learning_rate": 0.00011752864625657804, "loss": 0.8778, "step": 3044 }, { "epoch": 0.4652939603468694, "grad_norm": 0.3398931324481964, "learning_rate": 0.00011747950278394668, "loss": 0.9344, "step": 3045 }, { "epoch": 0.46544676624517706, "grad_norm": 0.2822340726852417, "learning_rate": 0.00011743035495644385, "loss": 0.7301, "step": 3046 }, { "epoch": 0.46559957214348474, "grad_norm": 0.3987044394016266, "learning_rate": 0.00011738120278631445, "loss": 0.8121, "step": 3047 }, { "epoch": 0.4657523780417924, "grad_norm": 0.28100937604904175, "learning_rate": 0.00011733204628580426, "loss": 0.8923, "step": 3048 }, { "epoch": 0.4659051839401001, "grad_norm": 0.2732929587364197, "learning_rate": 0.00011728288546716024, "loss": 0.8098, "step": 3049 }, { "epoch": 0.46605798983840774, "grad_norm": 0.48743966221809387, "learning_rate": 0.00011723372034263036, "loss": 0.9673, "step": 3050 }, { "epoch": 0.4662107957367154, "grad_norm": 0.3390193581581116, "learning_rate": 0.00011718455092446375, "loss": 0.5456, "step": 3051 }, { "epoch": 0.4663636016350231, "grad_norm": 0.37044551968574524, "learning_rate": 0.0001171353772249105, "loss": 0.6036, "step": 3052 }, { "epoch": 0.4665164075333308, "grad_norm": 0.3185332715511322, "learning_rate": 0.00011708619925622188, "loss": 0.5297, "step": 3053 }, { "epoch": 0.4666692134316385, "grad_norm": 0.30760905146598816, "learning_rate": 0.00011703701703065014, "loss": 0.7604, "step": 3054 }, { "epoch": 0.46682201932994616, "grad_norm": 0.318132609128952, "learning_rate": 0.00011698783056044859, "loss": 0.6375, "step": 3055 }, { "epoch": 0.4669748252282538, "grad_norm": 0.3219239413738251, "learning_rate": 0.00011693863985787168, "loss": 0.8012, "step": 3056 }, { "epoch": 0.46712763112656147, "grad_norm": 0.24363091588020325, "learning_rate": 0.0001168894449351748, "loss": 0.483, "step": 3057 }, { "epoch": 0.46728043702486916, "grad_norm": 0.346457302570343, "learning_rate": 0.00011684024580461455, "loss": 0.8002, "step": 3058 }, { "epoch": 0.46743324292317684, "grad_norm": 0.3414503335952759, "learning_rate": 0.00011679104247844834, "loss": 0.6163, "step": 3059 }, { "epoch": 0.4675860488214845, "grad_norm": 0.3042216897010803, "learning_rate": 0.00011674183496893492, "loss": 0.6604, "step": 3060 }, { "epoch": 0.4677388547197922, "grad_norm": 0.29265016317367554, "learning_rate": 0.00011669262328833381, "loss": 0.6929, "step": 3061 }, { "epoch": 0.46789166061809984, "grad_norm": 0.31261003017425537, "learning_rate": 0.00011664340744890577, "loss": 0.7802, "step": 3062 }, { "epoch": 0.4680444665164075, "grad_norm": 0.3014015257358551, "learning_rate": 0.00011659418746291242, "loss": 0.6751, "step": 3063 }, { "epoch": 0.4681972724147152, "grad_norm": 0.3346925973892212, "learning_rate": 0.0001165449633426166, "loss": 0.7601, "step": 3064 }, { "epoch": 0.4683500783130229, "grad_norm": 0.5724461078643799, "learning_rate": 0.00011649573510028203, "loss": 0.7809, "step": 3065 }, { "epoch": 0.4685028842113306, "grad_norm": 0.45399367809295654, "learning_rate": 0.00011644650274817353, "loss": 0.6694, "step": 3066 }, { "epoch": 0.46865569010963826, "grad_norm": 2.10894775390625, "learning_rate": 0.00011639726629855691, "loss": 0.8659, "step": 3067 }, { "epoch": 0.4688084960079459, "grad_norm": 0.29158470034599304, "learning_rate": 0.00011634802576369905, "loss": 0.5995, "step": 3068 }, { "epoch": 0.46896130190625357, "grad_norm": 0.3776535093784332, "learning_rate": 0.0001162987811558678, "loss": 0.7662, "step": 3069 }, { "epoch": 0.46911410780456125, "grad_norm": 0.30398276448249817, "learning_rate": 0.00011624953248733204, "loss": 0.7443, "step": 3070 }, { "epoch": 0.46926691370286894, "grad_norm": 0.3071722686290741, "learning_rate": 0.00011620027977036168, "loss": 0.7196, "step": 3071 }, { "epoch": 0.4694197196011766, "grad_norm": 0.2889639735221863, "learning_rate": 0.00011615102301722758, "loss": 0.8124, "step": 3072 }, { "epoch": 0.4695725254994843, "grad_norm": 0.29236549139022827, "learning_rate": 0.00011610176224020168, "loss": 0.9651, "step": 3073 }, { "epoch": 0.46972533139779193, "grad_norm": 0.4778033494949341, "learning_rate": 0.00011605249745155688, "loss": 0.7847, "step": 3074 }, { "epoch": 0.4698781372960996, "grad_norm": 0.32045695185661316, "learning_rate": 0.00011600322866356708, "loss": 0.5641, "step": 3075 }, { "epoch": 0.4700309431944073, "grad_norm": 0.33323490619659424, "learning_rate": 0.00011595395588850719, "loss": 0.7267, "step": 3076 }, { "epoch": 0.470183749092715, "grad_norm": 0.2765256464481354, "learning_rate": 0.00011590467913865313, "loss": 0.6555, "step": 3077 }, { "epoch": 0.47033655499102267, "grad_norm": 0.36682021617889404, "learning_rate": 0.00011585539842628178, "loss": 0.7699, "step": 3078 }, { "epoch": 0.47048936088933035, "grad_norm": 0.26881060004234314, "learning_rate": 0.00011580611376367096, "loss": 0.7308, "step": 3079 }, { "epoch": 0.470642166787638, "grad_norm": 0.2899646461009979, "learning_rate": 0.00011575682516309963, "loss": 0.6116, "step": 3080 }, { "epoch": 0.47079497268594567, "grad_norm": 0.32141637802124023, "learning_rate": 0.00011570753263684755, "loss": 0.5917, "step": 3081 }, { "epoch": 0.47094777858425335, "grad_norm": 0.3428771197795868, "learning_rate": 0.00011565823619719556, "loss": 0.595, "step": 3082 }, { "epoch": 0.47110058448256104, "grad_norm": 0.31115248799324036, "learning_rate": 0.00011560893585642547, "loss": 0.5678, "step": 3083 }, { "epoch": 0.4712533903808687, "grad_norm": 0.3463020324707031, "learning_rate": 0.00011555963162682007, "loss": 0.622, "step": 3084 }, { "epoch": 0.4714061962791764, "grad_norm": 0.2892141044139862, "learning_rate": 0.000115510323520663, "loss": 0.6668, "step": 3085 }, { "epoch": 0.47155900217748403, "grad_norm": 0.34900522232055664, "learning_rate": 0.00011546101155023908, "loss": 0.6623, "step": 3086 }, { "epoch": 0.4717118080757917, "grad_norm": 0.2772337794303894, "learning_rate": 0.00011541169572783386, "loss": 0.5601, "step": 3087 }, { "epoch": 0.4718646139740994, "grad_norm": 0.26819148659706116, "learning_rate": 0.00011536237606573405, "loss": 0.6573, "step": 3088 }, { "epoch": 0.4720174198724071, "grad_norm": 0.2884041368961334, "learning_rate": 0.00011531305257622717, "loss": 0.5774, "step": 3089 }, { "epoch": 0.47217022577071477, "grad_norm": 0.4495169222354889, "learning_rate": 0.00011526372527160183, "loss": 0.7284, "step": 3090 }, { "epoch": 0.4723230316690224, "grad_norm": 0.27755841612815857, "learning_rate": 0.00011521439416414746, "loss": 0.7151, "step": 3091 }, { "epoch": 0.4724758375673301, "grad_norm": 0.2918242812156677, "learning_rate": 0.00011516505926615444, "loss": 0.7234, "step": 3092 }, { "epoch": 0.47262864346563777, "grad_norm": 0.3533172607421875, "learning_rate": 0.00011511572058991426, "loss": 0.8371, "step": 3093 }, { "epoch": 0.47278144936394545, "grad_norm": 0.5401041507720947, "learning_rate": 0.00011506637814771915, "loss": 0.9416, "step": 3094 }, { "epoch": 0.47293425526225313, "grad_norm": 0.317081093788147, "learning_rate": 0.00011501703195186242, "loss": 0.7744, "step": 3095 }, { "epoch": 0.4730870611605608, "grad_norm": 0.3578571081161499, "learning_rate": 0.00011496768201463822, "loss": 0.6039, "step": 3096 }, { "epoch": 0.47323986705886845, "grad_norm": 0.683226466178894, "learning_rate": 0.00011491832834834171, "loss": 0.7333, "step": 3097 }, { "epoch": 0.47339267295717613, "grad_norm": 0.47472453117370605, "learning_rate": 0.00011486897096526888, "loss": 0.8873, "step": 3098 }, { "epoch": 0.4735454788554838, "grad_norm": 0.31787946820259094, "learning_rate": 0.00011481960987771678, "loss": 0.7204, "step": 3099 }, { "epoch": 0.4736982847537915, "grad_norm": 0.3622148931026459, "learning_rate": 0.00011477024509798326, "loss": 0.6251, "step": 3100 }, { "epoch": 0.4738510906520992, "grad_norm": 0.31107020378112793, "learning_rate": 0.00011472087663836718, "loss": 0.6451, "step": 3101 }, { "epoch": 0.47400389655040687, "grad_norm": 0.2645438015460968, "learning_rate": 0.00011467150451116823, "loss": 0.7023, "step": 3102 }, { "epoch": 0.4741567024487145, "grad_norm": 0.2966662049293518, "learning_rate": 0.00011462212872868712, "loss": 0.8464, "step": 3103 }, { "epoch": 0.4743095083470222, "grad_norm": 0.30942660570144653, "learning_rate": 0.00011457274930322534, "loss": 0.7057, "step": 3104 }, { "epoch": 0.47446231424532986, "grad_norm": 0.271252304315567, "learning_rate": 0.0001145233662470854, "loss": 0.619, "step": 3105 }, { "epoch": 0.47461512014363755, "grad_norm": 0.3281991183757782, "learning_rate": 0.00011447397957257071, "loss": 0.9169, "step": 3106 }, { "epoch": 0.47476792604194523, "grad_norm": 0.28666695952415466, "learning_rate": 0.00011442458929198549, "loss": 0.7189, "step": 3107 }, { "epoch": 0.4749207319402529, "grad_norm": 0.27483201026916504, "learning_rate": 0.00011437519541763493, "loss": 0.7052, "step": 3108 }, { "epoch": 0.47507353783856054, "grad_norm": 0.3456527590751648, "learning_rate": 0.0001143257979618251, "loss": 0.7226, "step": 3109 }, { "epoch": 0.47522634373686823, "grad_norm": 0.2994341254234314, "learning_rate": 0.00011427639693686296, "loss": 0.728, "step": 3110 }, { "epoch": 0.4753791496351759, "grad_norm": 0.29687169194221497, "learning_rate": 0.00011422699235505636, "loss": 0.7427, "step": 3111 }, { "epoch": 0.4755319555334836, "grad_norm": 0.3335234522819519, "learning_rate": 0.00011417758422871405, "loss": 0.6418, "step": 3112 }, { "epoch": 0.4756847614317913, "grad_norm": 0.41639548540115356, "learning_rate": 0.00011412817257014564, "loss": 0.7566, "step": 3113 }, { "epoch": 0.47583756733009897, "grad_norm": 0.31691673398017883, "learning_rate": 0.00011407875739166161, "loss": 0.6892, "step": 3114 }, { "epoch": 0.4759903732284066, "grad_norm": 0.28714266419410706, "learning_rate": 0.00011402933870557337, "loss": 0.7085, "step": 3115 }, { "epoch": 0.4761431791267143, "grad_norm": 0.2862628400325775, "learning_rate": 0.00011397991652419316, "loss": 0.7797, "step": 3116 }, { "epoch": 0.47629598502502196, "grad_norm": 0.2885003983974457, "learning_rate": 0.00011393049085983409, "loss": 0.81, "step": 3117 }, { "epoch": 0.47644879092332965, "grad_norm": 0.27381911873817444, "learning_rate": 0.00011388106172481016, "loss": 0.6638, "step": 3118 }, { "epoch": 0.47660159682163733, "grad_norm": 0.3181326687335968, "learning_rate": 0.00011383162913143624, "loss": 0.7114, "step": 3119 }, { "epoch": 0.476754402719945, "grad_norm": 0.3386448919773102, "learning_rate": 0.000113782193092028, "loss": 0.7061, "step": 3120 }, { "epoch": 0.47690720861825264, "grad_norm": 0.2852921485900879, "learning_rate": 0.00011373275361890205, "loss": 0.5549, "step": 3121 }, { "epoch": 0.4770600145165603, "grad_norm": 0.30625444650650024, "learning_rate": 0.00011368331072437584, "loss": 0.7699, "step": 3122 }, { "epoch": 0.477212820414868, "grad_norm": 0.4224965274333954, "learning_rate": 0.0001136338644207676, "loss": 0.7529, "step": 3123 }, { "epoch": 0.4773656263131757, "grad_norm": 0.33408239483833313, "learning_rate": 0.00011358441472039647, "loss": 0.625, "step": 3124 }, { "epoch": 0.4775184322114834, "grad_norm": 0.34229129552841187, "learning_rate": 0.00011353496163558246, "loss": 0.6759, "step": 3125 }, { "epoch": 0.47767123810979106, "grad_norm": 0.3091820776462555, "learning_rate": 0.00011348550517864638, "loss": 0.5886, "step": 3126 }, { "epoch": 0.4778240440080987, "grad_norm": 0.2753916382789612, "learning_rate": 0.00011343604536190988, "loss": 0.8108, "step": 3127 }, { "epoch": 0.4779768499064064, "grad_norm": 0.2937089800834656, "learning_rate": 0.00011338658219769546, "loss": 0.6251, "step": 3128 }, { "epoch": 0.47812965580471406, "grad_norm": 0.2915576100349426, "learning_rate": 0.00011333711569832645, "loss": 0.773, "step": 3129 }, { "epoch": 0.47828246170302174, "grad_norm": 0.31171897053718567, "learning_rate": 0.00011328764587612704, "loss": 0.5729, "step": 3130 }, { "epoch": 0.47843526760132943, "grad_norm": 0.4770534932613373, "learning_rate": 0.00011323817274342219, "loss": 0.7378, "step": 3131 }, { "epoch": 0.4785880734996371, "grad_norm": 0.30231431126594543, "learning_rate": 0.00011318869631253774, "loss": 0.7529, "step": 3132 }, { "epoch": 0.47874087939794474, "grad_norm": 0.3131800889968872, "learning_rate": 0.00011313921659580028, "loss": 0.8394, "step": 3133 }, { "epoch": 0.4788936852962524, "grad_norm": 0.3672395348548889, "learning_rate": 0.00011308973360553733, "loss": 0.9422, "step": 3134 }, { "epoch": 0.4790464911945601, "grad_norm": 0.2536657750606537, "learning_rate": 0.0001130402473540771, "loss": 0.759, "step": 3135 }, { "epoch": 0.4791992970928678, "grad_norm": 0.30961093306541443, "learning_rate": 0.00011299075785374875, "loss": 0.5457, "step": 3136 }, { "epoch": 0.4793521029911755, "grad_norm": 0.33329442143440247, "learning_rate": 0.00011294126511688205, "loss": 0.9315, "step": 3137 }, { "epoch": 0.47950490888948316, "grad_norm": 0.3517923355102539, "learning_rate": 0.00011289176915580784, "loss": 0.6728, "step": 3138 }, { "epoch": 0.4796577147877908, "grad_norm": 0.32341015338897705, "learning_rate": 0.00011284226998285756, "loss": 0.7087, "step": 3139 }, { "epoch": 0.4798105206860985, "grad_norm": 0.37149766087532043, "learning_rate": 0.0001127927676103635, "loss": 0.8427, "step": 3140 }, { "epoch": 0.47996332658440616, "grad_norm": 0.2929363250732422, "learning_rate": 0.00011274326205065879, "loss": 0.6859, "step": 3141 }, { "epoch": 0.48011613248271384, "grad_norm": 0.24660053849220276, "learning_rate": 0.00011269375331607728, "loss": 0.6897, "step": 3142 }, { "epoch": 0.4802689383810215, "grad_norm": 0.26271674036979675, "learning_rate": 0.00011264424141895373, "loss": 0.6369, "step": 3143 }, { "epoch": 0.48042174427932915, "grad_norm": 0.3054701089859009, "learning_rate": 0.00011259472637162352, "loss": 0.5811, "step": 3144 }, { "epoch": 0.48057455017763684, "grad_norm": 0.3352110683917999, "learning_rate": 0.000112545208186423, "loss": 0.8285, "step": 3145 }, { "epoch": 0.4807273560759445, "grad_norm": 0.31057208776474, "learning_rate": 0.00011249568687568914, "loss": 0.6465, "step": 3146 }, { "epoch": 0.4808801619742522, "grad_norm": 0.2694271504878998, "learning_rate": 0.00011244616245175981, "loss": 0.713, "step": 3147 }, { "epoch": 0.4810329678725599, "grad_norm": 0.35943877696990967, "learning_rate": 0.00011239663492697356, "loss": 0.7039, "step": 3148 }, { "epoch": 0.4811857737708676, "grad_norm": 0.32773950695991516, "learning_rate": 0.00011234710431366979, "loss": 0.6115, "step": 3149 }, { "epoch": 0.4813385796691752, "grad_norm": 0.2837180495262146, "learning_rate": 0.00011229757062418862, "loss": 0.7428, "step": 3150 }, { "epoch": 0.4814913855674829, "grad_norm": 0.3680180609226227, "learning_rate": 0.00011224803387087095, "loss": 0.8842, "step": 3151 }, { "epoch": 0.48164419146579057, "grad_norm": 0.3667493760585785, "learning_rate": 0.00011219849406605846, "loss": 0.6102, "step": 3152 }, { "epoch": 0.48179699736409826, "grad_norm": 0.35641342401504517, "learning_rate": 0.00011214895122209356, "loss": 0.7404, "step": 3153 }, { "epoch": 0.48194980326240594, "grad_norm": 0.28385525941848755, "learning_rate": 0.00011209940535131948, "loss": 0.6549, "step": 3154 }, { "epoch": 0.4821026091607136, "grad_norm": 0.2514529228210449, "learning_rate": 0.0001120498564660801, "loss": 0.6631, "step": 3155 }, { "epoch": 0.48225541505902125, "grad_norm": 0.2889954447746277, "learning_rate": 0.00011200030457872013, "loss": 0.6912, "step": 3156 }, { "epoch": 0.48240822095732894, "grad_norm": 1.1534295082092285, "learning_rate": 0.00011195074970158502, "loss": 0.676, "step": 3157 }, { "epoch": 0.4825610268556366, "grad_norm": 0.287183940410614, "learning_rate": 0.00011190119184702092, "loss": 0.6186, "step": 3158 }, { "epoch": 0.4827138327539443, "grad_norm": 0.2797063887119293, "learning_rate": 0.00011185163102737477, "loss": 0.6834, "step": 3159 }, { "epoch": 0.482866638652252, "grad_norm": 0.29181644320487976, "learning_rate": 0.00011180206725499424, "loss": 0.795, "step": 3160 }, { "epoch": 0.4830194445505597, "grad_norm": 0.37816306948661804, "learning_rate": 0.00011175250054222774, "loss": 0.6745, "step": 3161 }, { "epoch": 0.4831722504488673, "grad_norm": 0.2842831611633301, "learning_rate": 0.00011170293090142437, "loss": 0.6604, "step": 3162 }, { "epoch": 0.483325056347175, "grad_norm": 0.26622524857521057, "learning_rate": 0.000111653358344934, "loss": 0.6542, "step": 3163 }, { "epoch": 0.48347786224548267, "grad_norm": 0.3658379912376404, "learning_rate": 0.00011160378288510723, "loss": 0.7897, "step": 3164 }, { "epoch": 0.48363066814379035, "grad_norm": 0.42732903361320496, "learning_rate": 0.00011155420453429535, "loss": 0.5246, "step": 3165 }, { "epoch": 0.48378347404209804, "grad_norm": 0.36560025811195374, "learning_rate": 0.00011150462330485041, "loss": 0.5862, "step": 3166 }, { "epoch": 0.4839362799404057, "grad_norm": 0.4351115822792053, "learning_rate": 0.00011145503920912512, "loss": 0.7431, "step": 3167 }, { "epoch": 0.48408908583871335, "grad_norm": 0.34151631593704224, "learning_rate": 0.000111405452259473, "loss": 0.5993, "step": 3168 }, { "epoch": 0.48424189173702104, "grad_norm": 0.35425591468811035, "learning_rate": 0.00011135586246824817, "loss": 0.5834, "step": 3169 }, { "epoch": 0.4843946976353287, "grad_norm": 0.2991638481616974, "learning_rate": 0.00011130626984780554, "loss": 0.7526, "step": 3170 }, { "epoch": 0.4845475035336364, "grad_norm": 0.5623118281364441, "learning_rate": 0.00011125667441050069, "loss": 0.666, "step": 3171 }, { "epoch": 0.4847003094319441, "grad_norm": 0.2882367968559265, "learning_rate": 0.00011120707616868988, "loss": 0.7725, "step": 3172 }, { "epoch": 0.48485311533025177, "grad_norm": 0.2950005829334259, "learning_rate": 0.00011115747513473014, "loss": 0.6603, "step": 3173 }, { "epoch": 0.4850059212285594, "grad_norm": 0.27206265926361084, "learning_rate": 0.0001111078713209791, "loss": 0.847, "step": 3174 }, { "epoch": 0.4851587271268671, "grad_norm": 0.39594581723213196, "learning_rate": 0.0001110582647397952, "loss": 0.9321, "step": 3175 }, { "epoch": 0.48531153302517477, "grad_norm": 0.3641679584980011, "learning_rate": 0.00011100865540353744, "loss": 0.6959, "step": 3176 }, { "epoch": 0.48546433892348245, "grad_norm": 0.35023003816604614, "learning_rate": 0.0001109590433245656, "loss": 0.6577, "step": 3177 }, { "epoch": 0.48561714482179014, "grad_norm": 0.5135242342948914, "learning_rate": 0.00011090942851524013, "loss": 0.9438, "step": 3178 }, { "epoch": 0.4857699507200978, "grad_norm": 0.24862836301326752, "learning_rate": 0.00011085981098792208, "loss": 0.5999, "step": 3179 }, { "epoch": 0.48592275661840545, "grad_norm": 0.5486438870429993, "learning_rate": 0.00011081019075497332, "loss": 0.7452, "step": 3180 }, { "epoch": 0.48607556251671313, "grad_norm": 0.3016669452190399, "learning_rate": 0.00011076056782875625, "loss": 0.6285, "step": 3181 }, { "epoch": 0.4862283684150208, "grad_norm": 0.32901546359062195, "learning_rate": 0.00011071094222163408, "loss": 0.6339, "step": 3182 }, { "epoch": 0.4863811743133285, "grad_norm": 0.31634917855262756, "learning_rate": 0.0001106613139459705, "loss": 0.6858, "step": 3183 }, { "epoch": 0.4865339802116362, "grad_norm": 0.28542599081993103, "learning_rate": 0.00011061168301413009, "loss": 0.8819, "step": 3184 }, { "epoch": 0.48668678610994387, "grad_norm": 0.31034329533576965, "learning_rate": 0.0001105620494384779, "loss": 0.7413, "step": 3185 }, { "epoch": 0.4868395920082515, "grad_norm": 0.32563355565071106, "learning_rate": 0.00011051241323137978, "loss": 0.717, "step": 3186 }, { "epoch": 0.4869923979065592, "grad_norm": 0.278524249792099, "learning_rate": 0.00011046277440520214, "loss": 0.7499, "step": 3187 }, { "epoch": 0.48714520380486687, "grad_norm": 0.31609123945236206, "learning_rate": 0.00011041313297231206, "loss": 0.7538, "step": 3188 }, { "epoch": 0.48729800970317455, "grad_norm": 0.3462464213371277, "learning_rate": 0.00011036348894507735, "loss": 0.7642, "step": 3189 }, { "epoch": 0.48745081560148223, "grad_norm": 0.3006207048892975, "learning_rate": 0.00011031384233586633, "loss": 0.7188, "step": 3190 }, { "epoch": 0.4876036214997899, "grad_norm": 0.29584068059921265, "learning_rate": 0.0001102641931570481, "loss": 0.429, "step": 3191 }, { "epoch": 0.48775642739809755, "grad_norm": 0.25582364201545715, "learning_rate": 0.00011021454142099228, "loss": 0.6474, "step": 3192 }, { "epoch": 0.48790923329640523, "grad_norm": 0.32200515270233154, "learning_rate": 0.00011016488714006923, "loss": 0.6822, "step": 3193 }, { "epoch": 0.4880620391947129, "grad_norm": 0.29044628143310547, "learning_rate": 0.00011011523032664988, "loss": 0.5595, "step": 3194 }, { "epoch": 0.4882148450930206, "grad_norm": 0.3656401038169861, "learning_rate": 0.00011006557099310577, "loss": 0.8375, "step": 3195 }, { "epoch": 0.4883676509913283, "grad_norm": 0.3706183135509491, "learning_rate": 0.00011001590915180917, "loss": 0.751, "step": 3196 }, { "epoch": 0.4885204568896359, "grad_norm": 0.3113393485546112, "learning_rate": 0.00010996624481513287, "loss": 0.7639, "step": 3197 }, { "epoch": 0.4886732627879436, "grad_norm": 0.2899192273616791, "learning_rate": 0.00010991657799545033, "loss": 0.5524, "step": 3198 }, { "epoch": 0.4888260686862513, "grad_norm": 0.31966841220855713, "learning_rate": 0.00010986690870513559, "loss": 0.5835, "step": 3199 }, { "epoch": 0.48897887458455896, "grad_norm": 0.26261016726493835, "learning_rate": 0.00010981723695656343, "loss": 0.7348, "step": 3200 }, { "epoch": 0.48913168048286665, "grad_norm": 0.3918934762477875, "learning_rate": 0.00010976756276210907, "loss": 0.6722, "step": 3201 }, { "epoch": 0.48928448638117433, "grad_norm": 0.40297189354896545, "learning_rate": 0.00010971788613414843, "loss": 0.6896, "step": 3202 }, { "epoch": 0.48943729227948196, "grad_norm": 0.3552076816558838, "learning_rate": 0.00010966820708505805, "loss": 0.6717, "step": 3203 }, { "epoch": 0.48959009817778965, "grad_norm": 0.3047221899032593, "learning_rate": 0.00010961852562721502, "loss": 0.5305, "step": 3204 }, { "epoch": 0.48974290407609733, "grad_norm": 0.3160412013530731, "learning_rate": 0.00010956884177299707, "loss": 0.7559, "step": 3205 }, { "epoch": 0.489895709974405, "grad_norm": 0.2890625, "learning_rate": 0.00010951915553478252, "loss": 0.6041, "step": 3206 }, { "epoch": 0.4900485158727127, "grad_norm": 0.2591745853424072, "learning_rate": 0.00010946946692495029, "loss": 0.7393, "step": 3207 }, { "epoch": 0.4902013217710204, "grad_norm": 0.2744395136833191, "learning_rate": 0.00010941977595587985, "loss": 0.6317, "step": 3208 }, { "epoch": 0.490354127669328, "grad_norm": 0.2722474932670593, "learning_rate": 0.00010937008263995128, "loss": 0.6662, "step": 3209 }, { "epoch": 0.4905069335676357, "grad_norm": 0.33023321628570557, "learning_rate": 0.0001093203869895453, "loss": 0.7126, "step": 3210 }, { "epoch": 0.4906597394659434, "grad_norm": 0.6961508989334106, "learning_rate": 0.00010927068901704314, "loss": 0.6022, "step": 3211 }, { "epoch": 0.49081254536425106, "grad_norm": 0.2518894374370575, "learning_rate": 0.00010922098873482663, "loss": 0.6411, "step": 3212 }, { "epoch": 0.49096535126255875, "grad_norm": 0.3645883798599243, "learning_rate": 0.00010917128615527816, "loss": 0.7511, "step": 3213 }, { "epoch": 0.49111815716086643, "grad_norm": 0.4825361371040344, "learning_rate": 0.00010912158129078074, "loss": 0.9103, "step": 3214 }, { "epoch": 0.49127096305917406, "grad_norm": 0.32693371176719666, "learning_rate": 0.00010907187415371793, "loss": 0.8316, "step": 3215 }, { "epoch": 0.49142376895748174, "grad_norm": 0.2648088335990906, "learning_rate": 0.0001090221647564738, "loss": 0.6512, "step": 3216 }, { "epoch": 0.4915765748557894, "grad_norm": 0.28130269050598145, "learning_rate": 0.0001089724531114331, "loss": 0.6506, "step": 3217 }, { "epoch": 0.4917293807540971, "grad_norm": 0.34511005878448486, "learning_rate": 0.00010892273923098098, "loss": 0.7288, "step": 3218 }, { "epoch": 0.4918821866524048, "grad_norm": 0.29202011227607727, "learning_rate": 0.00010887302312750329, "loss": 0.5704, "step": 3219 }, { "epoch": 0.4920349925507125, "grad_norm": 0.2937288284301758, "learning_rate": 0.00010882330481338636, "loss": 0.8524, "step": 3220 }, { "epoch": 0.4921877984490201, "grad_norm": 0.3036741614341736, "learning_rate": 0.00010877358430101711, "loss": 0.5406, "step": 3221 }, { "epoch": 0.4923406043473278, "grad_norm": 0.2834756672382355, "learning_rate": 0.00010872386160278298, "loss": 0.5422, "step": 3222 }, { "epoch": 0.4924934102456355, "grad_norm": 0.2763515114784241, "learning_rate": 0.00010867413673107196, "loss": 0.9426, "step": 3223 }, { "epoch": 0.49264621614394316, "grad_norm": 0.2803753912448883, "learning_rate": 0.00010862440969827262, "loss": 0.6358, "step": 3224 }, { "epoch": 0.49279902204225084, "grad_norm": 0.47641652822494507, "learning_rate": 0.00010857468051677395, "loss": 0.9681, "step": 3225 }, { "epoch": 0.49295182794055853, "grad_norm": 0.29467979073524475, "learning_rate": 0.00010852494919896565, "loss": 0.566, "step": 3226 }, { "epoch": 0.49310463383886616, "grad_norm": 0.28544798493385315, "learning_rate": 0.00010847521575723778, "loss": 0.7639, "step": 3227 }, { "epoch": 0.49325743973717384, "grad_norm": 0.2804313898086548, "learning_rate": 0.00010842548020398106, "loss": 0.7826, "step": 3228 }, { "epoch": 0.4934102456354815, "grad_norm": 0.2962232530117035, "learning_rate": 0.00010837574255158667, "loss": 0.7477, "step": 3229 }, { "epoch": 0.4935630515337892, "grad_norm": 0.2538807988166809, "learning_rate": 0.00010832600281244635, "loss": 0.6508, "step": 3230 }, { "epoch": 0.4937158574320969, "grad_norm": 0.3388998806476593, "learning_rate": 0.0001082762609989523, "loss": 0.9114, "step": 3231 }, { "epoch": 0.4938686633304046, "grad_norm": 0.2898162007331848, "learning_rate": 0.00010822651712349729, "loss": 0.5826, "step": 3232 }, { "epoch": 0.4940214692287122, "grad_norm": 0.2569247782230377, "learning_rate": 0.00010817677119847463, "loss": 0.683, "step": 3233 }, { "epoch": 0.4941742751270199, "grad_norm": 0.31775936484336853, "learning_rate": 0.00010812702323627802, "loss": 0.7554, "step": 3234 }, { "epoch": 0.4943270810253276, "grad_norm": 0.38079357147216797, "learning_rate": 0.00010807727324930181, "loss": 0.6775, "step": 3235 }, { "epoch": 0.49447988692363526, "grad_norm": 0.7460795044898987, "learning_rate": 0.00010802752124994075, "loss": 0.6831, "step": 3236 }, { "epoch": 0.49463269282194294, "grad_norm": 0.3023420572280884, "learning_rate": 0.00010797776725059021, "loss": 0.8218, "step": 3237 }, { "epoch": 0.4947854987202506, "grad_norm": 0.25051984190940857, "learning_rate": 0.00010792801126364587, "loss": 0.4852, "step": 3238 }, { "epoch": 0.49493830461855826, "grad_norm": 0.28263378143310547, "learning_rate": 0.00010787825330150412, "loss": 0.7961, "step": 3239 }, { "epoch": 0.49509111051686594, "grad_norm": 0.2797674238681793, "learning_rate": 0.00010782849337656165, "loss": 0.6993, "step": 3240 }, { "epoch": 0.4952439164151736, "grad_norm": 0.29567739367485046, "learning_rate": 0.0001077787315012158, "loss": 0.6962, "step": 3241 }, { "epoch": 0.4953967223134813, "grad_norm": 0.8774082064628601, "learning_rate": 0.0001077289676878643, "loss": 0.663, "step": 3242 }, { "epoch": 0.495549528211789, "grad_norm": 0.4161388874053955, "learning_rate": 0.00010767920194890535, "loss": 0.665, "step": 3243 }, { "epoch": 0.4957023341100967, "grad_norm": 0.288461834192276, "learning_rate": 0.0001076294342967377, "loss": 0.5216, "step": 3244 }, { "epoch": 0.4958551400084043, "grad_norm": 0.3518747091293335, "learning_rate": 0.00010757966474376056, "loss": 0.6696, "step": 3245 }, { "epoch": 0.496007945906712, "grad_norm": 0.25768399238586426, "learning_rate": 0.00010752989330237355, "loss": 0.5461, "step": 3246 }, { "epoch": 0.4961607518050197, "grad_norm": 0.2731454372406006, "learning_rate": 0.00010748011998497682, "loss": 0.7564, "step": 3247 }, { "epoch": 0.49631355770332736, "grad_norm": 0.4299314320087433, "learning_rate": 0.00010743034480397103, "loss": 0.732, "step": 3248 }, { "epoch": 0.49646636360163504, "grad_norm": 0.30067208409309387, "learning_rate": 0.00010738056777175717, "loss": 0.5602, "step": 3249 }, { "epoch": 0.4966191694999427, "grad_norm": 0.30349549651145935, "learning_rate": 0.00010733078890073683, "loss": 0.7734, "step": 3250 }, { "epoch": 0.49677197539825035, "grad_norm": 0.7365745902061462, "learning_rate": 0.00010728100820331195, "loss": 0.6051, "step": 3251 }, { "epoch": 0.49692478129655804, "grad_norm": 0.30216264724731445, "learning_rate": 0.000107231225691885, "loss": 0.8426, "step": 3252 }, { "epoch": 0.4970775871948657, "grad_norm": 0.2875060737133026, "learning_rate": 0.00010718144137885888, "loss": 0.6761, "step": 3253 }, { "epoch": 0.4972303930931734, "grad_norm": 0.3124886453151703, "learning_rate": 0.00010713165527663691, "loss": 0.6802, "step": 3254 }, { "epoch": 0.4973831989914811, "grad_norm": 0.2875783443450928, "learning_rate": 0.0001070818673976229, "loss": 0.6805, "step": 3255 }, { "epoch": 0.4975360048897887, "grad_norm": 0.31740424036979675, "learning_rate": 0.00010703207775422106, "loss": 0.5709, "step": 3256 }, { "epoch": 0.4976888107880964, "grad_norm": 0.3055468201637268, "learning_rate": 0.0001069822863588361, "loss": 0.5894, "step": 3257 }, { "epoch": 0.4978416166864041, "grad_norm": 0.2838101089000702, "learning_rate": 0.00010693249322387309, "loss": 0.6071, "step": 3258 }, { "epoch": 0.49799442258471177, "grad_norm": 0.29591605067253113, "learning_rate": 0.00010688269836173759, "loss": 0.7489, "step": 3259 }, { "epoch": 0.49814722848301946, "grad_norm": 0.26190677285194397, "learning_rate": 0.00010683290178483556, "loss": 0.7207, "step": 3260 }, { "epoch": 0.49830003438132714, "grad_norm": 0.3020467162132263, "learning_rate": 0.00010678310350557341, "loss": 0.7131, "step": 3261 }, { "epoch": 0.49845284027963477, "grad_norm": 0.27667558193206787, "learning_rate": 0.00010673330353635798, "loss": 0.7659, "step": 3262 }, { "epoch": 0.49860564617794245, "grad_norm": 0.3452799320220947, "learning_rate": 0.00010668350188959649, "loss": 0.5658, "step": 3263 }, { "epoch": 0.49875845207625014, "grad_norm": 0.2541445791721344, "learning_rate": 0.00010663369857769658, "loss": 0.8587, "step": 3264 }, { "epoch": 0.4989112579745578, "grad_norm": 0.4348546266555786, "learning_rate": 0.0001065838936130664, "loss": 0.7917, "step": 3265 }, { "epoch": 0.4990640638728655, "grad_norm": 1.7747349739074707, "learning_rate": 0.00010653408700811433, "loss": 0.5763, "step": 3266 }, { "epoch": 0.4992168697711732, "grad_norm": 0.23470017313957214, "learning_rate": 0.00010648427877524938, "loss": 0.6186, "step": 3267 }, { "epoch": 0.4993696756694808, "grad_norm": 0.36335036158561707, "learning_rate": 0.00010643446892688078, "loss": 0.8022, "step": 3268 }, { "epoch": 0.4995224815677885, "grad_norm": 0.3045618236064911, "learning_rate": 0.00010638465747541828, "loss": 0.5187, "step": 3269 }, { "epoch": 0.4996752874660962, "grad_norm": 0.29446032643318176, "learning_rate": 0.00010633484443327195, "loss": 0.5423, "step": 3270 }, { "epoch": 0.49982809336440387, "grad_norm": 0.33767345547676086, "learning_rate": 0.0001062850298128523, "loss": 0.679, "step": 3271 }, { "epoch": 0.49998089926271155, "grad_norm": 0.3081493079662323, "learning_rate": 0.00010623521362657025, "loss": 0.6156, "step": 3272 }, { "epoch": 0.5001337051610192, "grad_norm": 0.2943879961967468, "learning_rate": 0.00010618539588683705, "loss": 0.4835, "step": 3273 }, { "epoch": 0.5002865110593269, "grad_norm": 0.2678261697292328, "learning_rate": 0.00010613557660606441, "loss": 0.7285, "step": 3274 }, { "epoch": 0.5004393169576346, "grad_norm": 0.3967953324317932, "learning_rate": 0.0001060857557966643, "loss": 0.6834, "step": 3275 }, { "epoch": 0.5005921228559422, "grad_norm": 0.36574381589889526, "learning_rate": 0.0001060359334710493, "loss": 0.6326, "step": 3276 }, { "epoch": 0.5007449287542499, "grad_norm": 0.3894730806350708, "learning_rate": 0.00010598610964163208, "loss": 0.6009, "step": 3277 }, { "epoch": 0.5008977346525576, "grad_norm": 0.2868845462799072, "learning_rate": 0.00010593628432082594, "loss": 0.7465, "step": 3278 }, { "epoch": 0.5010505405508653, "grad_norm": 0.26092529296875, "learning_rate": 0.00010588645752104433, "loss": 0.6455, "step": 3279 }, { "epoch": 0.501203346449173, "grad_norm": 0.3582485318183899, "learning_rate": 0.00010583662925470128, "loss": 0.8203, "step": 3280 }, { "epoch": 0.5013561523474807, "grad_norm": 0.31029212474823, "learning_rate": 0.00010578679953421106, "loss": 0.7229, "step": 3281 }, { "epoch": 0.5015089582457883, "grad_norm": 0.36049965023994446, "learning_rate": 0.0001057369683719883, "loss": 0.8482, "step": 3282 }, { "epoch": 0.5016617641440959, "grad_norm": 0.38351500034332275, "learning_rate": 0.00010568713578044805, "loss": 0.794, "step": 3283 }, { "epoch": 0.5018145700424036, "grad_norm": 0.3084133565425873, "learning_rate": 0.0001056373017720056, "loss": 0.8044, "step": 3284 }, { "epoch": 0.5019673759407113, "grad_norm": 0.4007570445537567, "learning_rate": 0.0001055874663590768, "loss": 0.6948, "step": 3285 }, { "epoch": 0.502120181839019, "grad_norm": 0.3142980635166168, "learning_rate": 0.00010553762955407757, "loss": 0.7865, "step": 3286 }, { "epoch": 0.5022729877373266, "grad_norm": 0.30172571539878845, "learning_rate": 0.0001054877913694245, "loss": 0.7177, "step": 3287 }, { "epoch": 0.5024257936356343, "grad_norm": 0.3817455470561981, "learning_rate": 0.00010543795181753427, "loss": 0.7549, "step": 3288 }, { "epoch": 0.502578599533942, "grad_norm": 0.9309012293815613, "learning_rate": 0.00010538811091082397, "loss": 0.747, "step": 3289 }, { "epoch": 0.5027314054322497, "grad_norm": 0.31485238671302795, "learning_rate": 0.00010533826866171108, "loss": 0.8134, "step": 3290 }, { "epoch": 0.5028842113305574, "grad_norm": 0.3265262842178345, "learning_rate": 0.00010528842508261334, "loss": 0.5837, "step": 3291 }, { "epoch": 0.5030370172288651, "grad_norm": 0.3494139313697815, "learning_rate": 0.0001052385801859489, "loss": 0.7537, "step": 3292 }, { "epoch": 0.5031898231271728, "grad_norm": 0.2907181978225708, "learning_rate": 0.00010518873398413616, "loss": 0.7375, "step": 3293 }, { "epoch": 0.5033426290254804, "grad_norm": 0.2753676474094391, "learning_rate": 0.00010513888648959394, "loss": 0.7807, "step": 3294 }, { "epoch": 0.503495434923788, "grad_norm": 0.2893278896808624, "learning_rate": 0.00010508903771474128, "loss": 0.6039, "step": 3295 }, { "epoch": 0.5036482408220957, "grad_norm": 0.2934708297252655, "learning_rate": 0.00010503918767199758, "loss": 0.6074, "step": 3296 }, { "epoch": 0.5038010467204034, "grad_norm": 0.2802904546260834, "learning_rate": 0.00010498933637378257, "loss": 0.605, "step": 3297 }, { "epoch": 0.5039538526187111, "grad_norm": 0.29273319244384766, "learning_rate": 0.00010493948383251628, "loss": 0.845, "step": 3298 }, { "epoch": 0.5041066585170187, "grad_norm": 0.25674715638160706, "learning_rate": 0.00010488963006061907, "loss": 0.7262, "step": 3299 }, { "epoch": 0.5042594644153264, "grad_norm": 0.4300982654094696, "learning_rate": 0.00010483977507051157, "loss": 0.6937, "step": 3300 }, { "epoch": 0.5044122703136341, "grad_norm": 0.3364725410938263, "learning_rate": 0.00010478991887461473, "loss": 0.5855, "step": 3301 }, { "epoch": 0.5045650762119418, "grad_norm": 0.2849768400192261, "learning_rate": 0.00010474006148534983, "loss": 0.7837, "step": 3302 }, { "epoch": 0.5047178821102495, "grad_norm": 0.2889060378074646, "learning_rate": 0.00010469020291513838, "loss": 0.5903, "step": 3303 }, { "epoch": 0.5048706880085572, "grad_norm": 0.2896782457828522, "learning_rate": 0.00010464034317640225, "loss": 0.599, "step": 3304 }, { "epoch": 0.5050234939068649, "grad_norm": 0.26331770420074463, "learning_rate": 0.00010459048228156356, "loss": 0.6462, "step": 3305 }, { "epoch": 0.5051762998051725, "grad_norm": 0.37208205461502075, "learning_rate": 0.00010454062024304476, "loss": 0.8038, "step": 3306 }, { "epoch": 0.5053291057034801, "grad_norm": 0.41795673966407776, "learning_rate": 0.00010449075707326855, "loss": 0.7771, "step": 3307 }, { "epoch": 0.5054819116017878, "grad_norm": 0.3807390034198761, "learning_rate": 0.0001044408927846579, "loss": 0.7304, "step": 3308 }, { "epoch": 0.5056347175000955, "grad_norm": 0.33464887738227844, "learning_rate": 0.00010439102738963609, "loss": 0.8507, "step": 3309 }, { "epoch": 0.5057875233984032, "grad_norm": 0.3084365129470825, "learning_rate": 0.00010434116090062664, "loss": 0.73, "step": 3310 }, { "epoch": 0.5059403292967108, "grad_norm": 0.2747865319252014, "learning_rate": 0.00010429129333005345, "loss": 0.7288, "step": 3311 }, { "epoch": 0.5060931351950185, "grad_norm": 0.46816909313201904, "learning_rate": 0.00010424142469034048, "loss": 0.716, "step": 3312 }, { "epoch": 0.5062459410933262, "grad_norm": 0.425784170627594, "learning_rate": 0.0001041915549939122, "loss": 0.5491, "step": 3313 }, { "epoch": 0.5063987469916339, "grad_norm": 0.3221166431903839, "learning_rate": 0.00010414168425319315, "loss": 0.7381, "step": 3314 }, { "epoch": 0.5065515528899416, "grad_norm": 0.608630359172821, "learning_rate": 0.00010409181248060827, "loss": 0.5901, "step": 3315 }, { "epoch": 0.5067043587882493, "grad_norm": 0.28582873940467834, "learning_rate": 0.00010404193968858262, "loss": 0.6935, "step": 3316 }, { "epoch": 0.506857164686557, "grad_norm": 0.29004615545272827, "learning_rate": 0.00010399206588954164, "loss": 0.6994, "step": 3317 }, { "epoch": 0.5070099705848645, "grad_norm": 0.2937512993812561, "learning_rate": 0.00010394219109591096, "loss": 0.8092, "step": 3318 }, { "epoch": 0.5071627764831722, "grad_norm": 0.2914525270462036, "learning_rate": 0.00010389231532011647, "loss": 0.801, "step": 3319 }, { "epoch": 0.5073155823814799, "grad_norm": 0.2659449875354767, "learning_rate": 0.00010384243857458428, "loss": 0.6694, "step": 3320 }, { "epoch": 0.5074683882797876, "grad_norm": 0.29074615240097046, "learning_rate": 0.00010379256087174076, "loss": 0.5927, "step": 3321 }, { "epoch": 0.5076211941780953, "grad_norm": 0.33049747347831726, "learning_rate": 0.00010374268222401258, "loss": 0.7625, "step": 3322 }, { "epoch": 0.5077740000764029, "grad_norm": 0.2912755310535431, "learning_rate": 0.00010369280264382648, "loss": 0.668, "step": 3323 }, { "epoch": 0.5079268059747106, "grad_norm": 0.298967182636261, "learning_rate": 0.00010364292214360965, "loss": 0.625, "step": 3324 }, { "epoch": 0.5080796118730183, "grad_norm": 0.26732969284057617, "learning_rate": 0.0001035930407357893, "loss": 0.6825, "step": 3325 }, { "epoch": 0.508232417771326, "grad_norm": 0.27220967411994934, "learning_rate": 0.00010354315843279306, "loss": 0.85, "step": 3326 }, { "epoch": 0.5083852236696337, "grad_norm": 0.2452717274427414, "learning_rate": 0.00010349327524704862, "loss": 0.66, "step": 3327 }, { "epoch": 0.5085380295679414, "grad_norm": 0.2734704613685608, "learning_rate": 0.00010344339119098394, "loss": 0.7091, "step": 3328 }, { "epoch": 0.508690835466249, "grad_norm": 0.30528584122657776, "learning_rate": 0.0001033935062770273, "loss": 0.7044, "step": 3329 }, { "epoch": 0.5088436413645566, "grad_norm": 0.26126575469970703, "learning_rate": 0.00010334362051760703, "loss": 0.7252, "step": 3330 }, { "epoch": 0.5089964472628643, "grad_norm": 0.27342644333839417, "learning_rate": 0.00010329373392515179, "loss": 0.57, "step": 3331 }, { "epoch": 0.509149253161172, "grad_norm": 0.26855266094207764, "learning_rate": 0.00010324384651209036, "loss": 0.6485, "step": 3332 }, { "epoch": 0.5093020590594797, "grad_norm": 0.26671916246414185, "learning_rate": 0.00010319395829085184, "loss": 0.7488, "step": 3333 }, { "epoch": 0.5094548649577874, "grad_norm": 0.2993987500667572, "learning_rate": 0.0001031440692738654, "loss": 0.6563, "step": 3334 }, { "epoch": 0.509607670856095, "grad_norm": 0.2229076474905014, "learning_rate": 0.0001030941794735605, "loss": 0.4909, "step": 3335 }, { "epoch": 0.5097604767544027, "grad_norm": 0.2941783666610718, "learning_rate": 0.00010304428890236678, "loss": 0.7214, "step": 3336 }, { "epoch": 0.5099132826527104, "grad_norm": 0.2748726010322571, "learning_rate": 0.00010299439757271399, "loss": 0.6889, "step": 3337 }, { "epoch": 0.5100660885510181, "grad_norm": 0.2850393056869507, "learning_rate": 0.00010294450549703221, "loss": 0.86, "step": 3338 }, { "epoch": 0.5102188944493258, "grad_norm": 1.257244348526001, "learning_rate": 0.00010289461268775157, "loss": 0.6314, "step": 3339 }, { "epoch": 0.5103717003476335, "grad_norm": 0.28113579750061035, "learning_rate": 0.00010284471915730252, "loss": 0.8423, "step": 3340 }, { "epoch": 0.5105245062459411, "grad_norm": 0.2960244119167328, "learning_rate": 0.00010279482491811554, "loss": 0.6526, "step": 3341 }, { "epoch": 0.5106773121442487, "grad_norm": 0.26911747455596924, "learning_rate": 0.00010274492998262142, "loss": 0.7716, "step": 3342 }, { "epoch": 0.5108301180425564, "grad_norm": 0.29852014780044556, "learning_rate": 0.000102695034363251, "loss": 0.5997, "step": 3343 }, { "epoch": 0.5109829239408641, "grad_norm": 0.28390073776245117, "learning_rate": 0.00010264513807243543, "loss": 0.7266, "step": 3344 }, { "epoch": 0.5111357298391718, "grad_norm": 0.31037935614585876, "learning_rate": 0.00010259524112260591, "loss": 0.5311, "step": 3345 }, { "epoch": 0.5112885357374795, "grad_norm": 0.35973161458969116, "learning_rate": 0.00010254534352619381, "loss": 0.9332, "step": 3346 }, { "epoch": 0.5114413416357871, "grad_norm": 0.286542683839798, "learning_rate": 0.00010249544529563077, "loss": 0.7231, "step": 3347 }, { "epoch": 0.5115941475340948, "grad_norm": 0.31040236353874207, "learning_rate": 0.00010244554644334847, "loss": 0.8314, "step": 3348 }, { "epoch": 0.5117469534324025, "grad_norm": 0.29848411679267883, "learning_rate": 0.00010239564698177879, "loss": 0.7519, "step": 3349 }, { "epoch": 0.5118997593307102, "grad_norm": 0.2744828760623932, "learning_rate": 0.0001023457469233538, "loss": 0.6148, "step": 3350 }, { "epoch": 0.5120525652290179, "grad_norm": 0.30576545000076294, "learning_rate": 0.00010229584628050563, "loss": 0.5859, "step": 3351 }, { "epoch": 0.5122053711273256, "grad_norm": 0.27415305376052856, "learning_rate": 0.00010224594506566667, "loss": 0.6705, "step": 3352 }, { "epoch": 0.5123581770256332, "grad_norm": 0.30824410915374756, "learning_rate": 0.0001021960432912693, "loss": 0.8869, "step": 3353 }, { "epoch": 0.5125109829239408, "grad_norm": 0.2593754529953003, "learning_rate": 0.00010214614096974622, "loss": 0.7246, "step": 3354 }, { "epoch": 0.5126637888222485, "grad_norm": 0.30506977438926697, "learning_rate": 0.00010209623811353011, "loss": 0.8341, "step": 3355 }, { "epoch": 0.5128165947205562, "grad_norm": 0.2997819483280182, "learning_rate": 0.00010204633473505388, "loss": 0.6893, "step": 3356 }, { "epoch": 0.5129694006188639, "grad_norm": 0.3118533194065094, "learning_rate": 0.00010199643084675052, "loss": 0.914, "step": 3357 }, { "epoch": 0.5131222065171716, "grad_norm": 0.29679909348487854, "learning_rate": 0.00010194652646105318, "loss": 0.7542, "step": 3358 }, { "epoch": 0.5132750124154792, "grad_norm": 0.3198535144329071, "learning_rate": 0.00010189662159039512, "loss": 0.7142, "step": 3359 }, { "epoch": 0.5134278183137869, "grad_norm": 0.2824925482273102, "learning_rate": 0.0001018467162472097, "loss": 0.6344, "step": 3360 }, { "epoch": 0.5135806242120946, "grad_norm": 0.3698297441005707, "learning_rate": 0.00010179681044393042, "loss": 0.7198, "step": 3361 }, { "epoch": 0.5137334301104023, "grad_norm": 0.28322651982307434, "learning_rate": 0.0001017469041929909, "loss": 0.74, "step": 3362 }, { "epoch": 0.51388623600871, "grad_norm": 0.3029322326183319, "learning_rate": 0.00010169699750682489, "loss": 0.7064, "step": 3363 }, { "epoch": 0.5140390419070177, "grad_norm": 0.2545001208782196, "learning_rate": 0.00010164709039786618, "loss": 0.8169, "step": 3364 }, { "epoch": 0.5141918478053252, "grad_norm": 0.28603988885879517, "learning_rate": 0.00010159718287854871, "loss": 0.7604, "step": 3365 }, { "epoch": 0.5143446537036329, "grad_norm": 0.3488546907901764, "learning_rate": 0.00010154727496130658, "loss": 0.6961, "step": 3366 }, { "epoch": 0.5144974596019406, "grad_norm": 0.36552485823631287, "learning_rate": 0.00010149736665857382, "loss": 0.7482, "step": 3367 }, { "epoch": 0.5146502655002483, "grad_norm": 0.6362305283546448, "learning_rate": 0.00010144745798278479, "loss": 0.8138, "step": 3368 }, { "epoch": 0.514803071398556, "grad_norm": 0.28395330905914307, "learning_rate": 0.00010139754894637367, "loss": 0.7591, "step": 3369 }, { "epoch": 0.5149558772968637, "grad_norm": 0.26198312640190125, "learning_rate": 0.00010134763956177504, "loss": 0.6243, "step": 3370 }, { "epoch": 0.5151086831951713, "grad_norm": 0.2880837619304657, "learning_rate": 0.00010129772984142328, "loss": 0.6279, "step": 3371 }, { "epoch": 0.515261489093479, "grad_norm": 0.28449442982673645, "learning_rate": 0.00010124781979775307, "loss": 0.7934, "step": 3372 }, { "epoch": 0.5154142949917867, "grad_norm": 0.29876309633255005, "learning_rate": 0.00010119790944319899, "loss": 0.8046, "step": 3373 }, { "epoch": 0.5155671008900944, "grad_norm": 0.4107857048511505, "learning_rate": 0.00010114799879019581, "loss": 0.8078, "step": 3374 }, { "epoch": 0.5157199067884021, "grad_norm": 0.3255639374256134, "learning_rate": 0.00010109808785117843, "loss": 0.8144, "step": 3375 }, { "epoch": 0.5158727126867098, "grad_norm": 0.3160342276096344, "learning_rate": 0.00010104817663858161, "loss": 0.932, "step": 3376 }, { "epoch": 0.5160255185850173, "grad_norm": 0.35046565532684326, "learning_rate": 0.00010099826516484045, "loss": 0.7134, "step": 3377 }, { "epoch": 0.516178324483325, "grad_norm": 0.29910796880722046, "learning_rate": 0.00010094835344238984, "loss": 0.8236, "step": 3378 }, { "epoch": 0.5163311303816327, "grad_norm": 0.2847612202167511, "learning_rate": 0.00010089844148366498, "loss": 0.7021, "step": 3379 }, { "epoch": 0.5164839362799404, "grad_norm": 0.37408819794654846, "learning_rate": 0.00010084852930110094, "loss": 0.5381, "step": 3380 }, { "epoch": 0.5166367421782481, "grad_norm": 0.3474291265010834, "learning_rate": 0.00010079861690713297, "loss": 0.7535, "step": 3381 }, { "epoch": 0.5167895480765557, "grad_norm": 0.4439990818500519, "learning_rate": 0.00010074870431419627, "loss": 0.9417, "step": 3382 }, { "epoch": 0.5169423539748634, "grad_norm": 0.2557135224342346, "learning_rate": 0.0001006987915347262, "loss": 0.5236, "step": 3383 }, { "epoch": 0.5170951598731711, "grad_norm": 0.2894841730594635, "learning_rate": 0.00010064887858115808, "loss": 0.6814, "step": 3384 }, { "epoch": 0.5172479657714788, "grad_norm": 0.3533530533313751, "learning_rate": 0.00010059896546592729, "loss": 0.4942, "step": 3385 }, { "epoch": 0.5174007716697865, "grad_norm": 0.33828791975975037, "learning_rate": 0.0001005490522014693, "loss": 0.9148, "step": 3386 }, { "epoch": 0.5175535775680942, "grad_norm": 0.291148841381073, "learning_rate": 0.00010049913880021956, "loss": 0.7756, "step": 3387 }, { "epoch": 0.5177063834664019, "grad_norm": 0.477228581905365, "learning_rate": 0.00010044922527461358, "loss": 0.8127, "step": 3388 }, { "epoch": 0.5178591893647094, "grad_norm": 0.31533282995224, "learning_rate": 0.00010039931163708686, "loss": 0.6602, "step": 3389 }, { "epoch": 0.5180119952630171, "grad_norm": 0.31487801671028137, "learning_rate": 0.00010034939790007504, "loss": 0.7307, "step": 3390 }, { "epoch": 0.5181648011613248, "grad_norm": 0.2877635657787323, "learning_rate": 0.00010029948407601366, "loss": 0.7646, "step": 3391 }, { "epoch": 0.5183176070596325, "grad_norm": 0.5219588279724121, "learning_rate": 0.00010024957017733834, "loss": 0.6373, "step": 3392 }, { "epoch": 0.5184704129579402, "grad_norm": 0.34876278042793274, "learning_rate": 0.00010019965621648468, "loss": 0.5714, "step": 3393 }, { "epoch": 0.5186232188562478, "grad_norm": 0.3323829174041748, "learning_rate": 0.00010014974220588838, "loss": 0.5746, "step": 3394 }, { "epoch": 0.5187760247545555, "grad_norm": 0.3449549674987793, "learning_rate": 0.00010009982815798504, "loss": 0.7553, "step": 3395 }, { "epoch": 0.5189288306528632, "grad_norm": 0.3018842935562134, "learning_rate": 0.00010004991408521036, "loss": 0.7412, "step": 3396 }, { "epoch": 0.5190816365511709, "grad_norm": 0.31991279125213623, "learning_rate": 0.0001, "loss": 0.6534, "step": 3397 }, { "epoch": 0.5192344424494786, "grad_norm": 0.2634223699569702, "learning_rate": 9.995008591478966e-05, "loss": 0.6144, "step": 3398 }, { "epoch": 0.5193872483477863, "grad_norm": 0.36058294773101807, "learning_rate": 9.9900171842015e-05, "loss": 0.7775, "step": 3399 }, { "epoch": 0.519540054246094, "grad_norm": 0.287720263004303, "learning_rate": 9.985025779411166e-05, "loss": 0.7882, "step": 3400 }, { "epoch": 0.5196928601444015, "grad_norm": 0.26958808302879333, "learning_rate": 9.980034378351534e-05, "loss": 0.6573, "step": 3401 }, { "epoch": 0.5198456660427092, "grad_norm": 0.9140129685401917, "learning_rate": 9.975042982266167e-05, "loss": 0.7488, "step": 3402 }, { "epoch": 0.5199984719410169, "grad_norm": 0.30511972308158875, "learning_rate": 9.970051592398638e-05, "loss": 0.6557, "step": 3403 }, { "epoch": 0.5201512778393246, "grad_norm": 0.2656531035900116, "learning_rate": 9.965060209992497e-05, "loss": 0.6858, "step": 3404 }, { "epoch": 0.5203040837376323, "grad_norm": 0.4538237452507019, "learning_rate": 9.960068836291315e-05, "loss": 0.8245, "step": 3405 }, { "epoch": 0.52045688963594, "grad_norm": 0.3917170763015747, "learning_rate": 9.955077472538647e-05, "loss": 0.8073, "step": 3406 }, { "epoch": 0.5206096955342476, "grad_norm": 0.32771754264831543, "learning_rate": 9.950086119978045e-05, "loss": 0.5978, "step": 3407 }, { "epoch": 0.5207625014325553, "grad_norm": 0.640074610710144, "learning_rate": 9.945094779853073e-05, "loss": 0.9897, "step": 3408 }, { "epoch": 0.520915307330863, "grad_norm": 0.5286215543746948, "learning_rate": 9.940103453407272e-05, "loss": 0.7344, "step": 3409 }, { "epoch": 0.5210681132291707, "grad_norm": 0.31370532512664795, "learning_rate": 9.935112141884197e-05, "loss": 0.6146, "step": 3410 }, { "epoch": 0.5212209191274784, "grad_norm": 0.2929065525531769, "learning_rate": 9.930120846527381e-05, "loss": 0.7299, "step": 3411 }, { "epoch": 0.521373725025786, "grad_norm": 0.2866988182067871, "learning_rate": 9.925129568580375e-05, "loss": 0.6022, "step": 3412 }, { "epoch": 0.5215265309240936, "grad_norm": 0.2536863088607788, "learning_rate": 9.920138309286708e-05, "loss": 0.6714, "step": 3413 }, { "epoch": 0.5216793368224013, "grad_norm": 0.27033594250679016, "learning_rate": 9.91514706988991e-05, "loss": 0.8957, "step": 3414 }, { "epoch": 0.521832142720709, "grad_norm": 0.32144030928611755, "learning_rate": 9.910155851633504e-05, "loss": 0.8017, "step": 3415 }, { "epoch": 0.5219849486190167, "grad_norm": 0.27790653705596924, "learning_rate": 9.905164655761016e-05, "loss": 0.698, "step": 3416 }, { "epoch": 0.5221377545173244, "grad_norm": 0.3195480704307556, "learning_rate": 9.90017348351596e-05, "loss": 0.6499, "step": 3417 }, { "epoch": 0.522290560415632, "grad_norm": 0.3057548999786377, "learning_rate": 9.89518233614184e-05, "loss": 0.7596, "step": 3418 }, { "epoch": 0.5224433663139397, "grad_norm": 0.30440661311149597, "learning_rate": 9.89019121488216e-05, "loss": 0.665, "step": 3419 }, { "epoch": 0.5225961722122474, "grad_norm": 0.2874852120876312, "learning_rate": 9.885200120980418e-05, "loss": 0.6549, "step": 3420 }, { "epoch": 0.5227489781105551, "grad_norm": 0.31353822350502014, "learning_rate": 9.880209055680105e-05, "loss": 0.801, "step": 3421 }, { "epoch": 0.5229017840088628, "grad_norm": 0.2793009579181671, "learning_rate": 9.875218020224696e-05, "loss": 0.7663, "step": 3422 }, { "epoch": 0.5230545899071705, "grad_norm": 0.23979683220386505, "learning_rate": 9.870227015857672e-05, "loss": 0.5808, "step": 3423 }, { "epoch": 0.523207395805478, "grad_norm": 0.29966726899147034, "learning_rate": 9.8652360438225e-05, "loss": 0.7144, "step": 3424 }, { "epoch": 0.5233602017037857, "grad_norm": 0.3735535144805908, "learning_rate": 9.860245105362634e-05, "loss": 0.663, "step": 3425 }, { "epoch": 0.5235130076020934, "grad_norm": 0.28507325053215027, "learning_rate": 9.855254201721524e-05, "loss": 0.7955, "step": 3426 }, { "epoch": 0.5236658135004011, "grad_norm": 0.26180824637413025, "learning_rate": 9.850263334142618e-05, "loss": 0.6727, "step": 3427 }, { "epoch": 0.5238186193987088, "grad_norm": 0.3200896680355072, "learning_rate": 9.845272503869347e-05, "loss": 0.4995, "step": 3428 }, { "epoch": 0.5239714252970165, "grad_norm": 0.31497979164123535, "learning_rate": 9.840281712145131e-05, "loss": 0.6823, "step": 3429 }, { "epoch": 0.5241242311953241, "grad_norm": 0.3218442499637604, "learning_rate": 9.835290960213383e-05, "loss": 0.7584, "step": 3430 }, { "epoch": 0.5242770370936318, "grad_norm": 0.26961660385131836, "learning_rate": 9.830300249317515e-05, "loss": 0.7869, "step": 3431 }, { "epoch": 0.5244298429919395, "grad_norm": 0.3074052035808563, "learning_rate": 9.82530958070091e-05, "loss": 0.7193, "step": 3432 }, { "epoch": 0.5245826488902472, "grad_norm": 0.2740161418914795, "learning_rate": 9.82031895560696e-05, "loss": 0.5419, "step": 3433 }, { "epoch": 0.5247354547885549, "grad_norm": 0.2755180299282074, "learning_rate": 9.815328375279031e-05, "loss": 0.5791, "step": 3434 }, { "epoch": 0.5248882606868626, "grad_norm": 0.3647940456867218, "learning_rate": 9.810337840960491e-05, "loss": 0.8048, "step": 3435 }, { "epoch": 0.5250410665851701, "grad_norm": 0.26341575384140015, "learning_rate": 9.805347353894684e-05, "loss": 0.788, "step": 3436 }, { "epoch": 0.5251938724834778, "grad_norm": 0.275977224111557, "learning_rate": 9.800356915324948e-05, "loss": 0.7517, "step": 3437 }, { "epoch": 0.5253466783817855, "grad_norm": 0.28197404742240906, "learning_rate": 9.795366526494617e-05, "loss": 0.654, "step": 3438 }, { "epoch": 0.5254994842800932, "grad_norm": 0.30565693974494934, "learning_rate": 9.790376188646992e-05, "loss": 0.6654, "step": 3439 }, { "epoch": 0.5256522901784009, "grad_norm": 0.31840893626213074, "learning_rate": 9.78538590302538e-05, "loss": 0.9181, "step": 3440 }, { "epoch": 0.5258050960767086, "grad_norm": 0.30634599924087524, "learning_rate": 9.780395670873068e-05, "loss": 0.8275, "step": 3441 }, { "epoch": 0.5259579019750162, "grad_norm": 0.33374178409576416, "learning_rate": 9.775405493433337e-05, "loss": 0.492, "step": 3442 }, { "epoch": 0.5261107078733239, "grad_norm": 0.27607855200767517, "learning_rate": 9.770415371949438e-05, "loss": 0.6481, "step": 3443 }, { "epoch": 0.5262635137716316, "grad_norm": 0.3031352758407593, "learning_rate": 9.765425307664621e-05, "loss": 0.5265, "step": 3444 }, { "epoch": 0.5264163196699393, "grad_norm": 0.3562638461589813, "learning_rate": 9.760435301822125e-05, "loss": 0.7271, "step": 3445 }, { "epoch": 0.526569125568247, "grad_norm": 0.360408753156662, "learning_rate": 9.755445355665155e-05, "loss": 0.7489, "step": 3446 }, { "epoch": 0.5267219314665547, "grad_norm": 0.2757256031036377, "learning_rate": 9.750455470436925e-05, "loss": 0.6827, "step": 3447 }, { "epoch": 0.5268747373648622, "grad_norm": 0.32317423820495605, "learning_rate": 9.745465647380619e-05, "loss": 0.7025, "step": 3448 }, { "epoch": 0.5270275432631699, "grad_norm": 0.3631436824798584, "learning_rate": 9.740475887739416e-05, "loss": 0.5346, "step": 3449 }, { "epoch": 0.5271803491614776, "grad_norm": 0.29940101504325867, "learning_rate": 9.73548619275646e-05, "loss": 0.7169, "step": 3450 }, { "epoch": 0.5273331550597853, "grad_norm": 0.3059080243110657, "learning_rate": 9.7304965636749e-05, "loss": 0.7512, "step": 3451 }, { "epoch": 0.527485960958093, "grad_norm": 0.398517370223999, "learning_rate": 9.725507001737863e-05, "loss": 0.8103, "step": 3452 }, { "epoch": 0.5276387668564007, "grad_norm": 0.26001110672950745, "learning_rate": 9.72051750818845e-05, "loss": 0.8303, "step": 3453 }, { "epoch": 0.5277915727547083, "grad_norm": 0.32580479979515076, "learning_rate": 9.71552808426975e-05, "loss": 0.6707, "step": 3454 }, { "epoch": 0.527944378653016, "grad_norm": 0.3100968599319458, "learning_rate": 9.710538731224843e-05, "loss": 0.8172, "step": 3455 }, { "epoch": 0.5280971845513237, "grad_norm": 0.35659340023994446, "learning_rate": 9.705549450296784e-05, "loss": 0.7393, "step": 3456 }, { "epoch": 0.5282499904496314, "grad_norm": 0.47527024149894714, "learning_rate": 9.700560242728602e-05, "loss": 0.7251, "step": 3457 }, { "epoch": 0.5284027963479391, "grad_norm": 0.27160361409187317, "learning_rate": 9.695571109763326e-05, "loss": 0.6963, "step": 3458 }, { "epoch": 0.5285556022462468, "grad_norm": 0.40073350071907043, "learning_rate": 9.690582052643951e-05, "loss": 0.6446, "step": 3459 }, { "epoch": 0.5287084081445543, "grad_norm": 0.2409697026014328, "learning_rate": 9.685593072613464e-05, "loss": 0.5726, "step": 3460 }, { "epoch": 0.528861214042862, "grad_norm": 0.3380088806152344, "learning_rate": 9.680604170914817e-05, "loss": 0.8136, "step": 3461 }, { "epoch": 0.5290140199411697, "grad_norm": 0.2963113784790039, "learning_rate": 9.675615348790964e-05, "loss": 0.6934, "step": 3462 }, { "epoch": 0.5291668258394774, "grad_norm": 0.2802518904209137, "learning_rate": 9.670626607484826e-05, "loss": 0.5576, "step": 3463 }, { "epoch": 0.5293196317377851, "grad_norm": 0.420153945684433, "learning_rate": 9.665637948239301e-05, "loss": 0.7867, "step": 3464 }, { "epoch": 0.5294724376360928, "grad_norm": 0.3025205433368683, "learning_rate": 9.660649372297272e-05, "loss": 0.7153, "step": 3465 }, { "epoch": 0.5296252435344004, "grad_norm": 0.2623286843299866, "learning_rate": 9.655660880901606e-05, "loss": 0.6243, "step": 3466 }, { "epoch": 0.5297780494327081, "grad_norm": 0.2510450780391693, "learning_rate": 9.650672475295143e-05, "loss": 0.6112, "step": 3467 }, { "epoch": 0.5299308553310158, "grad_norm": 0.34889891743659973, "learning_rate": 9.645684156720697e-05, "loss": 0.691, "step": 3468 }, { "epoch": 0.5300836612293235, "grad_norm": 0.30945080518722534, "learning_rate": 9.64069592642107e-05, "loss": 0.8384, "step": 3469 }, { "epoch": 0.5302364671276312, "grad_norm": 0.3335753083229065, "learning_rate": 9.63570778563904e-05, "loss": 0.5605, "step": 3470 }, { "epoch": 0.5303892730259389, "grad_norm": 0.36615538597106934, "learning_rate": 9.630719735617354e-05, "loss": 0.687, "step": 3471 }, { "epoch": 0.5305420789242464, "grad_norm": 1.3929213285446167, "learning_rate": 9.625731777598746e-05, "loss": 0.7925, "step": 3472 }, { "epoch": 0.5306948848225541, "grad_norm": 0.35153627395629883, "learning_rate": 9.620743912825924e-05, "loss": 0.5368, "step": 3473 }, { "epoch": 0.5308476907208618, "grad_norm": 0.28042492270469666, "learning_rate": 9.615756142541575e-05, "loss": 0.6947, "step": 3474 }, { "epoch": 0.5310004966191695, "grad_norm": 0.22383491694927216, "learning_rate": 9.610768467988356e-05, "loss": 0.6687, "step": 3475 }, { "epoch": 0.5311533025174772, "grad_norm": 0.5090556144714355, "learning_rate": 9.605780890408903e-05, "loss": 0.8305, "step": 3476 }, { "epoch": 0.5313061084157849, "grad_norm": 0.2908128499984741, "learning_rate": 9.600793411045838e-05, "loss": 0.5973, "step": 3477 }, { "epoch": 0.5314589143140925, "grad_norm": 0.3204064667224884, "learning_rate": 9.595806031141739e-05, "loss": 0.7176, "step": 3478 }, { "epoch": 0.5316117202124002, "grad_norm": 0.3861880302429199, "learning_rate": 9.590818751939177e-05, "loss": 0.4478, "step": 3479 }, { "epoch": 0.5317645261107079, "grad_norm": 0.5264634490013123, "learning_rate": 9.585831574680684e-05, "loss": 0.7398, "step": 3480 }, { "epoch": 0.5319173320090156, "grad_norm": 0.32976770401000977, "learning_rate": 9.580844500608782e-05, "loss": 0.7962, "step": 3481 }, { "epoch": 0.5320701379073233, "grad_norm": 0.2593368589878082, "learning_rate": 9.575857530965953e-05, "loss": 0.5612, "step": 3482 }, { "epoch": 0.5322229438056308, "grad_norm": 0.3448527753353119, "learning_rate": 9.570870666994658e-05, "loss": 0.63, "step": 3483 }, { "epoch": 0.5323757497039385, "grad_norm": 0.3122631013393402, "learning_rate": 9.56588390993734e-05, "loss": 0.6706, "step": 3484 }, { "epoch": 0.5325285556022462, "grad_norm": 0.35931363701820374, "learning_rate": 9.560897261036395e-05, "loss": 0.7125, "step": 3485 }, { "epoch": 0.5326813615005539, "grad_norm": 0.28302425146102905, "learning_rate": 9.555910721534214e-05, "loss": 0.6462, "step": 3486 }, { "epoch": 0.5328341673988616, "grad_norm": 0.3118671774864197, "learning_rate": 9.550924292673146e-05, "loss": 0.6675, "step": 3487 }, { "epoch": 0.5329869732971693, "grad_norm": 0.33205705881118774, "learning_rate": 9.545937975695526e-05, "loss": 0.6899, "step": 3488 }, { "epoch": 0.533139779195477, "grad_norm": 0.3029916286468506, "learning_rate": 9.540951771843645e-05, "loss": 0.7862, "step": 3489 }, { "epoch": 0.5332925850937846, "grad_norm": 0.3043176531791687, "learning_rate": 9.535965682359778e-05, "loss": 0.8245, "step": 3490 }, { "epoch": 0.5334453909920923, "grad_norm": 0.3936460316181183, "learning_rate": 9.530979708486162e-05, "loss": 0.9028, "step": 3491 }, { "epoch": 0.5335981968904, "grad_norm": 0.3017941117286682, "learning_rate": 9.525993851465021e-05, "loss": 0.684, "step": 3492 }, { "epoch": 0.5337510027887077, "grad_norm": 0.3985665440559387, "learning_rate": 9.521008112538529e-05, "loss": 0.6063, "step": 3493 }, { "epoch": 0.5339038086870154, "grad_norm": 0.3323298394680023, "learning_rate": 9.516022492948845e-05, "loss": 0.5456, "step": 3494 }, { "epoch": 0.5340566145853229, "grad_norm": 0.31235024333000183, "learning_rate": 9.511036993938097e-05, "loss": 0.5547, "step": 3495 }, { "epoch": 0.5342094204836306, "grad_norm": 0.26063108444213867, "learning_rate": 9.506051616748374e-05, "loss": 0.7367, "step": 3496 }, { "epoch": 0.5343622263819383, "grad_norm": 0.33859163522720337, "learning_rate": 9.501066362621746e-05, "loss": 0.6035, "step": 3497 }, { "epoch": 0.534515032280246, "grad_norm": 0.274844765663147, "learning_rate": 9.496081232800243e-05, "loss": 0.7057, "step": 3498 }, { "epoch": 0.5346678381785537, "grad_norm": 0.2800372540950775, "learning_rate": 9.491096228525876e-05, "loss": 0.5468, "step": 3499 }, { "epoch": 0.5348206440768614, "grad_norm": 0.5205533504486084, "learning_rate": 9.486111351040607e-05, "loss": 0.7171, "step": 3500 }, { "epoch": 0.534973449975169, "grad_norm": 0.3389289081096649, "learning_rate": 9.481126601586385e-05, "loss": 0.6145, "step": 3501 }, { "epoch": 0.5351262558734767, "grad_norm": 0.30579712986946106, "learning_rate": 9.476141981405113e-05, "loss": 0.7139, "step": 3502 }, { "epoch": 0.5352790617717844, "grad_norm": 0.27632755041122437, "learning_rate": 9.471157491738667e-05, "loss": 0.751, "step": 3503 }, { "epoch": 0.5354318676700921, "grad_norm": 0.3189046382904053, "learning_rate": 9.466173133828895e-05, "loss": 0.8163, "step": 3504 }, { "epoch": 0.5355846735683998, "grad_norm": 0.288310170173645, "learning_rate": 9.461188908917605e-05, "loss": 0.6486, "step": 3505 }, { "epoch": 0.5357374794667075, "grad_norm": 0.2973790764808655, "learning_rate": 9.456204818246578e-05, "loss": 0.8716, "step": 3506 }, { "epoch": 0.535890285365015, "grad_norm": 0.2922728359699249, "learning_rate": 9.451220863057551e-05, "loss": 0.708, "step": 3507 }, { "epoch": 0.5360430912633227, "grad_norm": 0.3109127879142761, "learning_rate": 9.446237044592241e-05, "loss": 0.5553, "step": 3508 }, { "epoch": 0.5361958971616304, "grad_norm": 0.27865827083587646, "learning_rate": 9.441253364092326e-05, "loss": 0.8297, "step": 3509 }, { "epoch": 0.5363487030599381, "grad_norm": 0.28346171975135803, "learning_rate": 9.436269822799443e-05, "loss": 0.6596, "step": 3510 }, { "epoch": 0.5365015089582458, "grad_norm": 0.27833786606788635, "learning_rate": 9.431286421955199e-05, "loss": 0.758, "step": 3511 }, { "epoch": 0.5366543148565535, "grad_norm": 0.2874302864074707, "learning_rate": 9.426303162801171e-05, "loss": 0.58, "step": 3512 }, { "epoch": 0.5368071207548611, "grad_norm": 0.3504881262779236, "learning_rate": 9.421320046578896e-05, "loss": 0.557, "step": 3513 }, { "epoch": 0.5369599266531688, "grad_norm": 0.357403039932251, "learning_rate": 9.416337074529873e-05, "loss": 0.719, "step": 3514 }, { "epoch": 0.5371127325514765, "grad_norm": 0.35706785321235657, "learning_rate": 9.411354247895566e-05, "loss": 0.7606, "step": 3515 }, { "epoch": 0.5372655384497842, "grad_norm": 0.2595686912536621, "learning_rate": 9.406371567917411e-05, "loss": 0.8486, "step": 3516 }, { "epoch": 0.5374183443480919, "grad_norm": 0.30613166093826294, "learning_rate": 9.401389035836793e-05, "loss": 0.8481, "step": 3517 }, { "epoch": 0.5375711502463996, "grad_norm": 0.46529725193977356, "learning_rate": 9.396406652895072e-05, "loss": 0.7343, "step": 3518 }, { "epoch": 0.5377239561447071, "grad_norm": 0.34976473450660706, "learning_rate": 9.391424420333569e-05, "loss": 0.6401, "step": 3519 }, { "epoch": 0.5378767620430148, "grad_norm": 0.3024265766143799, "learning_rate": 9.386442339393564e-05, "loss": 0.6997, "step": 3520 }, { "epoch": 0.5380295679413225, "grad_norm": 0.2734369933605194, "learning_rate": 9.381460411316298e-05, "loss": 0.7366, "step": 3521 }, { "epoch": 0.5381823738396302, "grad_norm": 0.28172358870506287, "learning_rate": 9.376478637342976e-05, "loss": 0.8492, "step": 3522 }, { "epoch": 0.5383351797379379, "grad_norm": 0.32786622643470764, "learning_rate": 9.371497018714772e-05, "loss": 0.6493, "step": 3523 }, { "epoch": 0.5384879856362456, "grad_norm": 0.31345850229263306, "learning_rate": 9.366515556672808e-05, "loss": 0.8286, "step": 3524 }, { "epoch": 0.5386407915345532, "grad_norm": 0.3381461501121521, "learning_rate": 9.361534252458175e-05, "loss": 0.8441, "step": 3525 }, { "epoch": 0.5387935974328609, "grad_norm": 0.3015748858451843, "learning_rate": 9.356553107311921e-05, "loss": 0.693, "step": 3526 }, { "epoch": 0.5389464033311686, "grad_norm": 0.44636762142181396, "learning_rate": 9.351572122475065e-05, "loss": 0.6457, "step": 3527 }, { "epoch": 0.5390992092294763, "grad_norm": 0.5325556993484497, "learning_rate": 9.346591299188568e-05, "loss": 0.8459, "step": 3528 }, { "epoch": 0.539252015127784, "grad_norm": 0.2519857585430145, "learning_rate": 9.341610638693363e-05, "loss": 0.5791, "step": 3529 }, { "epoch": 0.5394048210260916, "grad_norm": 0.26468968391418457, "learning_rate": 9.336630142230342e-05, "loss": 0.6978, "step": 3530 }, { "epoch": 0.5395576269243992, "grad_norm": 0.2735024094581604, "learning_rate": 9.331649811040355e-05, "loss": 0.5179, "step": 3531 }, { "epoch": 0.5397104328227069, "grad_norm": 0.2528313398361206, "learning_rate": 9.326669646364205e-05, "loss": 0.6918, "step": 3532 }, { "epoch": 0.5398632387210146, "grad_norm": 0.4224424362182617, "learning_rate": 9.321689649442657e-05, "loss": 0.8484, "step": 3533 }, { "epoch": 0.5400160446193223, "grad_norm": 1.335054874420166, "learning_rate": 9.316709821516449e-05, "loss": 0.7747, "step": 3534 }, { "epoch": 0.54016885051763, "grad_norm": 0.2739188075065613, "learning_rate": 9.311730163826243e-05, "loss": 0.6359, "step": 3535 }, { "epoch": 0.5403216564159377, "grad_norm": 0.39737796783447266, "learning_rate": 9.306750677612693e-05, "loss": 0.767, "step": 3536 }, { "epoch": 0.5404744623142453, "grad_norm": 0.29680559039115906, "learning_rate": 9.301771364116391e-05, "loss": 0.7895, "step": 3537 }, { "epoch": 0.540627268212553, "grad_norm": 0.33182379603385925, "learning_rate": 9.296792224577895e-05, "loss": 0.5614, "step": 3538 }, { "epoch": 0.5407800741108607, "grad_norm": 0.2581312358379364, "learning_rate": 9.291813260237712e-05, "loss": 0.8357, "step": 3539 }, { "epoch": 0.5409328800091684, "grad_norm": 0.27075865864753723, "learning_rate": 9.286834472336311e-05, "loss": 0.5831, "step": 3540 }, { "epoch": 0.5410856859074761, "grad_norm": 0.36913207173347473, "learning_rate": 9.281855862114117e-05, "loss": 0.8358, "step": 3541 }, { "epoch": 0.5412384918057837, "grad_norm": 0.27564701437950134, "learning_rate": 9.276877430811501e-05, "loss": 0.7194, "step": 3542 }, { "epoch": 0.5413912977040913, "grad_norm": 0.42347952723503113, "learning_rate": 9.271899179668807e-05, "loss": 0.816, "step": 3543 }, { "epoch": 0.541544103602399, "grad_norm": 0.3452145755290985, "learning_rate": 9.266921109926318e-05, "loss": 0.6209, "step": 3544 }, { "epoch": 0.5416969095007067, "grad_norm": 0.3281693160533905, "learning_rate": 9.261943222824286e-05, "loss": 0.7045, "step": 3545 }, { "epoch": 0.5418497153990144, "grad_norm": 0.29575106501579285, "learning_rate": 9.2569655196029e-05, "loss": 0.588, "step": 3546 }, { "epoch": 0.5420025212973221, "grad_norm": 0.2839337885379791, "learning_rate": 9.251988001502317e-05, "loss": 0.6298, "step": 3547 }, { "epoch": 0.5421553271956298, "grad_norm": 0.263030469417572, "learning_rate": 9.24701066976265e-05, "loss": 0.8087, "step": 3548 }, { "epoch": 0.5423081330939374, "grad_norm": 0.3667827844619751, "learning_rate": 9.242033525623946e-05, "loss": 0.6128, "step": 3549 }, { "epoch": 0.5424609389922451, "grad_norm": 0.2590767741203308, "learning_rate": 9.237056570326231e-05, "loss": 0.6672, "step": 3550 }, { "epoch": 0.5426137448905528, "grad_norm": 0.3932031989097595, "learning_rate": 9.232079805109467e-05, "loss": 0.6827, "step": 3551 }, { "epoch": 0.5427665507888605, "grad_norm": 0.41681838035583496, "learning_rate": 9.227103231213575e-05, "loss": 0.8258, "step": 3552 }, { "epoch": 0.5429193566871682, "grad_norm": 0.2821168303489685, "learning_rate": 9.222126849878421e-05, "loss": 0.7343, "step": 3553 }, { "epoch": 0.5430721625854757, "grad_norm": 0.3376697301864624, "learning_rate": 9.217150662343835e-05, "loss": 0.6614, "step": 3554 }, { "epoch": 0.5432249684837834, "grad_norm": 0.28673434257507324, "learning_rate": 9.212174669849593e-05, "loss": 0.655, "step": 3555 }, { "epoch": 0.5433777743820911, "grad_norm": 0.8690926432609558, "learning_rate": 9.207198873635414e-05, "loss": 0.744, "step": 3556 }, { "epoch": 0.5435305802803988, "grad_norm": 0.2887100577354431, "learning_rate": 9.202223274940981e-05, "loss": 0.8725, "step": 3557 }, { "epoch": 0.5436833861787065, "grad_norm": 0.276275634765625, "learning_rate": 9.197247875005923e-05, "loss": 0.794, "step": 3558 }, { "epoch": 0.5438361920770142, "grad_norm": 0.5013990998268127, "learning_rate": 9.192272675069821e-05, "loss": 0.6538, "step": 3559 }, { "epoch": 0.5439889979753219, "grad_norm": 0.3582007586956024, "learning_rate": 9.1872976763722e-05, "loss": 0.801, "step": 3560 }, { "epoch": 0.5441418038736295, "grad_norm": 0.28688696026802063, "learning_rate": 9.182322880152539e-05, "loss": 0.8727, "step": 3561 }, { "epoch": 0.5442946097719372, "grad_norm": 0.9655963182449341, "learning_rate": 9.177348287650273e-05, "loss": 0.6883, "step": 3562 }, { "epoch": 0.5444474156702449, "grad_norm": 0.3847043812274933, "learning_rate": 9.172373900104774e-05, "loss": 0.6851, "step": 3563 }, { "epoch": 0.5446002215685526, "grad_norm": 2.0926127433776855, "learning_rate": 9.167399718755366e-05, "loss": 0.6177, "step": 3564 }, { "epoch": 0.5447530274668603, "grad_norm": 1.1532070636749268, "learning_rate": 9.162425744841333e-05, "loss": 0.8987, "step": 3565 }, { "epoch": 0.5449058333651678, "grad_norm": 0.31538552045822144, "learning_rate": 9.157451979601896e-05, "loss": 0.6536, "step": 3566 }, { "epoch": 0.5450586392634755, "grad_norm": 0.2628517746925354, "learning_rate": 9.152478424276226e-05, "loss": 0.6453, "step": 3567 }, { "epoch": 0.5452114451617832, "grad_norm": 0.3210277557373047, "learning_rate": 9.147505080103437e-05, "loss": 0.6057, "step": 3568 }, { "epoch": 0.5453642510600909, "grad_norm": 0.2707095146179199, "learning_rate": 9.142531948322605e-05, "loss": 0.6254, "step": 3569 }, { "epoch": 0.5455170569583986, "grad_norm": 0.3149011433124542, "learning_rate": 9.137559030172742e-05, "loss": 0.5751, "step": 3570 }, { "epoch": 0.5456698628567063, "grad_norm": 0.25553005933761597, "learning_rate": 9.132586326892805e-05, "loss": 0.6009, "step": 3571 }, { "epoch": 0.545822668755014, "grad_norm": 0.32813313603401184, "learning_rate": 9.1276138397217e-05, "loss": 0.551, "step": 3572 }, { "epoch": 0.5459754746533216, "grad_norm": 0.33819863200187683, "learning_rate": 9.12264156989829e-05, "loss": 0.6935, "step": 3573 }, { "epoch": 0.5461282805516293, "grad_norm": 0.2711593210697174, "learning_rate": 9.117669518661366e-05, "loss": 0.6271, "step": 3574 }, { "epoch": 0.546281086449937, "grad_norm": 0.3292696475982666, "learning_rate": 9.112697687249673e-05, "loss": 0.7504, "step": 3575 }, { "epoch": 0.5464338923482447, "grad_norm": 0.31159183382987976, "learning_rate": 9.107726076901903e-05, "loss": 0.5733, "step": 3576 }, { "epoch": 0.5465866982465524, "grad_norm": 0.29188716411590576, "learning_rate": 9.102754688856694e-05, "loss": 0.6164, "step": 3577 }, { "epoch": 0.54673950414486, "grad_norm": 0.3371030390262604, "learning_rate": 9.09778352435262e-05, "loss": 0.8755, "step": 3578 }, { "epoch": 0.5468923100431676, "grad_norm": 0.34226492047309875, "learning_rate": 9.092812584628208e-05, "loss": 0.7217, "step": 3579 }, { "epoch": 0.5470451159414753, "grad_norm": 0.2898171544075012, "learning_rate": 9.08784187092193e-05, "loss": 0.6281, "step": 3580 }, { "epoch": 0.547197921839783, "grad_norm": 0.34746459126472473, "learning_rate": 9.082871384472186e-05, "loss": 0.8541, "step": 3581 }, { "epoch": 0.5473507277380907, "grad_norm": 0.3657127916812897, "learning_rate": 9.077901126517341e-05, "loss": 0.79, "step": 3582 }, { "epoch": 0.5475035336363984, "grad_norm": 0.27212727069854736, "learning_rate": 9.072931098295687e-05, "loss": 0.8048, "step": 3583 }, { "epoch": 0.547656339534706, "grad_norm": 0.49914315342903137, "learning_rate": 9.067961301045472e-05, "loss": 0.5319, "step": 3584 }, { "epoch": 0.5478091454330137, "grad_norm": 0.31453418731689453, "learning_rate": 9.062991736004874e-05, "loss": 0.7725, "step": 3585 }, { "epoch": 0.5479619513313214, "grad_norm": 0.2770235538482666, "learning_rate": 9.058022404412019e-05, "loss": 0.7344, "step": 3586 }, { "epoch": 0.5481147572296291, "grad_norm": 0.29153865575790405, "learning_rate": 9.053053307504978e-05, "loss": 0.4709, "step": 3587 }, { "epoch": 0.5482675631279368, "grad_norm": 0.3256016969680786, "learning_rate": 9.04808444652175e-05, "loss": 0.6212, "step": 3588 }, { "epoch": 0.5484203690262444, "grad_norm": 0.2777874171733856, "learning_rate": 9.043115822700294e-05, "loss": 0.8251, "step": 3589 }, { "epoch": 0.548573174924552, "grad_norm": 0.37808412313461304, "learning_rate": 9.038147437278498e-05, "loss": 0.7221, "step": 3590 }, { "epoch": 0.5487259808228597, "grad_norm": 0.33841803669929504, "learning_rate": 9.0331792914942e-05, "loss": 0.7242, "step": 3591 }, { "epoch": 0.5488787867211674, "grad_norm": 0.4070587158203125, "learning_rate": 9.028211386585158e-05, "loss": 0.6671, "step": 3592 }, { "epoch": 0.5490315926194751, "grad_norm": 0.32144245505332947, "learning_rate": 9.023243723789095e-05, "loss": 0.7437, "step": 3593 }, { "epoch": 0.5491843985177828, "grad_norm": 0.3725501000881195, "learning_rate": 9.018276304343661e-05, "loss": 0.9447, "step": 3594 }, { "epoch": 0.5493372044160905, "grad_norm": 0.287739098072052, "learning_rate": 9.013309129486442e-05, "loss": 0.8444, "step": 3595 }, { "epoch": 0.5494900103143981, "grad_norm": 0.3222897946834564, "learning_rate": 9.00834220045497e-05, "loss": 0.8743, "step": 3596 }, { "epoch": 0.5496428162127058, "grad_norm": 0.29483258724212646, "learning_rate": 9.003375518486717e-05, "loss": 0.6778, "step": 3597 }, { "epoch": 0.5497956221110135, "grad_norm": 0.31444084644317627, "learning_rate": 8.998409084819088e-05, "loss": 0.6698, "step": 3598 }, { "epoch": 0.5499484280093212, "grad_norm": 0.935632050037384, "learning_rate": 8.993442900689426e-05, "loss": 0.735, "step": 3599 }, { "epoch": 0.5501012339076289, "grad_norm": 0.3364983797073364, "learning_rate": 8.988476967335015e-05, "loss": 0.7976, "step": 3600 }, { "epoch": 0.5502540398059365, "grad_norm": 0.2456827610731125, "learning_rate": 8.983511285993077e-05, "loss": 0.651, "step": 3601 }, { "epoch": 0.5504068457042441, "grad_norm": 0.42094314098358154, "learning_rate": 8.978545857900774e-05, "loss": 0.8525, "step": 3602 }, { "epoch": 0.5505596516025518, "grad_norm": 0.3060030937194824, "learning_rate": 8.973580684295191e-05, "loss": 0.5432, "step": 3603 }, { "epoch": 0.5507124575008595, "grad_norm": 0.3308151066303253, "learning_rate": 8.968615766413367e-05, "loss": 0.7881, "step": 3604 }, { "epoch": 0.5508652633991672, "grad_norm": 0.2703869938850403, "learning_rate": 8.963651105492267e-05, "loss": 0.7023, "step": 3605 }, { "epoch": 0.5510180692974749, "grad_norm": 0.24846504628658295, "learning_rate": 8.958686702768796e-05, "loss": 0.513, "step": 3606 }, { "epoch": 0.5511708751957826, "grad_norm": 0.2808684706687927, "learning_rate": 8.953722559479788e-05, "loss": 0.7579, "step": 3607 }, { "epoch": 0.5513236810940902, "grad_norm": 0.36039602756500244, "learning_rate": 8.948758676862023e-05, "loss": 0.6608, "step": 3608 }, { "epoch": 0.5514764869923979, "grad_norm": 0.24951785802841187, "learning_rate": 8.943795056152213e-05, "loss": 0.6244, "step": 3609 }, { "epoch": 0.5516292928907056, "grad_norm": 0.2586328983306885, "learning_rate": 8.938831698586993e-05, "loss": 0.5952, "step": 3610 }, { "epoch": 0.5517820987890133, "grad_norm": 0.2756107449531555, "learning_rate": 8.933868605402951e-05, "loss": 0.5698, "step": 3611 }, { "epoch": 0.551934904687321, "grad_norm": 0.28072118759155273, "learning_rate": 8.928905777836599e-05, "loss": 0.5509, "step": 3612 }, { "epoch": 0.5520877105856286, "grad_norm": 0.3419652581214905, "learning_rate": 8.923943217124377e-05, "loss": 0.6476, "step": 3613 }, { "epoch": 0.5522405164839362, "grad_norm": 0.28130725026130676, "learning_rate": 8.918980924502669e-05, "loss": 0.613, "step": 3614 }, { "epoch": 0.5523933223822439, "grad_norm": 0.34239283204078674, "learning_rate": 8.914018901207791e-05, "loss": 0.738, "step": 3615 }, { "epoch": 0.5525461282805516, "grad_norm": 0.2802269458770752, "learning_rate": 8.909057148475991e-05, "loss": 0.5347, "step": 3616 }, { "epoch": 0.5526989341788593, "grad_norm": 0.35925936698913574, "learning_rate": 8.904095667543442e-05, "loss": 0.7487, "step": 3617 }, { "epoch": 0.552851740077167, "grad_norm": 0.2965247631072998, "learning_rate": 8.899134459646257e-05, "loss": 0.4349, "step": 3618 }, { "epoch": 0.5530045459754747, "grad_norm": 0.2840178608894348, "learning_rate": 8.894173526020483e-05, "loss": 0.7946, "step": 3619 }, { "epoch": 0.5531573518737823, "grad_norm": 0.2992875576019287, "learning_rate": 8.889212867902092e-05, "loss": 0.7204, "step": 3620 }, { "epoch": 0.55331015777209, "grad_norm": 0.4619637429714203, "learning_rate": 8.88425248652699e-05, "loss": 0.8135, "step": 3621 }, { "epoch": 0.5534629636703977, "grad_norm": 0.2635784447193146, "learning_rate": 8.879292383131012e-05, "loss": 0.7223, "step": 3622 }, { "epoch": 0.5536157695687054, "grad_norm": 0.29897844791412354, "learning_rate": 8.874332558949933e-05, "loss": 0.5591, "step": 3623 }, { "epoch": 0.5537685754670131, "grad_norm": 0.4032471776008606, "learning_rate": 8.869373015219448e-05, "loss": 0.4957, "step": 3624 }, { "epoch": 0.5539213813653207, "grad_norm": 0.33153533935546875, "learning_rate": 8.864413753175183e-05, "loss": 0.699, "step": 3625 }, { "epoch": 0.5540741872636283, "grad_norm": 0.2981371283531189, "learning_rate": 8.859454774052705e-05, "loss": 0.7621, "step": 3626 }, { "epoch": 0.554226993161936, "grad_norm": 0.34244444966316223, "learning_rate": 8.854496079087489e-05, "loss": 0.623, "step": 3627 }, { "epoch": 0.5543797990602437, "grad_norm": 0.2774951756000519, "learning_rate": 8.849537669514963e-05, "loss": 0.7215, "step": 3628 }, { "epoch": 0.5545326049585514, "grad_norm": 0.35149478912353516, "learning_rate": 8.844579546570466e-05, "loss": 0.7571, "step": 3629 }, { "epoch": 0.5546854108568591, "grad_norm": 0.2874681055545807, "learning_rate": 8.839621711489278e-05, "loss": 0.8364, "step": 3630 }, { "epoch": 0.5548382167551668, "grad_norm": 0.34419891238212585, "learning_rate": 8.834664165506602e-05, "loss": 0.5213, "step": 3631 }, { "epoch": 0.5549910226534744, "grad_norm": 0.24731077253818512, "learning_rate": 8.829706909857564e-05, "loss": 0.7184, "step": 3632 }, { "epoch": 0.5551438285517821, "grad_norm": 0.31164079904556274, "learning_rate": 8.824749945777231e-05, "loss": 0.8348, "step": 3633 }, { "epoch": 0.5552966344500898, "grad_norm": 0.2927907705307007, "learning_rate": 8.819793274500577e-05, "loss": 0.6894, "step": 3634 }, { "epoch": 0.5554494403483975, "grad_norm": 0.32291513681411743, "learning_rate": 8.814836897262524e-05, "loss": 0.7885, "step": 3635 }, { "epoch": 0.5556022462467052, "grad_norm": 0.2853772044181824, "learning_rate": 8.80988081529791e-05, "loss": 0.6697, "step": 3636 }, { "epoch": 0.5557550521450128, "grad_norm": 0.35197320580482483, "learning_rate": 8.804925029841503e-05, "loss": 0.6366, "step": 3637 }, { "epoch": 0.5559078580433204, "grad_norm": 0.2681580185890198, "learning_rate": 8.79996954212799e-05, "loss": 0.804, "step": 3638 }, { "epoch": 0.5560606639416281, "grad_norm": 0.3117936849594116, "learning_rate": 8.795014353391992e-05, "loss": 0.7763, "step": 3639 }, { "epoch": 0.5562134698399358, "grad_norm": 0.4075622856616974, "learning_rate": 8.790059464868052e-05, "loss": 0.6972, "step": 3640 }, { "epoch": 0.5563662757382435, "grad_norm": 0.2932533919811249, "learning_rate": 8.785104877790646e-05, "loss": 0.7157, "step": 3641 }, { "epoch": 0.5565190816365512, "grad_norm": 0.3137199282646179, "learning_rate": 8.780150593394155e-05, "loss": 0.5992, "step": 3642 }, { "epoch": 0.5566718875348589, "grad_norm": 0.31674298644065857, "learning_rate": 8.775196612912906e-05, "loss": 0.5875, "step": 3643 }, { "epoch": 0.5568246934331665, "grad_norm": 0.31617283821105957, "learning_rate": 8.770242937581142e-05, "loss": 0.6944, "step": 3644 }, { "epoch": 0.5569774993314742, "grad_norm": 0.25456005334854126, "learning_rate": 8.765289568633023e-05, "loss": 0.6138, "step": 3645 }, { "epoch": 0.5571303052297819, "grad_norm": 0.35206279158592224, "learning_rate": 8.760336507302645e-05, "loss": 0.6077, "step": 3646 }, { "epoch": 0.5572831111280896, "grad_norm": 0.2751142382621765, "learning_rate": 8.755383754824021e-05, "loss": 0.545, "step": 3647 }, { "epoch": 0.5574359170263972, "grad_norm": 0.30307304859161377, "learning_rate": 8.750431312431088e-05, "loss": 0.793, "step": 3648 }, { "epoch": 0.5575887229247048, "grad_norm": 0.3258976936340332, "learning_rate": 8.745479181357702e-05, "loss": 0.8169, "step": 3649 }, { "epoch": 0.5577415288230125, "grad_norm": 0.34108075499534607, "learning_rate": 8.740527362837649e-05, "loss": 0.6695, "step": 3650 }, { "epoch": 0.5578943347213202, "grad_norm": 0.2840404510498047, "learning_rate": 8.735575858104632e-05, "loss": 0.6967, "step": 3651 }, { "epoch": 0.5580471406196279, "grad_norm": 0.2926589548587799, "learning_rate": 8.730624668392274e-05, "loss": 0.6988, "step": 3652 }, { "epoch": 0.5581999465179356, "grad_norm": 0.2742522358894348, "learning_rate": 8.725673794934122e-05, "loss": 0.5609, "step": 3653 }, { "epoch": 0.5583527524162433, "grad_norm": 0.29250505566596985, "learning_rate": 8.720723238963651e-05, "loss": 0.8384, "step": 3654 }, { "epoch": 0.558505558314551, "grad_norm": 0.297107458114624, "learning_rate": 8.715773001714247e-05, "loss": 0.7247, "step": 3655 }, { "epoch": 0.5586583642128586, "grad_norm": 0.2704477906227112, "learning_rate": 8.710823084419217e-05, "loss": 0.7114, "step": 3656 }, { "epoch": 0.5588111701111663, "grad_norm": 0.4120960235595703, "learning_rate": 8.705873488311793e-05, "loss": 0.781, "step": 3657 }, { "epoch": 0.558963976009474, "grad_norm": 0.293628066778183, "learning_rate": 8.70092421462513e-05, "loss": 0.8205, "step": 3658 }, { "epoch": 0.5591167819077817, "grad_norm": 0.43009713292121887, "learning_rate": 8.695975264592293e-05, "loss": 0.6756, "step": 3659 }, { "epoch": 0.5592695878060893, "grad_norm": 0.3064223825931549, "learning_rate": 8.691026639446269e-05, "loss": 0.6319, "step": 3660 }, { "epoch": 0.559422393704397, "grad_norm": 0.6732028722763062, "learning_rate": 8.686078340419973e-05, "loss": 0.6336, "step": 3661 }, { "epoch": 0.5595751996027046, "grad_norm": 0.28670135140419006, "learning_rate": 8.68113036874623e-05, "loss": 0.6219, "step": 3662 }, { "epoch": 0.5597280055010123, "grad_norm": 0.332643061876297, "learning_rate": 8.676182725657783e-05, "loss": 0.6776, "step": 3663 }, { "epoch": 0.55988081139932, "grad_norm": 0.34815049171447754, "learning_rate": 8.671235412387296e-05, "loss": 0.621, "step": 3664 }, { "epoch": 0.5600336172976277, "grad_norm": 0.36170151829719543, "learning_rate": 8.666288430167356e-05, "loss": 0.841, "step": 3665 }, { "epoch": 0.5601864231959354, "grad_norm": 0.2729724049568176, "learning_rate": 8.661341780230456e-05, "loss": 0.7348, "step": 3666 }, { "epoch": 0.560339229094243, "grad_norm": 0.3887978196144104, "learning_rate": 8.656395463809014e-05, "loss": 0.7487, "step": 3667 }, { "epoch": 0.5604920349925507, "grad_norm": 0.29915356636047363, "learning_rate": 8.651449482135362e-05, "loss": 0.6618, "step": 3668 }, { "epoch": 0.5606448408908584, "grad_norm": 0.35995975136756897, "learning_rate": 8.646503836441755e-05, "loss": 0.6133, "step": 3669 }, { "epoch": 0.5607976467891661, "grad_norm": 0.3829162120819092, "learning_rate": 8.641558527960354e-05, "loss": 0.5826, "step": 3670 }, { "epoch": 0.5609504526874738, "grad_norm": 0.34606584906578064, "learning_rate": 8.63661355792324e-05, "loss": 0.8346, "step": 3671 }, { "epoch": 0.5611032585857814, "grad_norm": 0.25087451934814453, "learning_rate": 8.631668927562421e-05, "loss": 0.71, "step": 3672 }, { "epoch": 0.561256064484089, "grad_norm": 0.3157608211040497, "learning_rate": 8.626724638109796e-05, "loss": 0.6984, "step": 3673 }, { "epoch": 0.5614088703823967, "grad_norm": 0.33677148818969727, "learning_rate": 8.6217806907972e-05, "loss": 0.541, "step": 3674 }, { "epoch": 0.5615616762807044, "grad_norm": 0.28932487964630127, "learning_rate": 8.616837086856377e-05, "loss": 0.819, "step": 3675 }, { "epoch": 0.5617144821790121, "grad_norm": 0.3011149764060974, "learning_rate": 8.611893827518987e-05, "loss": 0.6822, "step": 3676 }, { "epoch": 0.5618672880773198, "grad_norm": 0.3271982669830322, "learning_rate": 8.606950914016593e-05, "loss": 0.6524, "step": 3677 }, { "epoch": 0.5620200939756275, "grad_norm": 0.291826993227005, "learning_rate": 8.602008347580685e-05, "loss": 0.6323, "step": 3678 }, { "epoch": 0.5621728998739352, "grad_norm": 0.2812834680080414, "learning_rate": 8.597066129442663e-05, "loss": 0.6507, "step": 3679 }, { "epoch": 0.5623257057722428, "grad_norm": 0.35509222745895386, "learning_rate": 8.59212426083384e-05, "loss": 0.8128, "step": 3680 }, { "epoch": 0.5624785116705505, "grad_norm": 0.26385602355003357, "learning_rate": 8.587182742985439e-05, "loss": 0.5526, "step": 3681 }, { "epoch": 0.5626313175688582, "grad_norm": 0.26334047317504883, "learning_rate": 8.582241577128596e-05, "loss": 0.7467, "step": 3682 }, { "epoch": 0.5627841234671659, "grad_norm": 0.4938865303993225, "learning_rate": 8.577300764494369e-05, "loss": 0.7191, "step": 3683 }, { "epoch": 0.5629369293654735, "grad_norm": 0.2696554958820343, "learning_rate": 8.572360306313706e-05, "loss": 0.6709, "step": 3684 }, { "epoch": 0.5630897352637811, "grad_norm": 0.36236676573753357, "learning_rate": 8.567420203817492e-05, "loss": 0.6558, "step": 3685 }, { "epoch": 0.5632425411620888, "grad_norm": 0.27699196338653564, "learning_rate": 8.562480458236507e-05, "loss": 0.7543, "step": 3686 }, { "epoch": 0.5633953470603965, "grad_norm": 0.2756933569908142, "learning_rate": 8.557541070801455e-05, "loss": 0.6575, "step": 3687 }, { "epoch": 0.5635481529587042, "grad_norm": 0.3750994801521301, "learning_rate": 8.55260204274293e-05, "loss": 0.6176, "step": 3688 }, { "epoch": 0.5637009588570119, "grad_norm": 0.4120349884033203, "learning_rate": 8.547663375291459e-05, "loss": 0.6742, "step": 3689 }, { "epoch": 0.5638537647553196, "grad_norm": 0.2664812207221985, "learning_rate": 8.54272506967747e-05, "loss": 0.5904, "step": 3690 }, { "epoch": 0.5640065706536272, "grad_norm": 0.2847195565700531, "learning_rate": 8.537787127131292e-05, "loss": 0.7125, "step": 3691 }, { "epoch": 0.5641593765519349, "grad_norm": 0.27091795206069946, "learning_rate": 8.532849548883179e-05, "loss": 0.789, "step": 3692 }, { "epoch": 0.5643121824502426, "grad_norm": 0.3841065764427185, "learning_rate": 8.527912336163283e-05, "loss": 0.7313, "step": 3693 }, { "epoch": 0.5644649883485503, "grad_norm": 0.3322576582431793, "learning_rate": 8.522975490201677e-05, "loss": 0.6777, "step": 3694 }, { "epoch": 0.5646177942468579, "grad_norm": 0.26073184609413147, "learning_rate": 8.518039012228324e-05, "loss": 0.7309, "step": 3695 }, { "epoch": 0.5647706001451656, "grad_norm": 0.28975701332092285, "learning_rate": 8.513102903473113e-05, "loss": 0.6413, "step": 3696 }, { "epoch": 0.5649234060434732, "grad_norm": 0.25856003165245056, "learning_rate": 8.508167165165834e-05, "loss": 0.5668, "step": 3697 }, { "epoch": 0.5650762119417809, "grad_norm": 0.3022193908691406, "learning_rate": 8.50323179853618e-05, "loss": 0.663, "step": 3698 }, { "epoch": 0.5652290178400886, "grad_norm": 0.2755308449268341, "learning_rate": 8.498296804813759e-05, "loss": 0.6749, "step": 3699 }, { "epoch": 0.5653818237383963, "grad_norm": 0.30388593673706055, "learning_rate": 8.493362185228086e-05, "loss": 0.6675, "step": 3700 }, { "epoch": 0.565534629636704, "grad_norm": 0.3053312599658966, "learning_rate": 8.488427941008578e-05, "loss": 0.8176, "step": 3701 }, { "epoch": 0.5656874355350117, "grad_norm": 0.36438342928886414, "learning_rate": 8.483494073384557e-05, "loss": 0.6879, "step": 3702 }, { "epoch": 0.5658402414333193, "grad_norm": 0.30132386088371277, "learning_rate": 8.478560583585258e-05, "loss": 0.7079, "step": 3703 }, { "epoch": 0.565993047331627, "grad_norm": 0.2842436134815216, "learning_rate": 8.47362747283982e-05, "loss": 0.5923, "step": 3704 }, { "epoch": 0.5661458532299347, "grad_norm": 0.33494096994400024, "learning_rate": 8.468694742377284e-05, "loss": 0.6465, "step": 3705 }, { "epoch": 0.5662986591282424, "grad_norm": 0.4333784580230713, "learning_rate": 8.463762393426596e-05, "loss": 0.8571, "step": 3706 }, { "epoch": 0.56645146502655, "grad_norm": 0.29764246940612793, "learning_rate": 8.458830427216615e-05, "loss": 0.6411, "step": 3707 }, { "epoch": 0.5666042709248577, "grad_norm": 0.2814170718193054, "learning_rate": 8.453898844976098e-05, "loss": 0.7452, "step": 3708 }, { "epoch": 0.5667570768231653, "grad_norm": 0.33996063470840454, "learning_rate": 8.448967647933702e-05, "loss": 0.6065, "step": 3709 }, { "epoch": 0.566909882721473, "grad_norm": 0.3823285698890686, "learning_rate": 8.444036837317995e-05, "loss": 0.7324, "step": 3710 }, { "epoch": 0.5670626886197807, "grad_norm": 0.37926560640335083, "learning_rate": 8.439106414357455e-05, "loss": 0.6082, "step": 3711 }, { "epoch": 0.5672154945180884, "grad_norm": 0.3851792812347412, "learning_rate": 8.434176380280445e-05, "loss": 0.7381, "step": 3712 }, { "epoch": 0.5673683004163961, "grad_norm": 0.3026863932609558, "learning_rate": 8.429246736315248e-05, "loss": 0.6545, "step": 3713 }, { "epoch": 0.5675211063147038, "grad_norm": 0.3653876781463623, "learning_rate": 8.424317483690037e-05, "loss": 0.8296, "step": 3714 }, { "epoch": 0.5676739122130114, "grad_norm": 0.30067160725593567, "learning_rate": 8.419388623632905e-05, "loss": 0.6965, "step": 3715 }, { "epoch": 0.5678267181113191, "grad_norm": 0.3310571610927582, "learning_rate": 8.414460157371825e-05, "loss": 0.6493, "step": 3716 }, { "epoch": 0.5679795240096268, "grad_norm": 0.3468477725982666, "learning_rate": 8.409532086134688e-05, "loss": 0.6634, "step": 3717 }, { "epoch": 0.5681323299079345, "grad_norm": 0.29771387577056885, "learning_rate": 8.40460441114928e-05, "loss": 0.5265, "step": 3718 }, { "epoch": 0.5682851358062421, "grad_norm": 0.4328177571296692, "learning_rate": 8.399677133643294e-05, "loss": 0.7287, "step": 3719 }, { "epoch": 0.5684379417045498, "grad_norm": 0.3208015263080597, "learning_rate": 8.394750254844314e-05, "loss": 0.7538, "step": 3720 }, { "epoch": 0.5685907476028574, "grad_norm": 0.27956128120422363, "learning_rate": 8.389823775979833e-05, "loss": 0.608, "step": 3721 }, { "epoch": 0.5687435535011651, "grad_norm": 0.2791298031806946, "learning_rate": 8.384897698277246e-05, "loss": 0.6882, "step": 3722 }, { "epoch": 0.5688963593994728, "grad_norm": 0.28099286556243896, "learning_rate": 8.379972022963835e-05, "loss": 0.7346, "step": 3723 }, { "epoch": 0.5690491652977805, "grad_norm": 0.27625101804733276, "learning_rate": 8.375046751266797e-05, "loss": 0.6558, "step": 3724 }, { "epoch": 0.5692019711960882, "grad_norm": 0.26803532242774963, "learning_rate": 8.37012188441322e-05, "loss": 0.6644, "step": 3725 }, { "epoch": 0.5693547770943959, "grad_norm": 0.28423699736595154, "learning_rate": 8.365197423630097e-05, "loss": 0.7188, "step": 3726 }, { "epoch": 0.5695075829927035, "grad_norm": 0.3047555387020111, "learning_rate": 8.36027337014431e-05, "loss": 0.7798, "step": 3727 }, { "epoch": 0.5696603888910112, "grad_norm": 0.27898523211479187, "learning_rate": 8.355349725182651e-05, "loss": 0.6601, "step": 3728 }, { "epoch": 0.5698131947893189, "grad_norm": 0.2902171015739441, "learning_rate": 8.350426489971802e-05, "loss": 0.7398, "step": 3729 }, { "epoch": 0.5699660006876266, "grad_norm": 0.3205011487007141, "learning_rate": 8.345503665738343e-05, "loss": 0.8977, "step": 3730 }, { "epoch": 0.5701188065859342, "grad_norm": 0.2823057770729065, "learning_rate": 8.340581253708759e-05, "loss": 0.6605, "step": 3731 }, { "epoch": 0.5702716124842419, "grad_norm": 0.2640000283718109, "learning_rate": 8.335659255109424e-05, "loss": 0.704, "step": 3732 }, { "epoch": 0.5704244183825495, "grad_norm": 0.32330089807510376, "learning_rate": 8.330737671166622e-05, "loss": 0.6702, "step": 3733 }, { "epoch": 0.5705772242808572, "grad_norm": 0.29183852672576904, "learning_rate": 8.32581650310651e-05, "loss": 0.7842, "step": 3734 }, { "epoch": 0.5707300301791649, "grad_norm": 0.28813636302948, "learning_rate": 8.320895752155165e-05, "loss": 0.6337, "step": 3735 }, { "epoch": 0.5708828360774726, "grad_norm": 0.265868604183197, "learning_rate": 8.315975419538551e-05, "loss": 0.7946, "step": 3736 }, { "epoch": 0.5710356419757803, "grad_norm": 0.2624013423919678, "learning_rate": 8.311055506482522e-05, "loss": 0.5877, "step": 3737 }, { "epoch": 0.571188447874088, "grad_norm": 0.5272555947303772, "learning_rate": 8.306136014212836e-05, "loss": 0.7125, "step": 3738 }, { "epoch": 0.5713412537723956, "grad_norm": 0.30128014087677, "learning_rate": 8.301216943955143e-05, "loss": 0.7108, "step": 3739 }, { "epoch": 0.5714940596707033, "grad_norm": 0.26696425676345825, "learning_rate": 8.296298296934993e-05, "loss": 0.6307, "step": 3740 }, { "epoch": 0.571646865569011, "grad_norm": 0.3078870177268982, "learning_rate": 8.291380074377815e-05, "loss": 0.6569, "step": 3741 }, { "epoch": 0.5717996714673187, "grad_norm": 0.3038552403450012, "learning_rate": 8.286462277508951e-05, "loss": 0.6657, "step": 3742 }, { "epoch": 0.5719524773656263, "grad_norm": 0.3003843426704407, "learning_rate": 8.281544907553629e-05, "loss": 0.7251, "step": 3743 }, { "epoch": 0.572105283263934, "grad_norm": 0.31399810314178467, "learning_rate": 8.276627965736968e-05, "loss": 0.8504, "step": 3744 }, { "epoch": 0.5722580891622416, "grad_norm": 0.3033444881439209, "learning_rate": 8.271711453283978e-05, "loss": 0.8417, "step": 3745 }, { "epoch": 0.5724108950605493, "grad_norm": 0.325181782245636, "learning_rate": 8.266795371419574e-05, "loss": 0.5664, "step": 3746 }, { "epoch": 0.572563700958857, "grad_norm": 0.31936803460121155, "learning_rate": 8.261879721368558e-05, "loss": 0.6776, "step": 3747 }, { "epoch": 0.5727165068571647, "grad_norm": 0.34658119082450867, "learning_rate": 8.256964504355617e-05, "loss": 0.8581, "step": 3748 }, { "epoch": 0.5728693127554724, "grad_norm": 0.288990318775177, "learning_rate": 8.252049721605335e-05, "loss": 0.6763, "step": 3749 }, { "epoch": 0.57302211865378, "grad_norm": 0.3504142165184021, "learning_rate": 8.247135374342196e-05, "loss": 0.7964, "step": 3750 }, { "epoch": 0.5731749245520877, "grad_norm": 0.3110313415527344, "learning_rate": 8.242221463790565e-05, "loss": 0.7416, "step": 3751 }, { "epoch": 0.5733277304503954, "grad_norm": 0.28872978687286377, "learning_rate": 8.237307991174697e-05, "loss": 0.6734, "step": 3752 }, { "epoch": 0.5734805363487031, "grad_norm": 0.24102354049682617, "learning_rate": 8.232394957718749e-05, "loss": 0.8467, "step": 3753 }, { "epoch": 0.5736333422470107, "grad_norm": 0.28960511088371277, "learning_rate": 8.227482364646762e-05, "loss": 0.7903, "step": 3754 }, { "epoch": 0.5737861481453184, "grad_norm": 0.28464069962501526, "learning_rate": 8.222570213182662e-05, "loss": 0.8631, "step": 3755 }, { "epoch": 0.573938954043626, "grad_norm": 0.3860986828804016, "learning_rate": 8.217658504550272e-05, "loss": 0.8208, "step": 3756 }, { "epoch": 0.5740917599419337, "grad_norm": 0.29846811294555664, "learning_rate": 8.212747239973306e-05, "loss": 0.7068, "step": 3757 }, { "epoch": 0.5742445658402414, "grad_norm": 0.37197667360305786, "learning_rate": 8.207836420675365e-05, "loss": 0.7763, "step": 3758 }, { "epoch": 0.5743973717385491, "grad_norm": 0.3524726927280426, "learning_rate": 8.202926047879933e-05, "loss": 0.6656, "step": 3759 }, { "epoch": 0.5745501776368568, "grad_norm": 0.30434712767601013, "learning_rate": 8.198016122810388e-05, "loss": 0.6682, "step": 3760 }, { "epoch": 0.5747029835351645, "grad_norm": 0.3250044584274292, "learning_rate": 8.193106646690006e-05, "loss": 0.5475, "step": 3761 }, { "epoch": 0.5748557894334722, "grad_norm": 0.2767269015312195, "learning_rate": 8.188197620741933e-05, "loss": 0.8508, "step": 3762 }, { "epoch": 0.5750085953317798, "grad_norm": 0.3154396414756775, "learning_rate": 8.183289046189213e-05, "loss": 0.782, "step": 3763 }, { "epoch": 0.5751614012300875, "grad_norm": 0.3278322219848633, "learning_rate": 8.178380924254775e-05, "loss": 0.5591, "step": 3764 }, { "epoch": 0.5753142071283952, "grad_norm": 0.29018348455429077, "learning_rate": 8.173473256161445e-05, "loss": 0.7719, "step": 3765 }, { "epoch": 0.5754670130267028, "grad_norm": 0.307338148355484, "learning_rate": 8.168566043131917e-05, "loss": 0.8133, "step": 3766 }, { "epoch": 0.5756198189250105, "grad_norm": 0.35957232117652893, "learning_rate": 8.163659286388784e-05, "loss": 0.7925, "step": 3767 }, { "epoch": 0.5757726248233181, "grad_norm": 0.29846546053886414, "learning_rate": 8.158752987154533e-05, "loss": 0.6603, "step": 3768 }, { "epoch": 0.5759254307216258, "grad_norm": 0.3277988135814667, "learning_rate": 8.153847146651511e-05, "loss": 0.7112, "step": 3769 }, { "epoch": 0.5760782366199335, "grad_norm": 0.3009068965911865, "learning_rate": 8.148941766101979e-05, "loss": 0.7852, "step": 3770 }, { "epoch": 0.5762310425182412, "grad_norm": 0.3635782301425934, "learning_rate": 8.144036846728063e-05, "loss": 0.6492, "step": 3771 }, { "epoch": 0.5763838484165489, "grad_norm": 0.3081236779689789, "learning_rate": 8.139132389751793e-05, "loss": 0.8141, "step": 3772 }, { "epoch": 0.5765366543148566, "grad_norm": 0.2913459837436676, "learning_rate": 8.134228396395067e-05, "loss": 0.6704, "step": 3773 }, { "epoch": 0.5766894602131643, "grad_norm": 0.29093053936958313, "learning_rate": 8.129324867879673e-05, "loss": 0.7357, "step": 3774 }, { "epoch": 0.5768422661114719, "grad_norm": 0.39256051182746887, "learning_rate": 8.124421805427286e-05, "loss": 0.7393, "step": 3775 }, { "epoch": 0.5769950720097796, "grad_norm": 0.54695063829422, "learning_rate": 8.11951921025946e-05, "loss": 0.6199, "step": 3776 }, { "epoch": 0.5771478779080873, "grad_norm": 0.27664807438850403, "learning_rate": 8.114617083597639e-05, "loss": 0.523, "step": 3777 }, { "epoch": 0.5773006838063949, "grad_norm": 0.27972468733787537, "learning_rate": 8.109715426663145e-05, "loss": 0.6728, "step": 3778 }, { "epoch": 0.5774534897047026, "grad_norm": 0.3046027719974518, "learning_rate": 8.104814240677188e-05, "loss": 0.5586, "step": 3779 }, { "epoch": 0.5776062956030102, "grad_norm": 0.3104955852031708, "learning_rate": 8.099913526860849e-05, "loss": 0.7716, "step": 3780 }, { "epoch": 0.5777591015013179, "grad_norm": 0.28801658749580383, "learning_rate": 8.095013286435107e-05, "loss": 0.7354, "step": 3781 }, { "epoch": 0.5779119073996256, "grad_norm": 0.2638983428478241, "learning_rate": 8.090113520620816e-05, "loss": 0.6428, "step": 3782 }, { "epoch": 0.5780647132979333, "grad_norm": 0.4122447669506073, "learning_rate": 8.085214230638707e-05, "loss": 0.6169, "step": 3783 }, { "epoch": 0.578217519196241, "grad_norm": 0.36378583312034607, "learning_rate": 8.080315417709398e-05, "loss": 0.5359, "step": 3784 }, { "epoch": 0.5783703250945487, "grad_norm": 0.2803877592086792, "learning_rate": 8.075417083053389e-05, "loss": 0.8017, "step": 3785 }, { "epoch": 0.5785231309928563, "grad_norm": 0.3822747468948364, "learning_rate": 8.070519227891063e-05, "loss": 1.0106, "step": 3786 }, { "epoch": 0.578675936891164, "grad_norm": 0.31396767497062683, "learning_rate": 8.065621853442669e-05, "loss": 0.5438, "step": 3787 }, { "epoch": 0.5788287427894717, "grad_norm": 0.3391607105731964, "learning_rate": 8.060724960928354e-05, "loss": 1.0131, "step": 3788 }, { "epoch": 0.5789815486877794, "grad_norm": 0.3262687027454376, "learning_rate": 8.055828551568138e-05, "loss": 0.8778, "step": 3789 }, { "epoch": 0.579134354586087, "grad_norm": 0.2590049207210541, "learning_rate": 8.050932626581918e-05, "loss": 0.6414, "step": 3790 }, { "epoch": 0.5792871604843947, "grad_norm": 0.31911787390708923, "learning_rate": 8.046037187189471e-05, "loss": 0.7003, "step": 3791 }, { "epoch": 0.5794399663827023, "grad_norm": 0.281447172164917, "learning_rate": 8.04114223461046e-05, "loss": 0.711, "step": 3792 }, { "epoch": 0.57959277228101, "grad_norm": 0.3643721640110016, "learning_rate": 8.036247770064418e-05, "loss": 0.5737, "step": 3793 }, { "epoch": 0.5797455781793177, "grad_norm": 0.3160751760005951, "learning_rate": 8.031353794770757e-05, "loss": 0.6465, "step": 3794 }, { "epoch": 0.5798983840776254, "grad_norm": 0.28902941942214966, "learning_rate": 8.026460309948774e-05, "loss": 0.5288, "step": 3795 }, { "epoch": 0.5800511899759331, "grad_norm": 0.28848549723625183, "learning_rate": 8.021567316817637e-05, "loss": 0.7132, "step": 3796 }, { "epoch": 0.5802039958742408, "grad_norm": 0.35009557008743286, "learning_rate": 8.0166748165964e-05, "loss": 0.6089, "step": 3797 }, { "epoch": 0.5803568017725484, "grad_norm": 0.31023460626602173, "learning_rate": 8.011782810503979e-05, "loss": 0.6298, "step": 3798 }, { "epoch": 0.5805096076708561, "grad_norm": 0.32872024178504944, "learning_rate": 8.006891299759183e-05, "loss": 0.6994, "step": 3799 }, { "epoch": 0.5806624135691638, "grad_norm": 0.25840136408805847, "learning_rate": 8.002000285580692e-05, "loss": 0.681, "step": 3800 }, { "epoch": 0.5808152194674715, "grad_norm": 0.3060307502746582, "learning_rate": 7.997109769187054e-05, "loss": 0.7211, "step": 3801 }, { "epoch": 0.5809680253657791, "grad_norm": 0.43128442764282227, "learning_rate": 7.992219751796704e-05, "loss": 0.9828, "step": 3802 }, { "epoch": 0.5811208312640868, "grad_norm": 0.3138619661331177, "learning_rate": 7.987330234627951e-05, "loss": 0.7311, "step": 3803 }, { "epoch": 0.5812736371623944, "grad_norm": 0.39342501759529114, "learning_rate": 7.982441218898977e-05, "loss": 0.7003, "step": 3804 }, { "epoch": 0.5814264430607021, "grad_norm": 1.905755639076233, "learning_rate": 7.977552705827836e-05, "loss": 0.8023, "step": 3805 }, { "epoch": 0.5815792489590098, "grad_norm": 0.26288965344429016, "learning_rate": 7.972664696632458e-05, "loss": 0.621, "step": 3806 }, { "epoch": 0.5817320548573175, "grad_norm": 0.2712186276912689, "learning_rate": 7.967777192530658e-05, "loss": 0.572, "step": 3807 }, { "epoch": 0.5818848607556252, "grad_norm": 0.28509825468063354, "learning_rate": 7.962890194740109e-05, "loss": 0.6408, "step": 3808 }, { "epoch": 0.5820376666539329, "grad_norm": 0.2955816984176636, "learning_rate": 7.958003704478368e-05, "loss": 0.5941, "step": 3809 }, { "epoch": 0.5821904725522405, "grad_norm": 0.3763854205608368, "learning_rate": 7.953117722962862e-05, "loss": 0.6688, "step": 3810 }, { "epoch": 0.5823432784505482, "grad_norm": 0.4830414652824402, "learning_rate": 7.948232251410896e-05, "loss": 0.6907, "step": 3811 }, { "epoch": 0.5824960843488559, "grad_norm": 0.4125008285045624, "learning_rate": 7.94334729103964e-05, "loss": 0.7622, "step": 3812 }, { "epoch": 0.5826488902471635, "grad_norm": 0.45555487275123596, "learning_rate": 7.938462843066142e-05, "loss": 0.903, "step": 3813 }, { "epoch": 0.5828016961454712, "grad_norm": 0.3383921682834625, "learning_rate": 7.933578908707326e-05, "loss": 0.77, "step": 3814 }, { "epoch": 0.5829545020437789, "grad_norm": 0.28087118268013, "learning_rate": 7.928695489179972e-05, "loss": 0.5502, "step": 3815 }, { "epoch": 0.5831073079420865, "grad_norm": 0.32582974433898926, "learning_rate": 7.923812585700753e-05, "loss": 0.6114, "step": 3816 }, { "epoch": 0.5832601138403942, "grad_norm": 0.33687683939933777, "learning_rate": 7.918930199486197e-05, "loss": 0.7654, "step": 3817 }, { "epoch": 0.5834129197387019, "grad_norm": 0.35183513164520264, "learning_rate": 7.914048331752719e-05, "loss": 0.6804, "step": 3818 }, { "epoch": 0.5835657256370096, "grad_norm": 0.35134196281433105, "learning_rate": 7.909166983716586e-05, "loss": 0.8217, "step": 3819 }, { "epoch": 0.5837185315353173, "grad_norm": 0.24550901353359222, "learning_rate": 7.904286156593948e-05, "loss": 0.6473, "step": 3820 }, { "epoch": 0.583871337433625, "grad_norm": 0.28883498907089233, "learning_rate": 7.899405851600822e-05, "loss": 0.6587, "step": 3821 }, { "epoch": 0.5840241433319326, "grad_norm": 0.29037415981292725, "learning_rate": 7.894526069953094e-05, "loss": 0.6768, "step": 3822 }, { "epoch": 0.5841769492302403, "grad_norm": 0.2939176857471466, "learning_rate": 7.889646812866524e-05, "loss": 0.5691, "step": 3823 }, { "epoch": 0.584329755128548, "grad_norm": 0.29959577322006226, "learning_rate": 7.884768081556735e-05, "loss": 0.7104, "step": 3824 }, { "epoch": 0.5844825610268556, "grad_norm": 0.2616795003414154, "learning_rate": 7.879889877239224e-05, "loss": 0.6442, "step": 3825 }, { "epoch": 0.5846353669251633, "grad_norm": 0.29472115635871887, "learning_rate": 7.87501220112935e-05, "loss": 0.3929, "step": 3826 }, { "epoch": 0.584788172823471, "grad_norm": 0.23962584137916565, "learning_rate": 7.87013505444235e-05, "loss": 0.6744, "step": 3827 }, { "epoch": 0.5849409787217786, "grad_norm": 0.26623886823654175, "learning_rate": 7.865258438393322e-05, "loss": 0.6706, "step": 3828 }, { "epoch": 0.5850937846200863, "grad_norm": 0.3564209043979645, "learning_rate": 7.860382354197239e-05, "loss": 0.8078, "step": 3829 }, { "epoch": 0.585246590518394, "grad_norm": 0.2812064588069916, "learning_rate": 7.855506803068926e-05, "loss": 0.6951, "step": 3830 }, { "epoch": 0.5853993964167017, "grad_norm": 0.29761627316474915, "learning_rate": 7.850631786223093e-05, "loss": 0.5924, "step": 3831 }, { "epoch": 0.5855522023150094, "grad_norm": 0.2774466276168823, "learning_rate": 7.845757304874313e-05, "loss": 0.5304, "step": 3832 }, { "epoch": 0.5857050082133171, "grad_norm": 0.31062594056129456, "learning_rate": 7.84088336023701e-05, "loss": 0.6803, "step": 3833 }, { "epoch": 0.5858578141116247, "grad_norm": 0.30979427695274353, "learning_rate": 7.836009953525499e-05, "loss": 0.7159, "step": 3834 }, { "epoch": 0.5860106200099324, "grad_norm": 0.30622562766075134, "learning_rate": 7.83113708595394e-05, "loss": 0.6932, "step": 3835 }, { "epoch": 0.5861634259082401, "grad_norm": 0.28833743929862976, "learning_rate": 7.826264758736374e-05, "loss": 0.8625, "step": 3836 }, { "epoch": 0.5863162318065477, "grad_norm": 0.30535200238227844, "learning_rate": 7.821392973086691e-05, "loss": 0.9028, "step": 3837 }, { "epoch": 0.5864690377048554, "grad_norm": 0.3745479881763458, "learning_rate": 7.816521730218663e-05, "loss": 0.8378, "step": 3838 }, { "epoch": 0.586621843603163, "grad_norm": 0.35396698117256165, "learning_rate": 7.811651031345921e-05, "loss": 0.7586, "step": 3839 }, { "epoch": 0.5867746495014707, "grad_norm": 0.29585057497024536, "learning_rate": 7.806780877681952e-05, "loss": 0.6075, "step": 3840 }, { "epoch": 0.5869274553997784, "grad_norm": 0.30357035994529724, "learning_rate": 7.801911270440114e-05, "loss": 0.6288, "step": 3841 }, { "epoch": 0.5870802612980861, "grad_norm": 0.28360724449157715, "learning_rate": 7.797042210833635e-05, "loss": 0.6806, "step": 3842 }, { "epoch": 0.5872330671963938, "grad_norm": 0.30030885338783264, "learning_rate": 7.792173700075598e-05, "loss": 0.6808, "step": 3843 }, { "epoch": 0.5873858730947015, "grad_norm": 0.31683292984962463, "learning_rate": 7.78730573937895e-05, "loss": 0.8747, "step": 3844 }, { "epoch": 0.5875386789930092, "grad_norm": 0.34894859790802, "learning_rate": 7.7824383299565e-05, "loss": 0.9388, "step": 3845 }, { "epoch": 0.5876914848913168, "grad_norm": 1.2283755540847778, "learning_rate": 7.777571473020931e-05, "loss": 0.5487, "step": 3846 }, { "epoch": 0.5878442907896245, "grad_norm": 0.3542500138282776, "learning_rate": 7.772705169784769e-05, "loss": 0.7024, "step": 3847 }, { "epoch": 0.5879970966879322, "grad_norm": 0.3123841881752014, "learning_rate": 7.767839421460417e-05, "loss": 0.6317, "step": 3848 }, { "epoch": 0.5881499025862398, "grad_norm": 0.3913807272911072, "learning_rate": 7.762974229260138e-05, "loss": 0.7937, "step": 3849 }, { "epoch": 0.5883027084845475, "grad_norm": 0.24539242684841156, "learning_rate": 7.758109594396054e-05, "loss": 0.6266, "step": 3850 }, { "epoch": 0.5884555143828551, "grad_norm": 0.2929461598396301, "learning_rate": 7.753245518080143e-05, "loss": 0.6406, "step": 3851 }, { "epoch": 0.5886083202811628, "grad_norm": 0.5563262701034546, "learning_rate": 7.748382001524249e-05, "loss": 0.9369, "step": 3852 }, { "epoch": 0.5887611261794705, "grad_norm": 0.30135178565979004, "learning_rate": 7.743519045940083e-05, "loss": 0.4966, "step": 3853 }, { "epoch": 0.5889139320777782, "grad_norm": 0.2895030975341797, "learning_rate": 7.738656652539204e-05, "loss": 0.6125, "step": 3854 }, { "epoch": 0.5890667379760859, "grad_norm": 0.33835187554359436, "learning_rate": 7.733794822533038e-05, "loss": 0.5807, "step": 3855 }, { "epoch": 0.5892195438743936, "grad_norm": 0.2818509042263031, "learning_rate": 7.728933557132864e-05, "loss": 0.6172, "step": 3856 }, { "epoch": 0.5893723497727013, "grad_norm": 0.30797627568244934, "learning_rate": 7.724072857549838e-05, "loss": 0.7899, "step": 3857 }, { "epoch": 0.5895251556710089, "grad_norm": 0.30569693446159363, "learning_rate": 7.719212724994951e-05, "loss": 0.6897, "step": 3858 }, { "epoch": 0.5896779615693166, "grad_norm": 0.255311518907547, "learning_rate": 7.714353160679066e-05, "loss": 0.7461, "step": 3859 }, { "epoch": 0.5898307674676242, "grad_norm": 0.2892606556415558, "learning_rate": 7.709494165812907e-05, "loss": 0.5905, "step": 3860 }, { "epoch": 0.5899835733659319, "grad_norm": 0.24588391184806824, "learning_rate": 7.704635741607052e-05, "loss": 0.5826, "step": 3861 }, { "epoch": 0.5901363792642396, "grad_norm": 0.25809404253959656, "learning_rate": 7.69977788927193e-05, "loss": 0.6255, "step": 3862 }, { "epoch": 0.5902891851625472, "grad_norm": 0.3097207546234131, "learning_rate": 7.69492061001784e-05, "loss": 0.6712, "step": 3863 }, { "epoch": 0.5904419910608549, "grad_norm": 0.31905776262283325, "learning_rate": 7.690063905054933e-05, "loss": 0.9132, "step": 3864 }, { "epoch": 0.5905947969591626, "grad_norm": 0.3189091384410858, "learning_rate": 7.685207775593211e-05, "loss": 0.727, "step": 3865 }, { "epoch": 0.5907476028574703, "grad_norm": 0.3288535177707672, "learning_rate": 7.680352222842541e-05, "loss": 0.5702, "step": 3866 }, { "epoch": 0.590900408755778, "grad_norm": 0.2501838207244873, "learning_rate": 7.67549724801264e-05, "loss": 0.5476, "step": 3867 }, { "epoch": 0.5910532146540857, "grad_norm": 0.25239741802215576, "learning_rate": 7.670642852313094e-05, "loss": 0.6705, "step": 3868 }, { "epoch": 0.5912060205523934, "grad_norm": 0.3150840401649475, "learning_rate": 7.665789036953324e-05, "loss": 0.6463, "step": 3869 }, { "epoch": 0.591358826450701, "grad_norm": 0.29708942770957947, "learning_rate": 7.660935803142621e-05, "loss": 0.9097, "step": 3870 }, { "epoch": 0.5915116323490087, "grad_norm": 0.3262752592563629, "learning_rate": 7.656083152090133e-05, "loss": 0.6029, "step": 3871 }, { "epoch": 0.5916644382473163, "grad_norm": 0.3073914647102356, "learning_rate": 7.651231085004845e-05, "loss": 0.7531, "step": 3872 }, { "epoch": 0.591817244145624, "grad_norm": 0.3142286241054535, "learning_rate": 7.646379603095619e-05, "loss": 0.8999, "step": 3873 }, { "epoch": 0.5919700500439317, "grad_norm": 0.30331626534461975, "learning_rate": 7.641528707571157e-05, "loss": 0.5738, "step": 3874 }, { "epoch": 0.5921228559422393, "grad_norm": 0.2707924246788025, "learning_rate": 7.636678399640026e-05, "loss": 0.6544, "step": 3875 }, { "epoch": 0.592275661840547, "grad_norm": 0.4686855375766754, "learning_rate": 7.631828680510626e-05, "loss": 0.5677, "step": 3876 }, { "epoch": 0.5924284677388547, "grad_norm": 0.2566758096218109, "learning_rate": 7.626979551391235e-05, "loss": 0.6577, "step": 3877 }, { "epoch": 0.5925812736371624, "grad_norm": 0.30719277262687683, "learning_rate": 7.622131013489971e-05, "loss": 0.6697, "step": 3878 }, { "epoch": 0.5927340795354701, "grad_norm": 0.349299818277359, "learning_rate": 7.617283068014797e-05, "loss": 0.8471, "step": 3879 }, { "epoch": 0.5928868854337778, "grad_norm": 0.31798675656318665, "learning_rate": 7.612435716173552e-05, "loss": 0.9319, "step": 3880 }, { "epoch": 0.5930396913320855, "grad_norm": 0.34878382086753845, "learning_rate": 7.607588959173904e-05, "loss": 0.7974, "step": 3881 }, { "epoch": 0.5931924972303931, "grad_norm": 0.3770315945148468, "learning_rate": 7.602742798223388e-05, "loss": 0.6537, "step": 3882 }, { "epoch": 0.5933453031287008, "grad_norm": 0.2860184907913208, "learning_rate": 7.597897234529374e-05, "loss": 0.6633, "step": 3883 }, { "epoch": 0.5934981090270084, "grad_norm": 0.27172017097473145, "learning_rate": 7.593052269299105e-05, "loss": 0.724, "step": 3884 }, { "epoch": 0.5936509149253161, "grad_norm": 0.3685009479522705, "learning_rate": 7.58820790373966e-05, "loss": 0.4991, "step": 3885 }, { "epoch": 0.5938037208236238, "grad_norm": 0.3282112181186676, "learning_rate": 7.583364139057966e-05, "loss": 0.6445, "step": 3886 }, { "epoch": 0.5939565267219314, "grad_norm": 0.28819167613983154, "learning_rate": 7.578520976460813e-05, "loss": 0.7517, "step": 3887 }, { "epoch": 0.5941093326202391, "grad_norm": 0.34896764159202576, "learning_rate": 7.573678417154831e-05, "loss": 0.7079, "step": 3888 }, { "epoch": 0.5942621385185468, "grad_norm": 0.28771957755088806, "learning_rate": 7.568836462346509e-05, "loss": 0.7737, "step": 3889 }, { "epoch": 0.5944149444168545, "grad_norm": 0.2801218330860138, "learning_rate": 7.563995113242171e-05, "loss": 0.6842, "step": 3890 }, { "epoch": 0.5945677503151622, "grad_norm": 0.30863484740257263, "learning_rate": 7.559154371048e-05, "loss": 0.7982, "step": 3891 }, { "epoch": 0.5947205562134699, "grad_norm": 0.30108898878097534, "learning_rate": 7.554314236970032e-05, "loss": 0.6757, "step": 3892 }, { "epoch": 0.5948733621117775, "grad_norm": 0.25410279631614685, "learning_rate": 7.549474712214141e-05, "loss": 0.7674, "step": 3893 }, { "epoch": 0.5950261680100852, "grad_norm": 0.27434930205345154, "learning_rate": 7.544635797986053e-05, "loss": 0.7742, "step": 3894 }, { "epoch": 0.5951789739083929, "grad_norm": 0.3767421245574951, "learning_rate": 7.539797495491347e-05, "loss": 0.6442, "step": 3895 }, { "epoch": 0.5953317798067005, "grad_norm": 2.1998231410980225, "learning_rate": 7.534959805935444e-05, "loss": 0.9633, "step": 3896 }, { "epoch": 0.5954845857050082, "grad_norm": 0.28787243366241455, "learning_rate": 7.530122730523613e-05, "loss": 0.6614, "step": 3897 }, { "epoch": 0.5956373916033159, "grad_norm": 0.2915334105491638, "learning_rate": 7.52528627046097e-05, "loss": 0.6121, "step": 3898 }, { "epoch": 0.5957901975016235, "grad_norm": 0.40670573711395264, "learning_rate": 7.520450426952479e-05, "loss": 0.5696, "step": 3899 }, { "epoch": 0.5959430033999312, "grad_norm": 0.25353896617889404, "learning_rate": 7.515615201202953e-05, "loss": 0.6941, "step": 3900 }, { "epoch": 0.5960958092982389, "grad_norm": 0.5514530539512634, "learning_rate": 7.510780594417043e-05, "loss": 0.6979, "step": 3901 }, { "epoch": 0.5962486151965466, "grad_norm": 0.28294044733047485, "learning_rate": 7.505946607799251e-05, "loss": 0.5892, "step": 3902 }, { "epoch": 0.5964014210948543, "grad_norm": 0.2962487041950226, "learning_rate": 7.50111324255393e-05, "loss": 0.6492, "step": 3903 }, { "epoch": 0.596554226993162, "grad_norm": 0.34200263023376465, "learning_rate": 7.496280499885267e-05, "loss": 0.5383, "step": 3904 }, { "epoch": 0.5967070328914696, "grad_norm": 0.5539739727973938, "learning_rate": 7.4914483809973e-05, "loss": 0.8527, "step": 3905 }, { "epoch": 0.5968598387897773, "grad_norm": 0.28289106488227844, "learning_rate": 7.48661688709391e-05, "loss": 0.6176, "step": 3906 }, { "epoch": 0.597012644688085, "grad_norm": 0.2907141447067261, "learning_rate": 7.481786019378827e-05, "loss": 1.0129, "step": 3907 }, { "epoch": 0.5971654505863926, "grad_norm": 0.3366968035697937, "learning_rate": 7.476955779055618e-05, "loss": 0.7976, "step": 3908 }, { "epoch": 0.5973182564847003, "grad_norm": 0.2729245722293854, "learning_rate": 7.472126167327695e-05, "loss": 0.7484, "step": 3909 }, { "epoch": 0.597471062383008, "grad_norm": 0.29188403487205505, "learning_rate": 7.467297185398324e-05, "loss": 0.6826, "step": 3910 }, { "epoch": 0.5976238682813156, "grad_norm": 0.3058101236820221, "learning_rate": 7.462468834470592e-05, "loss": 0.6161, "step": 3911 }, { "epoch": 0.5977766741796233, "grad_norm": 0.29125460982322693, "learning_rate": 7.457641115747453e-05, "loss": 0.6507, "step": 3912 }, { "epoch": 0.597929480077931, "grad_norm": 0.322819322347641, "learning_rate": 7.452814030431687e-05, "loss": 0.7652, "step": 3913 }, { "epoch": 0.5980822859762387, "grad_norm": 0.27412402629852295, "learning_rate": 7.447987579725928e-05, "loss": 0.8564, "step": 3914 }, { "epoch": 0.5982350918745464, "grad_norm": 0.2602679133415222, "learning_rate": 7.443161764832638e-05, "loss": 0.6872, "step": 3915 }, { "epoch": 0.5983878977728541, "grad_norm": 0.3177022337913513, "learning_rate": 7.438336586954131e-05, "loss": 0.6176, "step": 3916 }, { "epoch": 0.5985407036711617, "grad_norm": 0.2916209101676941, "learning_rate": 7.433512047292563e-05, "loss": 0.5914, "step": 3917 }, { "epoch": 0.5986935095694694, "grad_norm": 0.2940508723258972, "learning_rate": 7.428688147049921e-05, "loss": 0.6788, "step": 3918 }, { "epoch": 0.598846315467777, "grad_norm": 0.31359565258026123, "learning_rate": 7.423864887428044e-05, "loss": 0.8232, "step": 3919 }, { "epoch": 0.5989991213660847, "grad_norm": 0.33102843165397644, "learning_rate": 7.419042269628606e-05, "loss": 0.8431, "step": 3920 }, { "epoch": 0.5991519272643924, "grad_norm": 0.3415786027908325, "learning_rate": 7.414220294853125e-05, "loss": 0.8176, "step": 3921 }, { "epoch": 0.5993047331627, "grad_norm": 0.2847096025943756, "learning_rate": 7.409398964302947e-05, "loss": 0.6231, "step": 3922 }, { "epoch": 0.5994575390610077, "grad_norm": 0.3127872347831726, "learning_rate": 7.404578279179273e-05, "loss": 0.5949, "step": 3923 }, { "epoch": 0.5996103449593154, "grad_norm": 0.31126832962036133, "learning_rate": 7.399758240683134e-05, "loss": 0.6723, "step": 3924 }, { "epoch": 0.5997631508576231, "grad_norm": 0.26205122470855713, "learning_rate": 7.394938850015402e-05, "loss": 0.6486, "step": 3925 }, { "epoch": 0.5999159567559308, "grad_norm": 0.9165391325950623, "learning_rate": 7.390120108376785e-05, "loss": 0.788, "step": 3926 }, { "epoch": 0.6000687626542385, "grad_norm": 0.32874244451522827, "learning_rate": 7.385302016967839e-05, "loss": 0.6388, "step": 3927 }, { "epoch": 0.6002215685525462, "grad_norm": 0.7829940319061279, "learning_rate": 7.380484576988948e-05, "loss": 0.5911, "step": 3928 }, { "epoch": 0.6003743744508538, "grad_norm": 0.5211532711982727, "learning_rate": 7.375667789640331e-05, "loss": 0.8848, "step": 3929 }, { "epoch": 0.6005271803491615, "grad_norm": 0.3158925175666809, "learning_rate": 7.370851656122058e-05, "loss": 0.6837, "step": 3930 }, { "epoch": 0.6006799862474691, "grad_norm": 0.3350100815296173, "learning_rate": 7.366036177634027e-05, "loss": 0.7339, "step": 3931 }, { "epoch": 0.6008327921457768, "grad_norm": 0.28904014825820923, "learning_rate": 7.36122135537597e-05, "loss": 0.5846, "step": 3932 }, { "epoch": 0.6009855980440845, "grad_norm": 0.27264395356178284, "learning_rate": 7.356407190547459e-05, "loss": 0.7204, "step": 3933 }, { "epoch": 0.6011384039423922, "grad_norm": 0.34374403953552246, "learning_rate": 7.351593684347909e-05, "loss": 0.7039, "step": 3934 }, { "epoch": 0.6012912098406998, "grad_norm": 0.3035162091255188, "learning_rate": 7.346780837976563e-05, "loss": 1.0995, "step": 3935 }, { "epoch": 0.6014440157390075, "grad_norm": 0.3120017349720001, "learning_rate": 7.341968652632496e-05, "loss": 0.7253, "step": 3936 }, { "epoch": 0.6015968216373152, "grad_norm": 0.38665685057640076, "learning_rate": 7.337157129514627e-05, "loss": 0.8983, "step": 3937 }, { "epoch": 0.6017496275356229, "grad_norm": 0.34627819061279297, "learning_rate": 7.332346269821706e-05, "loss": 0.8088, "step": 3938 }, { "epoch": 0.6019024334339306, "grad_norm": 0.29683157801628113, "learning_rate": 7.327536074752324e-05, "loss": 0.7064, "step": 3939 }, { "epoch": 0.6020552393322383, "grad_norm": 0.5646716952323914, "learning_rate": 7.32272654550489e-05, "loss": 0.7454, "step": 3940 }, { "epoch": 0.6022080452305459, "grad_norm": 0.462716668844223, "learning_rate": 7.317917683277665e-05, "loss": 0.6755, "step": 3941 }, { "epoch": 0.6023608511288536, "grad_norm": 0.31523165106773376, "learning_rate": 7.313109489268738e-05, "loss": 0.779, "step": 3942 }, { "epoch": 0.6025136570271612, "grad_norm": 0.29431718587875366, "learning_rate": 7.308301964676026e-05, "loss": 0.6833, "step": 3943 }, { "epoch": 0.6026664629254689, "grad_norm": 0.29744040966033936, "learning_rate": 7.303495110697281e-05, "loss": 0.7451, "step": 3944 }, { "epoch": 0.6028192688237766, "grad_norm": 0.3033977448940277, "learning_rate": 7.298688928530098e-05, "loss": 0.937, "step": 3945 }, { "epoch": 0.6029720747220843, "grad_norm": 0.3462549149990082, "learning_rate": 7.293883419371893e-05, "loss": 0.8325, "step": 3946 }, { "epoch": 0.6031248806203919, "grad_norm": 0.28772634267807007, "learning_rate": 7.289078584419918e-05, "loss": 0.7318, "step": 3947 }, { "epoch": 0.6032776865186996, "grad_norm": 0.2670397162437439, "learning_rate": 7.284274424871254e-05, "loss": 0.5443, "step": 3948 }, { "epoch": 0.6034304924170073, "grad_norm": 0.3772238790988922, "learning_rate": 7.279470941922826e-05, "loss": 0.7199, "step": 3949 }, { "epoch": 0.603583298315315, "grad_norm": 0.27530890703201294, "learning_rate": 7.274668136771373e-05, "loss": 0.7777, "step": 3950 }, { "epoch": 0.6037361042136227, "grad_norm": 0.3031236529350281, "learning_rate": 7.269866010613477e-05, "loss": 0.7969, "step": 3951 }, { "epoch": 0.6038889101119304, "grad_norm": 0.4699702262878418, "learning_rate": 7.265064564645545e-05, "loss": 0.8742, "step": 3952 }, { "epoch": 0.604041716010238, "grad_norm": 0.2931947708129883, "learning_rate": 7.260263800063822e-05, "loss": 0.6974, "step": 3953 }, { "epoch": 0.6041945219085457, "grad_norm": 0.2625153660774231, "learning_rate": 7.255463718064375e-05, "loss": 0.6238, "step": 3954 }, { "epoch": 0.6043473278068533, "grad_norm": 0.2817601263523102, "learning_rate": 7.250664319843101e-05, "loss": 0.6791, "step": 3955 }, { "epoch": 0.604500133705161, "grad_norm": 0.29988333582878113, "learning_rate": 7.245865606595741e-05, "loss": 0.6681, "step": 3956 }, { "epoch": 0.6046529396034687, "grad_norm": 0.29616379737854004, "learning_rate": 7.241067579517837e-05, "loss": 0.7775, "step": 3957 }, { "epoch": 0.6048057455017763, "grad_norm": 0.28116655349731445, "learning_rate": 7.236270239804792e-05, "loss": 0.8737, "step": 3958 }, { "epoch": 0.604958551400084, "grad_norm": 0.30657532811164856, "learning_rate": 7.231473588651814e-05, "loss": 0.8031, "step": 3959 }, { "epoch": 0.6051113572983917, "grad_norm": 0.30859723687171936, "learning_rate": 7.226677627253955e-05, "loss": 0.6121, "step": 3960 }, { "epoch": 0.6052641631966994, "grad_norm": 0.23964034020900726, "learning_rate": 7.221882356806083e-05, "loss": 0.6389, "step": 3961 }, { "epoch": 0.6054169690950071, "grad_norm": 0.26439711451530457, "learning_rate": 7.217087778502903e-05, "loss": 0.6267, "step": 3962 }, { "epoch": 0.6055697749933148, "grad_norm": 0.9159783124923706, "learning_rate": 7.212293893538944e-05, "loss": 0.4435, "step": 3963 }, { "epoch": 0.6057225808916225, "grad_norm": 0.7968850135803223, "learning_rate": 7.207500703108556e-05, "loss": 0.7617, "step": 3964 }, { "epoch": 0.6058753867899301, "grad_norm": 0.4541511535644531, "learning_rate": 7.202708208405928e-05, "loss": 0.6902, "step": 3965 }, { "epoch": 0.6060281926882378, "grad_norm": 0.3079363703727722, "learning_rate": 7.197916410625072e-05, "loss": 0.5515, "step": 3966 }, { "epoch": 0.6061809985865454, "grad_norm": 0.287811279296875, "learning_rate": 7.193125310959821e-05, "loss": 0.7739, "step": 3967 }, { "epoch": 0.6063338044848531, "grad_norm": 0.3375343978404999, "learning_rate": 7.188334910603832e-05, "loss": 0.6862, "step": 3968 }, { "epoch": 0.6064866103831608, "grad_norm": 0.3060528039932251, "learning_rate": 7.183545210750602e-05, "loss": 0.7394, "step": 3969 }, { "epoch": 0.6066394162814684, "grad_norm": 0.279608815908432, "learning_rate": 7.178756212593443e-05, "loss": 0.6801, "step": 3970 }, { "epoch": 0.6067922221797761, "grad_norm": 0.33723247051239014, "learning_rate": 7.173967917325488e-05, "loss": 0.675, "step": 3971 }, { "epoch": 0.6069450280780838, "grad_norm": 0.32487475872039795, "learning_rate": 7.169180326139702e-05, "loss": 0.7913, "step": 3972 }, { "epoch": 0.6070978339763915, "grad_norm": 0.2952229678630829, "learning_rate": 7.164393440228878e-05, "loss": 0.6479, "step": 3973 }, { "epoch": 0.6072506398746992, "grad_norm": 0.2784630060195923, "learning_rate": 7.159607260785627e-05, "loss": 0.8433, "step": 3974 }, { "epoch": 0.6074034457730069, "grad_norm": 0.2817748785018921, "learning_rate": 7.15482178900238e-05, "loss": 0.6597, "step": 3975 }, { "epoch": 0.6075562516713146, "grad_norm": 0.32576805353164673, "learning_rate": 7.150037026071405e-05, "loss": 0.9512, "step": 3976 }, { "epoch": 0.6077090575696222, "grad_norm": 0.2870212495326996, "learning_rate": 7.145252973184779e-05, "loss": 0.7329, "step": 3977 }, { "epoch": 0.6078618634679298, "grad_norm": 0.2842814326286316, "learning_rate": 7.140469631534414e-05, "loss": 0.8501, "step": 3978 }, { "epoch": 0.6080146693662375, "grad_norm": 0.3353877663612366, "learning_rate": 7.135687002312035e-05, "loss": 0.6133, "step": 3979 }, { "epoch": 0.6081674752645452, "grad_norm": 0.33758804202079773, "learning_rate": 7.130905086709196e-05, "loss": 0.5174, "step": 3980 }, { "epoch": 0.6083202811628529, "grad_norm": 0.2900623083114624, "learning_rate": 7.126123885917272e-05, "loss": 0.4506, "step": 3981 }, { "epoch": 0.6084730870611605, "grad_norm": 0.3299383819103241, "learning_rate": 7.121343401127456e-05, "loss": 0.6244, "step": 3982 }, { "epoch": 0.6086258929594682, "grad_norm": 0.25950318574905396, "learning_rate": 7.116563633530766e-05, "loss": 0.6782, "step": 3983 }, { "epoch": 0.6087786988577759, "grad_norm": 0.3207615613937378, "learning_rate": 7.111784584318044e-05, "loss": 0.7453, "step": 3984 }, { "epoch": 0.6089315047560836, "grad_norm": 0.30822837352752686, "learning_rate": 7.107006254679951e-05, "loss": 0.7912, "step": 3985 }, { "epoch": 0.6090843106543913, "grad_norm": 0.38215330243110657, "learning_rate": 7.102228645806963e-05, "loss": 0.7137, "step": 3986 }, { "epoch": 0.609237116552699, "grad_norm": 0.32029587030410767, "learning_rate": 7.097451758889382e-05, "loss": 0.762, "step": 3987 }, { "epoch": 0.6093899224510066, "grad_norm": 0.3142178952693939, "learning_rate": 7.092675595117333e-05, "loss": 0.6819, "step": 3988 }, { "epoch": 0.6095427283493143, "grad_norm": 0.28147318959236145, "learning_rate": 7.087900155680754e-05, "loss": 0.7674, "step": 3989 }, { "epoch": 0.6096955342476219, "grad_norm": 0.2938244938850403, "learning_rate": 7.083125441769402e-05, "loss": 0.6486, "step": 3990 }, { "epoch": 0.6098483401459296, "grad_norm": 0.3013629615306854, "learning_rate": 7.078351454572867e-05, "loss": 0.7303, "step": 3991 }, { "epoch": 0.6100011460442373, "grad_norm": 0.3084275722503662, "learning_rate": 7.073578195280541e-05, "loss": 0.7825, "step": 3992 }, { "epoch": 0.610153951942545, "grad_norm": 0.29917213320732117, "learning_rate": 7.068805665081641e-05, "loss": 0.7427, "step": 3993 }, { "epoch": 0.6103067578408526, "grad_norm": 0.26484012603759766, "learning_rate": 7.064033865165204e-05, "loss": 0.6877, "step": 3994 }, { "epoch": 0.6104595637391603, "grad_norm": 0.30923646688461304, "learning_rate": 7.059262796720088e-05, "loss": 0.7605, "step": 3995 }, { "epoch": 0.610612369637468, "grad_norm": 0.3539402484893799, "learning_rate": 7.054492460934958e-05, "loss": 0.6913, "step": 3996 }, { "epoch": 0.6107651755357757, "grad_norm": 0.3845704197883606, "learning_rate": 7.049722858998307e-05, "loss": 0.7764, "step": 3997 }, { "epoch": 0.6109179814340834, "grad_norm": 0.26904961466789246, "learning_rate": 7.044953992098436e-05, "loss": 0.6718, "step": 3998 }, { "epoch": 0.6110707873323911, "grad_norm": 0.31562212109565735, "learning_rate": 7.040185861423478e-05, "loss": 0.5668, "step": 3999 }, { "epoch": 0.6112235932306987, "grad_norm": 0.31812769174575806, "learning_rate": 7.035418468161365e-05, "loss": 0.7084, "step": 4000 }, { "epoch": 0.6113763991290064, "grad_norm": 0.28301897644996643, "learning_rate": 7.030651813499854e-05, "loss": 0.6407, "step": 4001 }, { "epoch": 0.611529205027314, "grad_norm": 0.3221738636493683, "learning_rate": 7.025885898626525e-05, "loss": 0.6902, "step": 4002 }, { "epoch": 0.6116820109256217, "grad_norm": 0.25532403588294983, "learning_rate": 7.021120724728751e-05, "loss": 0.7482, "step": 4003 }, { "epoch": 0.6118348168239294, "grad_norm": 0.2903250753879547, "learning_rate": 7.016356292993746e-05, "loss": 0.6027, "step": 4004 }, { "epoch": 0.6119876227222371, "grad_norm": 0.3196435272693634, "learning_rate": 7.011592604608523e-05, "loss": 0.6325, "step": 4005 }, { "epoch": 0.6121404286205447, "grad_norm": 0.251808226108551, "learning_rate": 7.006829660759923e-05, "loss": 0.6652, "step": 4006 }, { "epoch": 0.6122932345188524, "grad_norm": 0.3030737042427063, "learning_rate": 7.002067462634582e-05, "loss": 0.5607, "step": 4007 }, { "epoch": 0.6124460404171601, "grad_norm": 0.3054194748401642, "learning_rate": 6.99730601141897e-05, "loss": 0.6044, "step": 4008 }, { "epoch": 0.6125988463154678, "grad_norm": 0.2582065761089325, "learning_rate": 6.992545308299355e-05, "loss": 0.6113, "step": 4009 }, { "epoch": 0.6127516522137755, "grad_norm": 0.35208937525749207, "learning_rate": 6.987785354461838e-05, "loss": 0.6342, "step": 4010 }, { "epoch": 0.6129044581120832, "grad_norm": 0.35848256945610046, "learning_rate": 6.98302615109231e-05, "loss": 0.6829, "step": 4011 }, { "epoch": 0.6130572640103908, "grad_norm": 0.29076623916625977, "learning_rate": 6.978267699376494e-05, "loss": 0.8626, "step": 4012 }, { "epoch": 0.6132100699086985, "grad_norm": 0.28895699977874756, "learning_rate": 6.973510000499916e-05, "loss": 0.8021, "step": 4013 }, { "epoch": 0.6133628758070061, "grad_norm": 0.42235711216926575, "learning_rate": 6.968753055647915e-05, "loss": 1.0156, "step": 4014 }, { "epoch": 0.6135156817053138, "grad_norm": 0.2926298975944519, "learning_rate": 6.963996866005644e-05, "loss": 0.6561, "step": 4015 }, { "epoch": 0.6136684876036215, "grad_norm": 0.3840494751930237, "learning_rate": 6.959241432758067e-05, "loss": 0.6022, "step": 4016 }, { "epoch": 0.6138212935019292, "grad_norm": 0.28409114480018616, "learning_rate": 6.954486757089968e-05, "loss": 0.7626, "step": 4017 }, { "epoch": 0.6139740994002368, "grad_norm": 0.37249550223350525, "learning_rate": 6.949732840185926e-05, "loss": 0.8332, "step": 4018 }, { "epoch": 0.6141269052985445, "grad_norm": 0.2957054376602173, "learning_rate": 6.94497968323034e-05, "loss": 0.669, "step": 4019 }, { "epoch": 0.6142797111968522, "grad_norm": 0.24224689602851868, "learning_rate": 6.940227287407426e-05, "loss": 0.373, "step": 4020 }, { "epoch": 0.6144325170951599, "grad_norm": 0.4046684503555298, "learning_rate": 6.935475653901194e-05, "loss": 1.0801, "step": 4021 }, { "epoch": 0.6145853229934676, "grad_norm": 0.33295106887817383, "learning_rate": 6.930724783895481e-05, "loss": 0.8235, "step": 4022 }, { "epoch": 0.6147381288917753, "grad_norm": 0.2868274748325348, "learning_rate": 6.925974678573923e-05, "loss": 0.7053, "step": 4023 }, { "epoch": 0.6148909347900829, "grad_norm": 0.28332453966140747, "learning_rate": 6.921225339119972e-05, "loss": 0.7562, "step": 4024 }, { "epoch": 0.6150437406883905, "grad_norm": 0.3626477122306824, "learning_rate": 6.91647676671688e-05, "loss": 0.5815, "step": 4025 }, { "epoch": 0.6151965465866982, "grad_norm": 0.30137038230895996, "learning_rate": 6.911728962547719e-05, "loss": 0.6479, "step": 4026 }, { "epoch": 0.6153493524850059, "grad_norm": 0.29259753227233887, "learning_rate": 6.906981927795366e-05, "loss": 0.6555, "step": 4027 }, { "epoch": 0.6155021583833136, "grad_norm": 0.27849724888801575, "learning_rate": 6.9022356636425e-05, "loss": 0.6151, "step": 4028 }, { "epoch": 0.6156549642816213, "grad_norm": 0.36362895369529724, "learning_rate": 6.897490171271614e-05, "loss": 0.6835, "step": 4029 }, { "epoch": 0.6158077701799289, "grad_norm": 0.3303452432155609, "learning_rate": 6.892745451865008e-05, "loss": 0.7131, "step": 4030 }, { "epoch": 0.6159605760782366, "grad_norm": 0.3015969693660736, "learning_rate": 6.888001506604794e-05, "loss": 0.7166, "step": 4031 }, { "epoch": 0.6161133819765443, "grad_norm": 0.33131933212280273, "learning_rate": 6.883258336672879e-05, "loss": 0.8487, "step": 4032 }, { "epoch": 0.616266187874852, "grad_norm": 0.2961571216583252, "learning_rate": 6.878515943250985e-05, "loss": 0.5713, "step": 4033 }, { "epoch": 0.6164189937731597, "grad_norm": 0.28777509927749634, "learning_rate": 6.873774327520644e-05, "loss": 0.702, "step": 4034 }, { "epoch": 0.6165717996714674, "grad_norm": 0.29192861914634705, "learning_rate": 6.869033490663187e-05, "loss": 0.5561, "step": 4035 }, { "epoch": 0.616724605569775, "grad_norm": 0.31917914748191833, "learning_rate": 6.86429343385975e-05, "loss": 0.7583, "step": 4036 }, { "epoch": 0.6168774114680826, "grad_norm": 0.29778870940208435, "learning_rate": 6.859554158291285e-05, "loss": 0.6645, "step": 4037 }, { "epoch": 0.6170302173663903, "grad_norm": 0.2581726610660553, "learning_rate": 6.854815665138541e-05, "loss": 0.66, "step": 4038 }, { "epoch": 0.617183023264698, "grad_norm": 0.2948669493198395, "learning_rate": 6.850077955582072e-05, "loss": 0.6895, "step": 4039 }, { "epoch": 0.6173358291630057, "grad_norm": 0.2632709741592407, "learning_rate": 6.845341030802236e-05, "loss": 0.4614, "step": 4040 }, { "epoch": 0.6174886350613134, "grad_norm": 0.29188868403434753, "learning_rate": 6.840604891979205e-05, "loss": 0.8561, "step": 4041 }, { "epoch": 0.617641440959621, "grad_norm": 0.26110169291496277, "learning_rate": 6.835869540292943e-05, "loss": 0.8258, "step": 4042 }, { "epoch": 0.6177942468579287, "grad_norm": 0.313664048910141, "learning_rate": 6.831134976923224e-05, "loss": 0.5098, "step": 4043 }, { "epoch": 0.6179470527562364, "grad_norm": 0.3582458198070526, "learning_rate": 6.826401203049624e-05, "loss": 0.7877, "step": 4044 }, { "epoch": 0.6180998586545441, "grad_norm": 0.335791677236557, "learning_rate": 6.821668219851529e-05, "loss": 0.8319, "step": 4045 }, { "epoch": 0.6182526645528518, "grad_norm": 0.31981077790260315, "learning_rate": 6.816936028508114e-05, "loss": 0.7662, "step": 4046 }, { "epoch": 0.6184054704511595, "grad_norm": 0.3540882170200348, "learning_rate": 6.812204630198369e-05, "loss": 0.7006, "step": 4047 }, { "epoch": 0.6185582763494671, "grad_norm": 0.33336111903190613, "learning_rate": 6.807474026101079e-05, "loss": 0.6572, "step": 4048 }, { "epoch": 0.6187110822477747, "grad_norm": 0.34919145703315735, "learning_rate": 6.80274421739484e-05, "loss": 0.8531, "step": 4049 }, { "epoch": 0.6188638881460824, "grad_norm": 0.2679178714752197, "learning_rate": 6.798015205258039e-05, "loss": 0.6045, "step": 4050 }, { "epoch": 0.6190166940443901, "grad_norm": 0.35538893938064575, "learning_rate": 6.793286990868869e-05, "loss": 0.6461, "step": 4051 }, { "epoch": 0.6191694999426978, "grad_norm": 0.32986605167388916, "learning_rate": 6.788559575405333e-05, "loss": 0.8395, "step": 4052 }, { "epoch": 0.6193223058410054, "grad_norm": 0.3114609122276306, "learning_rate": 6.783832960045215e-05, "loss": 0.858, "step": 4053 }, { "epoch": 0.6194751117393131, "grad_norm": 0.28365322947502136, "learning_rate": 6.779107145966122e-05, "loss": 0.715, "step": 4054 }, { "epoch": 0.6196279176376208, "grad_norm": 0.3411410450935364, "learning_rate": 6.774382134345442e-05, "loss": 0.8184, "step": 4055 }, { "epoch": 0.6197807235359285, "grad_norm": 0.26076486706733704, "learning_rate": 6.769657926360382e-05, "loss": 0.7415, "step": 4056 }, { "epoch": 0.6199335294342362, "grad_norm": 0.24735046923160553, "learning_rate": 6.764934523187931e-05, "loss": 0.5559, "step": 4057 }, { "epoch": 0.6200863353325439, "grad_norm": 0.3628125488758087, "learning_rate": 6.760211926004889e-05, "loss": 0.6857, "step": 4058 }, { "epoch": 0.6202391412308516, "grad_norm": 0.4325239360332489, "learning_rate": 6.75549013598785e-05, "loss": 0.8027, "step": 4059 }, { "epoch": 0.6203919471291592, "grad_norm": 0.3012774586677551, "learning_rate": 6.750769154313206e-05, "loss": 0.7281, "step": 4060 }, { "epoch": 0.6205447530274668, "grad_norm": 0.2891152501106262, "learning_rate": 6.746048982157154e-05, "loss": 0.9022, "step": 4061 }, { "epoch": 0.6206975589257745, "grad_norm": 0.40588390827178955, "learning_rate": 6.74132962069568e-05, "loss": 0.8497, "step": 4062 }, { "epoch": 0.6208503648240822, "grad_norm": 0.26198041439056396, "learning_rate": 6.736611071104583e-05, "loss": 0.705, "step": 4063 }, { "epoch": 0.6210031707223899, "grad_norm": 0.29433444142341614, "learning_rate": 6.731893334559441e-05, "loss": 0.7995, "step": 4064 }, { "epoch": 0.6211559766206975, "grad_norm": 0.32308998703956604, "learning_rate": 6.727176412235641e-05, "loss": 0.7435, "step": 4065 }, { "epoch": 0.6213087825190052, "grad_norm": 0.32107511162757874, "learning_rate": 6.722460305308369e-05, "loss": 0.6198, "step": 4066 }, { "epoch": 0.6214615884173129, "grad_norm": 1.0581830739974976, "learning_rate": 6.717745014952594e-05, "loss": 0.6794, "step": 4067 }, { "epoch": 0.6216143943156206, "grad_norm": 0.45635101199150085, "learning_rate": 6.713030542343097e-05, "loss": 0.8428, "step": 4068 }, { "epoch": 0.6217672002139283, "grad_norm": 0.2648105323314667, "learning_rate": 6.70831688865445e-05, "loss": 0.7215, "step": 4069 }, { "epoch": 0.621920006112236, "grad_norm": 0.32674112915992737, "learning_rate": 6.703604055061022e-05, "loss": 0.6808, "step": 4070 }, { "epoch": 0.6220728120105437, "grad_norm": 0.2716309428215027, "learning_rate": 6.698892042736969e-05, "loss": 0.6815, "step": 4071 }, { "epoch": 0.6222256179088513, "grad_norm": 0.2864760160446167, "learning_rate": 6.694180852856254e-05, "loss": 0.6477, "step": 4072 }, { "epoch": 0.6223784238071589, "grad_norm": 0.30836066603660583, "learning_rate": 6.68947048659263e-05, "loss": 0.8132, "step": 4073 }, { "epoch": 0.6225312297054666, "grad_norm": 0.3510795533657074, "learning_rate": 6.684760945119645e-05, "loss": 0.8555, "step": 4074 }, { "epoch": 0.6226840356037743, "grad_norm": 0.3028537929058075, "learning_rate": 6.68005222961064e-05, "loss": 0.647, "step": 4075 }, { "epoch": 0.622836841502082, "grad_norm": 0.2671966254711151, "learning_rate": 6.675344341238757e-05, "loss": 0.6059, "step": 4076 }, { "epoch": 0.6229896474003896, "grad_norm": 0.39240017533302307, "learning_rate": 6.670637281176923e-05, "loss": 0.8019, "step": 4077 }, { "epoch": 0.6231424532986973, "grad_norm": 0.3353138267993927, "learning_rate": 6.66593105059786e-05, "loss": 0.7382, "step": 4078 }, { "epoch": 0.623295259197005, "grad_norm": 0.32272979617118835, "learning_rate": 6.661225650674089e-05, "loss": 0.7867, "step": 4079 }, { "epoch": 0.6234480650953127, "grad_norm": 0.3123871684074402, "learning_rate": 6.656521082577925e-05, "loss": 0.7258, "step": 4080 }, { "epoch": 0.6236008709936204, "grad_norm": 0.2753034830093384, "learning_rate": 6.651817347481462e-05, "loss": 0.5895, "step": 4081 }, { "epoch": 0.6237536768919281, "grad_norm": 0.28939950466156006, "learning_rate": 6.647114446556601e-05, "loss": 0.5629, "step": 4082 }, { "epoch": 0.6239064827902358, "grad_norm": 0.29780569672584534, "learning_rate": 6.642412380975033e-05, "loss": 0.8147, "step": 4083 }, { "epoch": 0.6240592886885433, "grad_norm": 0.3016960024833679, "learning_rate": 6.637711151908239e-05, "loss": 0.8671, "step": 4084 }, { "epoch": 0.624212094586851, "grad_norm": 0.33941328525543213, "learning_rate": 6.633010760527485e-05, "loss": 0.6496, "step": 4085 }, { "epoch": 0.6243649004851587, "grad_norm": 0.2641719877719879, "learning_rate": 6.628311208003834e-05, "loss": 0.639, "step": 4086 }, { "epoch": 0.6245177063834664, "grad_norm": 0.2600893974304199, "learning_rate": 6.623612495508146e-05, "loss": 0.6703, "step": 4087 }, { "epoch": 0.6246705122817741, "grad_norm": 0.29402458667755127, "learning_rate": 6.618914624211064e-05, "loss": 0.5691, "step": 4088 }, { "epoch": 0.6248233181800817, "grad_norm": 0.3072538673877716, "learning_rate": 6.614217595283019e-05, "loss": 0.758, "step": 4089 }, { "epoch": 0.6249761240783894, "grad_norm": 0.26396307349205017, "learning_rate": 6.609521409894237e-05, "loss": 0.7844, "step": 4090 }, { "epoch": 0.6251289299766971, "grad_norm": 0.3354957699775696, "learning_rate": 6.60482606921474e-05, "loss": 0.7251, "step": 4091 }, { "epoch": 0.6252817358750048, "grad_norm": 0.2975131869316101, "learning_rate": 6.600131574414325e-05, "loss": 0.7848, "step": 4092 }, { "epoch": 0.6254345417733125, "grad_norm": 0.3199508488178253, "learning_rate": 6.59543792666259e-05, "loss": 0.7637, "step": 4093 }, { "epoch": 0.6255873476716202, "grad_norm": 0.2629062533378601, "learning_rate": 6.590745127128914e-05, "loss": 0.5365, "step": 4094 }, { "epoch": 0.6257401535699278, "grad_norm": 0.3765850067138672, "learning_rate": 6.586053176982476e-05, "loss": 0.6494, "step": 4095 }, { "epoch": 0.6258929594682354, "grad_norm": 0.28707364201545715, "learning_rate": 6.58136207739223e-05, "loss": 0.7073, "step": 4096 }, { "epoch": 0.6260457653665431, "grad_norm": 0.339830607175827, "learning_rate": 6.576671829526923e-05, "loss": 0.6416, "step": 4097 }, { "epoch": 0.6261985712648508, "grad_norm": 0.28438472747802734, "learning_rate": 6.5719824345551e-05, "loss": 0.6059, "step": 4098 }, { "epoch": 0.6263513771631585, "grad_norm": 0.2813330590724945, "learning_rate": 6.56729389364507e-05, "loss": 0.7748, "step": 4099 }, { "epoch": 0.6265041830614662, "grad_norm": 0.27409806847572327, "learning_rate": 6.562606207964954e-05, "loss": 0.736, "step": 4100 }, { "epoch": 0.6266569889597738, "grad_norm": 0.28923875093460083, "learning_rate": 6.557919378682646e-05, "loss": 0.8389, "step": 4101 }, { "epoch": 0.6268097948580815, "grad_norm": 0.2727113366127014, "learning_rate": 6.553233406965835e-05, "loss": 0.6921, "step": 4102 }, { "epoch": 0.6269626007563892, "grad_norm": 0.2747400104999542, "learning_rate": 6.548548293981985e-05, "loss": 0.7585, "step": 4103 }, { "epoch": 0.6271154066546969, "grad_norm": 0.2889161705970764, "learning_rate": 6.543864040898355e-05, "loss": 0.6042, "step": 4104 }, { "epoch": 0.6272682125530046, "grad_norm": 0.28560250997543335, "learning_rate": 6.539180648881991e-05, "loss": 0.7497, "step": 4105 }, { "epoch": 0.6274210184513123, "grad_norm": 0.43822401762008667, "learning_rate": 6.534498119099712e-05, "loss": 0.7462, "step": 4106 }, { "epoch": 0.62757382434962, "grad_norm": 0.40729212760925293, "learning_rate": 6.529816452718139e-05, "loss": 0.8604, "step": 4107 }, { "epoch": 0.6277266302479275, "grad_norm": 0.2853194773197174, "learning_rate": 6.525135650903666e-05, "loss": 0.6286, "step": 4108 }, { "epoch": 0.6278794361462352, "grad_norm": 0.3668119013309479, "learning_rate": 6.520455714822481e-05, "loss": 0.5869, "step": 4109 }, { "epoch": 0.6280322420445429, "grad_norm": 0.34195056557655334, "learning_rate": 6.515776645640541e-05, "loss": 0.8816, "step": 4110 }, { "epoch": 0.6281850479428506, "grad_norm": 0.34236249327659607, "learning_rate": 6.511098444523604e-05, "loss": 0.8364, "step": 4111 }, { "epoch": 0.6283378538411583, "grad_norm": 0.27742037177085876, "learning_rate": 6.506421112637207e-05, "loss": 0.5767, "step": 4112 }, { "epoch": 0.6284906597394659, "grad_norm": 0.300037145614624, "learning_rate": 6.50174465114666e-05, "loss": 0.8639, "step": 4113 }, { "epoch": 0.6286434656377736, "grad_norm": 0.32458987832069397, "learning_rate": 6.497069061217065e-05, "loss": 0.672, "step": 4114 }, { "epoch": 0.6287962715360813, "grad_norm": 0.3458729386329651, "learning_rate": 6.492394344013313e-05, "loss": 0.6846, "step": 4115 }, { "epoch": 0.628949077434389, "grad_norm": 0.21836614608764648, "learning_rate": 6.487720500700067e-05, "loss": 0.7049, "step": 4116 }, { "epoch": 0.6291018833326967, "grad_norm": 0.2723524868488312, "learning_rate": 6.483047532441773e-05, "loss": 0.6425, "step": 4117 }, { "epoch": 0.6292546892310044, "grad_norm": 0.2965027093887329, "learning_rate": 6.478375440402664e-05, "loss": 0.7011, "step": 4118 }, { "epoch": 0.629407495129312, "grad_norm": 0.2943280041217804, "learning_rate": 6.473704225746755e-05, "loss": 0.5876, "step": 4119 }, { "epoch": 0.6295603010276196, "grad_norm": 0.3261384963989258, "learning_rate": 6.469033889637837e-05, "loss": 0.8015, "step": 4120 }, { "epoch": 0.6297131069259273, "grad_norm": 0.4228057265281677, "learning_rate": 6.464364433239484e-05, "loss": 0.7216, "step": 4121 }, { "epoch": 0.629865912824235, "grad_norm": 0.49428024888038635, "learning_rate": 6.459695857715053e-05, "loss": 0.7154, "step": 4122 }, { "epoch": 0.6300187187225427, "grad_norm": 0.45371702313423157, "learning_rate": 6.455028164227685e-05, "loss": 0.6947, "step": 4123 }, { "epoch": 0.6301715246208504, "grad_norm": 0.3593444526195526, "learning_rate": 6.45036135394029e-05, "loss": 0.709, "step": 4124 }, { "epoch": 0.630324330519158, "grad_norm": 0.38711681962013245, "learning_rate": 6.445695428015566e-05, "loss": 0.6442, "step": 4125 }, { "epoch": 0.6304771364174657, "grad_norm": 0.2977801561355591, "learning_rate": 6.44103038761599e-05, "loss": 0.7507, "step": 4126 }, { "epoch": 0.6306299423157734, "grad_norm": 0.25699782371520996, "learning_rate": 6.436366233903822e-05, "loss": 0.6813, "step": 4127 }, { "epoch": 0.6307827482140811, "grad_norm": 0.251458078622818, "learning_rate": 6.431702968041091e-05, "loss": 0.8123, "step": 4128 }, { "epoch": 0.6309355541123888, "grad_norm": 0.3088221251964569, "learning_rate": 6.427040591189609e-05, "loss": 0.9976, "step": 4129 }, { "epoch": 0.6310883600106965, "grad_norm": 0.35455629229545593, "learning_rate": 6.422379104510976e-05, "loss": 0.8277, "step": 4130 }, { "epoch": 0.6312411659090041, "grad_norm": 0.2564350366592407, "learning_rate": 6.417718509166557e-05, "loss": 0.5566, "step": 4131 }, { "epoch": 0.6313939718073117, "grad_norm": 0.3636449873447418, "learning_rate": 6.413058806317496e-05, "loss": 0.7471, "step": 4132 }, { "epoch": 0.6315467777056194, "grad_norm": 0.25471046566963196, "learning_rate": 6.408399997124728e-05, "loss": 0.6974, "step": 4133 }, { "epoch": 0.6316995836039271, "grad_norm": 0.2742546796798706, "learning_rate": 6.403742082748954e-05, "loss": 0.5548, "step": 4134 }, { "epoch": 0.6318523895022348, "grad_norm": 0.29743149876594543, "learning_rate": 6.399085064350648e-05, "loss": 0.7215, "step": 4135 }, { "epoch": 0.6320051954005425, "grad_norm": 0.34070295095443726, "learning_rate": 6.394428943090071e-05, "loss": 0.8442, "step": 4136 }, { "epoch": 0.6321580012988501, "grad_norm": 0.3170759975910187, "learning_rate": 6.389773720127262e-05, "loss": 0.5968, "step": 4137 }, { "epoch": 0.6323108071971578, "grad_norm": 0.3096469044685364, "learning_rate": 6.385119396622021e-05, "loss": 0.8517, "step": 4138 }, { "epoch": 0.6324636130954655, "grad_norm": 0.3050990104675293, "learning_rate": 6.38046597373394e-05, "loss": 0.5937, "step": 4139 }, { "epoch": 0.6326164189937732, "grad_norm": 0.30861058831214905, "learning_rate": 6.375813452622375e-05, "loss": 0.6394, "step": 4140 }, { "epoch": 0.6327692248920809, "grad_norm": 0.270451158285141, "learning_rate": 6.37116183444647e-05, "loss": 0.719, "step": 4141 }, { "epoch": 0.6329220307903886, "grad_norm": 0.29544568061828613, "learning_rate": 6.366511120365132e-05, "loss": 0.759, "step": 4142 }, { "epoch": 0.6330748366886961, "grad_norm": 0.34251758456230164, "learning_rate": 6.361861311537046e-05, "loss": 0.6881, "step": 4143 }, { "epoch": 0.6332276425870038, "grad_norm": 0.36020627617836, "learning_rate": 6.357212409120679e-05, "loss": 0.8744, "step": 4144 }, { "epoch": 0.6333804484853115, "grad_norm": 0.3259471654891968, "learning_rate": 6.352564414274256e-05, "loss": 0.5879, "step": 4145 }, { "epoch": 0.6335332543836192, "grad_norm": 0.2928166687488556, "learning_rate": 6.347917328155795e-05, "loss": 0.7869, "step": 4146 }, { "epoch": 0.6336860602819269, "grad_norm": 0.316599577665329, "learning_rate": 6.343271151923074e-05, "loss": 0.8952, "step": 4147 }, { "epoch": 0.6338388661802346, "grad_norm": 0.30253708362579346, "learning_rate": 6.338625886733654e-05, "loss": 0.6866, "step": 4148 }, { "epoch": 0.6339916720785422, "grad_norm": 0.29290080070495605, "learning_rate": 6.333981533744856e-05, "loss": 0.681, "step": 4149 }, { "epoch": 0.6341444779768499, "grad_norm": 0.26810938119888306, "learning_rate": 6.329338094113785e-05, "loss": 0.4452, "step": 4150 }, { "epoch": 0.6342972838751576, "grad_norm": 0.3918895423412323, "learning_rate": 6.324695568997319e-05, "loss": 0.6125, "step": 4151 }, { "epoch": 0.6344500897734653, "grad_norm": 0.2929452657699585, "learning_rate": 6.320053959552095e-05, "loss": 0.5832, "step": 4152 }, { "epoch": 0.634602895671773, "grad_norm": 0.31379982829093933, "learning_rate": 6.31541326693454e-05, "loss": 0.5962, "step": 4153 }, { "epoch": 0.6347557015700807, "grad_norm": 0.8252871036529541, "learning_rate": 6.310773492300839e-05, "loss": 0.6811, "step": 4154 }, { "epoch": 0.6349085074683882, "grad_norm": 0.2837304472923279, "learning_rate": 6.306134636806957e-05, "loss": 0.5664, "step": 4155 }, { "epoch": 0.6350613133666959, "grad_norm": 0.32201525568962097, "learning_rate": 6.30149670160862e-05, "loss": 0.7099, "step": 4156 }, { "epoch": 0.6352141192650036, "grad_norm": 0.30925363302230835, "learning_rate": 6.296859687861335e-05, "loss": 0.6987, "step": 4157 }, { "epoch": 0.6353669251633113, "grad_norm": 0.3547913134098053, "learning_rate": 6.292223596720371e-05, "loss": 0.6015, "step": 4158 }, { "epoch": 0.635519731061619, "grad_norm": 0.28169745206832886, "learning_rate": 6.287588429340781e-05, "loss": 0.5393, "step": 4159 }, { "epoch": 0.6356725369599266, "grad_norm": 0.2913646996021271, "learning_rate": 6.282954186877364e-05, "loss": 0.6671, "step": 4160 }, { "epoch": 0.6358253428582343, "grad_norm": 0.38874661922454834, "learning_rate": 6.27832087048471e-05, "loss": 0.7775, "step": 4161 }, { "epoch": 0.635978148756542, "grad_norm": 0.26316070556640625, "learning_rate": 6.273688481317175e-05, "loss": 0.6152, "step": 4162 }, { "epoch": 0.6361309546548497, "grad_norm": 0.398821622133255, "learning_rate": 6.269057020528872e-05, "loss": 0.5058, "step": 4163 }, { "epoch": 0.6362837605531574, "grad_norm": 0.3221498727798462, "learning_rate": 6.264426489273694e-05, "loss": 0.6687, "step": 4164 }, { "epoch": 0.6364365664514651, "grad_norm": 0.27947044372558594, "learning_rate": 6.259796888705298e-05, "loss": 0.673, "step": 4165 }, { "epoch": 0.6365893723497728, "grad_norm": 0.27743926644325256, "learning_rate": 6.255168219977114e-05, "loss": 0.7665, "step": 4166 }, { "epoch": 0.6367421782480803, "grad_norm": 0.24967680871486664, "learning_rate": 6.250540484242331e-05, "loss": 0.584, "step": 4167 }, { "epoch": 0.636894984146388, "grad_norm": 0.2937239408493042, "learning_rate": 6.245913682653912e-05, "loss": 0.6989, "step": 4168 }, { "epoch": 0.6370477900446957, "grad_norm": 0.7510557770729065, "learning_rate": 6.24128781636459e-05, "loss": 0.5784, "step": 4169 }, { "epoch": 0.6372005959430034, "grad_norm": 0.2786187529563904, "learning_rate": 6.236662886526854e-05, "loss": 0.6723, "step": 4170 }, { "epoch": 0.6373534018413111, "grad_norm": 0.2596394419670105, "learning_rate": 6.232038894292966e-05, "loss": 0.7527, "step": 4171 }, { "epoch": 0.6375062077396187, "grad_norm": 0.3109414577484131, "learning_rate": 6.227415840814963e-05, "loss": 0.6461, "step": 4172 }, { "epoch": 0.6376590136379264, "grad_norm": 0.314042866230011, "learning_rate": 6.222793727244635e-05, "loss": 0.6459, "step": 4173 }, { "epoch": 0.6378118195362341, "grad_norm": 0.2707376480102539, "learning_rate": 6.218172554733543e-05, "loss": 0.5276, "step": 4174 }, { "epoch": 0.6379646254345418, "grad_norm": 0.3110902011394501, "learning_rate": 6.21355232443301e-05, "loss": 0.7231, "step": 4175 }, { "epoch": 0.6381174313328495, "grad_norm": 0.29810798168182373, "learning_rate": 6.208933037494136e-05, "loss": 0.6088, "step": 4176 }, { "epoch": 0.6382702372311572, "grad_norm": 0.32729408144950867, "learning_rate": 6.20431469506777e-05, "loss": 0.6172, "step": 4177 }, { "epoch": 0.6384230431294649, "grad_norm": 0.3452955484390259, "learning_rate": 6.199697298304534e-05, "loss": 0.7917, "step": 4178 }, { "epoch": 0.6385758490277724, "grad_norm": 0.28180643916130066, "learning_rate": 6.195080848354818e-05, "loss": 0.7181, "step": 4179 }, { "epoch": 0.6387286549260801, "grad_norm": 0.3455478250980377, "learning_rate": 6.19046534636877e-05, "loss": 0.653, "step": 4180 }, { "epoch": 0.6388814608243878, "grad_norm": 0.28653568029403687, "learning_rate": 6.185850793496301e-05, "loss": 0.6431, "step": 4181 }, { "epoch": 0.6390342667226955, "grad_norm": 0.268694132566452, "learning_rate": 6.181237190887088e-05, "loss": 0.6316, "step": 4182 }, { "epoch": 0.6391870726210032, "grad_norm": 0.2896341383457184, "learning_rate": 6.176624539690579e-05, "loss": 0.537, "step": 4183 }, { "epoch": 0.6393398785193108, "grad_norm": 0.2786364257335663, "learning_rate": 6.172012841055968e-05, "loss": 0.7144, "step": 4184 }, { "epoch": 0.6394926844176185, "grad_norm": 0.33958667516708374, "learning_rate": 6.167402096132224e-05, "loss": 0.7105, "step": 4185 }, { "epoch": 0.6396454903159262, "grad_norm": 0.27773991227149963, "learning_rate": 6.162792306068075e-05, "loss": 0.7349, "step": 4186 }, { "epoch": 0.6397982962142339, "grad_norm": 0.3180773854255676, "learning_rate": 6.158183472012015e-05, "loss": 0.614, "step": 4187 }, { "epoch": 0.6399511021125416, "grad_norm": 0.2762540578842163, "learning_rate": 6.153575595112295e-05, "loss": 0.5515, "step": 4188 }, { "epoch": 0.6401039080108493, "grad_norm": 0.28452420234680176, "learning_rate": 6.148968676516925e-05, "loss": 0.7795, "step": 4189 }, { "epoch": 0.6402567139091568, "grad_norm": 0.2750689387321472, "learning_rate": 6.144362717373686e-05, "loss": 0.7882, "step": 4190 }, { "epoch": 0.6404095198074645, "grad_norm": 0.2844794988632202, "learning_rate": 6.139757718830106e-05, "loss": 0.6313, "step": 4191 }, { "epoch": 0.6405623257057722, "grad_norm": 0.2462836503982544, "learning_rate": 6.135153682033489e-05, "loss": 0.4304, "step": 4192 }, { "epoch": 0.6407151316040799, "grad_norm": 0.45701074600219727, "learning_rate": 6.130550608130887e-05, "loss": 0.7714, "step": 4193 }, { "epoch": 0.6408679375023876, "grad_norm": 0.270158976316452, "learning_rate": 6.125948498269126e-05, "loss": 0.7841, "step": 4194 }, { "epoch": 0.6410207434006953, "grad_norm": 0.30690333247184753, "learning_rate": 6.12134735359477e-05, "loss": 0.5731, "step": 4195 }, { "epoch": 0.6411735492990029, "grad_norm": 0.3889475166797638, "learning_rate": 6.116747175254167e-05, "loss": 0.5577, "step": 4196 }, { "epoch": 0.6413263551973106, "grad_norm": 0.2712765336036682, "learning_rate": 6.112147964393405e-05, "loss": 0.6571, "step": 4197 }, { "epoch": 0.6414791610956183, "grad_norm": 0.3843899667263031, "learning_rate": 6.107549722158347e-05, "loss": 0.6538, "step": 4198 }, { "epoch": 0.641631966993926, "grad_norm": 0.24763554334640503, "learning_rate": 6.102952449694599e-05, "loss": 0.5702, "step": 4199 }, { "epoch": 0.6417847728922337, "grad_norm": 0.2887122929096222, "learning_rate": 6.098356148147535e-05, "loss": 0.8121, "step": 4200 }, { "epoch": 0.6419375787905414, "grad_norm": 0.3069363534450531, "learning_rate": 6.0937608186622865e-05, "loss": 0.6811, "step": 4201 }, { "epoch": 0.6420903846888489, "grad_norm": 0.28866079449653625, "learning_rate": 6.0891664623837374e-05, "loss": 0.7553, "step": 4202 }, { "epoch": 0.6422431905871566, "grad_norm": 0.299434632062912, "learning_rate": 6.084573080456537e-05, "loss": 0.664, "step": 4203 }, { "epoch": 0.6423959964854643, "grad_norm": 0.350629985332489, "learning_rate": 6.0799806740250854e-05, "loss": 0.6892, "step": 4204 }, { "epoch": 0.642548802383772, "grad_norm": 0.3066038489341736, "learning_rate": 6.075389244233549e-05, "loss": 0.7243, "step": 4205 }, { "epoch": 0.6427016082820797, "grad_norm": 0.2728354334831238, "learning_rate": 6.0707987922258316e-05, "loss": 0.635, "step": 4206 }, { "epoch": 0.6428544141803874, "grad_norm": 0.2741679549217224, "learning_rate": 6.066209319145615e-05, "loss": 0.7023, "step": 4207 }, { "epoch": 0.643007220078695, "grad_norm": 0.30276694893836975, "learning_rate": 6.061620826136327e-05, "loss": 0.6974, "step": 4208 }, { "epoch": 0.6431600259770027, "grad_norm": 0.28418371081352234, "learning_rate": 6.0570333143411476e-05, "loss": 0.5183, "step": 4209 }, { "epoch": 0.6433128318753104, "grad_norm": 0.26944833993911743, "learning_rate": 6.0524467849030206e-05, "loss": 0.6816, "step": 4210 }, { "epoch": 0.6434656377736181, "grad_norm": 0.25730451941490173, "learning_rate": 6.0478612389646404e-05, "loss": 0.732, "step": 4211 }, { "epoch": 0.6436184436719258, "grad_norm": 0.2732875347137451, "learning_rate": 6.043276677668459e-05, "loss": 0.5747, "step": 4212 }, { "epoch": 0.6437712495702335, "grad_norm": 0.2730986773967743, "learning_rate": 6.038693102156676e-05, "loss": 0.63, "step": 4213 }, { "epoch": 0.643924055468541, "grad_norm": 0.5691524744033813, "learning_rate": 6.034110513571257e-05, "loss": 0.7707, "step": 4214 }, { "epoch": 0.6440768613668487, "grad_norm": 0.2587032616138458, "learning_rate": 6.029528913053914e-05, "loss": 0.6522, "step": 4215 }, { "epoch": 0.6442296672651564, "grad_norm": 0.46911007165908813, "learning_rate": 6.0249483017461117e-05, "loss": 0.6487, "step": 4216 }, { "epoch": 0.6443824731634641, "grad_norm": 0.39704567193984985, "learning_rate": 6.0203686807890704e-05, "loss": 0.5755, "step": 4217 }, { "epoch": 0.6445352790617718, "grad_norm": 0.4068554639816284, "learning_rate": 6.015790051323769e-05, "loss": 0.6695, "step": 4218 }, { "epoch": 0.6446880849600795, "grad_norm": 0.354889839887619, "learning_rate": 6.0112124144909335e-05, "loss": 0.7831, "step": 4219 }, { "epoch": 0.6448408908583871, "grad_norm": 0.2730399966239929, "learning_rate": 6.006635771431039e-05, "loss": 0.8288, "step": 4220 }, { "epoch": 0.6449936967566948, "grad_norm": 0.30042764544487, "learning_rate": 6.002060123284321e-05, "loss": 0.7643, "step": 4221 }, { "epoch": 0.6451465026550025, "grad_norm": 0.3824443519115448, "learning_rate": 5.9974854711907646e-05, "loss": 0.7536, "step": 4222 }, { "epoch": 0.6452993085533102, "grad_norm": 0.2480231523513794, "learning_rate": 5.9929118162901056e-05, "loss": 0.847, "step": 4223 }, { "epoch": 0.6454521144516179, "grad_norm": 0.4015941023826599, "learning_rate": 5.988339159721828e-05, "loss": 0.7163, "step": 4224 }, { "epoch": 0.6456049203499256, "grad_norm": 0.3213943839073181, "learning_rate": 5.983767502625176e-05, "loss": 0.7023, "step": 4225 }, { "epoch": 0.6457577262482331, "grad_norm": 0.3295423090457916, "learning_rate": 5.979196846139139e-05, "loss": 0.8445, "step": 4226 }, { "epoch": 0.6459105321465408, "grad_norm": 0.3309463858604431, "learning_rate": 5.9746271914024554e-05, "loss": 0.8821, "step": 4227 }, { "epoch": 0.6460633380448485, "grad_norm": 0.28392040729522705, "learning_rate": 5.970058539553614e-05, "loss": 0.5415, "step": 4228 }, { "epoch": 0.6462161439431562, "grad_norm": 0.2704792022705078, "learning_rate": 5.965490891730863e-05, "loss": 0.8293, "step": 4229 }, { "epoch": 0.6463689498414639, "grad_norm": 0.30566468834877014, "learning_rate": 5.9609242490721884e-05, "loss": 0.5895, "step": 4230 }, { "epoch": 0.6465217557397716, "grad_norm": 0.4203466475009918, "learning_rate": 5.9563586127153315e-05, "loss": 0.7199, "step": 4231 }, { "epoch": 0.6466745616380792, "grad_norm": 0.3807709813117981, "learning_rate": 5.951793983797782e-05, "loss": 0.6698, "step": 4232 }, { "epoch": 0.6468273675363869, "grad_norm": 0.37328287959098816, "learning_rate": 5.9472303634567836e-05, "loss": 0.6147, "step": 4233 }, { "epoch": 0.6469801734346946, "grad_norm": 0.33843472599983215, "learning_rate": 5.942667752829317e-05, "loss": 0.6556, "step": 4234 }, { "epoch": 0.6471329793330023, "grad_norm": 0.34154462814331055, "learning_rate": 5.938106153052123e-05, "loss": 0.8309, "step": 4235 }, { "epoch": 0.64728578523131, "grad_norm": 0.27381810545921326, "learning_rate": 5.933545565261682e-05, "loss": 0.8016, "step": 4236 }, { "epoch": 0.6474385911296177, "grad_norm": 0.2713511884212494, "learning_rate": 5.928985990594231e-05, "loss": 0.5769, "step": 4237 }, { "epoch": 0.6475913970279252, "grad_norm": 0.3166002333164215, "learning_rate": 5.9244274301857484e-05, "loss": 0.9896, "step": 4238 }, { "epoch": 0.6477442029262329, "grad_norm": 0.3390193581581116, "learning_rate": 5.919869885171956e-05, "loss": 0.707, "step": 4239 }, { "epoch": 0.6478970088245406, "grad_norm": 0.4529277980327606, "learning_rate": 5.915313356688339e-05, "loss": 0.7401, "step": 4240 }, { "epoch": 0.6480498147228483, "grad_norm": 0.37623921036720276, "learning_rate": 5.910757845870105e-05, "loss": 0.6358, "step": 4241 }, { "epoch": 0.648202620621156, "grad_norm": 0.29530203342437744, "learning_rate": 5.9062033538522286e-05, "loss": 0.7905, "step": 4242 }, { "epoch": 0.6483554265194637, "grad_norm": 0.2699858248233795, "learning_rate": 5.901649881769422e-05, "loss": 0.6672, "step": 4243 }, { "epoch": 0.6485082324177713, "grad_norm": 0.24652545154094696, "learning_rate": 5.8970974307561475e-05, "loss": 0.6473, "step": 4244 }, { "epoch": 0.648661038316079, "grad_norm": 0.26167502999305725, "learning_rate": 5.892546001946606e-05, "loss": 0.5892, "step": 4245 }, { "epoch": 0.6488138442143867, "grad_norm": 0.3461175560951233, "learning_rate": 5.887995596474749e-05, "loss": 0.6392, "step": 4246 }, { "epoch": 0.6489666501126944, "grad_norm": 0.2689460515975952, "learning_rate": 5.8834462154742745e-05, "loss": 0.6877, "step": 4247 }, { "epoch": 0.6491194560110021, "grad_norm": 0.3303474485874176, "learning_rate": 5.878897860078616e-05, "loss": 0.6899, "step": 4248 }, { "epoch": 0.6492722619093096, "grad_norm": 0.29973793029785156, "learning_rate": 5.8743505314209634e-05, "loss": 0.8927, "step": 4249 }, { "epoch": 0.6494250678076173, "grad_norm": 0.30865025520324707, "learning_rate": 5.8698042306342416e-05, "loss": 0.6779, "step": 4250 }, { "epoch": 0.649577873705925, "grad_norm": 0.3161505162715912, "learning_rate": 5.865258958851134e-05, "loss": 0.8618, "step": 4251 }, { "epoch": 0.6497306796042327, "grad_norm": 0.27236294746398926, "learning_rate": 5.860714717204041e-05, "loss": 0.5867, "step": 4252 }, { "epoch": 0.6498834855025404, "grad_norm": 0.2424437701702118, "learning_rate": 5.856171506825132e-05, "loss": 0.6115, "step": 4253 }, { "epoch": 0.6500362914008481, "grad_norm": 0.2960748076438904, "learning_rate": 5.851629328846311e-05, "loss": 0.7064, "step": 4254 }, { "epoch": 0.6501890972991557, "grad_norm": 0.31836503744125366, "learning_rate": 5.8470881843992185e-05, "loss": 0.6482, "step": 4255 }, { "epoch": 0.6503419031974634, "grad_norm": 0.24373292922973633, "learning_rate": 5.842548074615242e-05, "loss": 0.5645, "step": 4256 }, { "epoch": 0.6504947090957711, "grad_norm": 0.2876763343811035, "learning_rate": 5.838009000625515e-05, "loss": 0.7036, "step": 4257 }, { "epoch": 0.6506475149940788, "grad_norm": 0.27968302369117737, "learning_rate": 5.8334709635609106e-05, "loss": 0.8507, "step": 4258 }, { "epoch": 0.6508003208923865, "grad_norm": 0.33190199732780457, "learning_rate": 5.828933964552037e-05, "loss": 0.6497, "step": 4259 }, { "epoch": 0.6509531267906942, "grad_norm": 0.28241148591041565, "learning_rate": 5.8243980047292545e-05, "loss": 0.6532, "step": 4260 }, { "epoch": 0.6511059326890017, "grad_norm": 0.30200818181037903, "learning_rate": 5.819863085222665e-05, "loss": 0.715, "step": 4261 }, { "epoch": 0.6512587385873094, "grad_norm": 0.31453654170036316, "learning_rate": 5.81532920716209e-05, "loss": 0.6918, "step": 4262 }, { "epoch": 0.6514115444856171, "grad_norm": 0.31839510798454285, "learning_rate": 5.810796371677117e-05, "loss": 0.7786, "step": 4263 }, { "epoch": 0.6515643503839248, "grad_norm": 0.28044262528419495, "learning_rate": 5.806264579897063e-05, "loss": 0.7164, "step": 4264 }, { "epoch": 0.6517171562822325, "grad_norm": 0.31478336453437805, "learning_rate": 5.8017338329509926e-05, "loss": 0.6987, "step": 4265 }, { "epoch": 0.6518699621805402, "grad_norm": 0.5099149346351624, "learning_rate": 5.797204131967691e-05, "loss": 0.6539, "step": 4266 }, { "epoch": 0.6520227680788478, "grad_norm": 0.3031832277774811, "learning_rate": 5.792675478075697e-05, "loss": 0.7614, "step": 4267 }, { "epoch": 0.6521755739771555, "grad_norm": 0.2523060142993927, "learning_rate": 5.788147872403293e-05, "loss": 0.6402, "step": 4268 }, { "epoch": 0.6523283798754632, "grad_norm": 0.31935545802116394, "learning_rate": 5.783621316078495e-05, "loss": 0.7183, "step": 4269 }, { "epoch": 0.6524811857737709, "grad_norm": 0.27997279167175293, "learning_rate": 5.779095810229052e-05, "loss": 0.6922, "step": 4270 }, { "epoch": 0.6526339916720786, "grad_norm": 0.3088814318180084, "learning_rate": 5.774571355982452e-05, "loss": 0.6417, "step": 4271 }, { "epoch": 0.6527867975703863, "grad_norm": 0.35518980026245117, "learning_rate": 5.7700479544659346e-05, "loss": 0.8312, "step": 4272 }, { "epoch": 0.6529396034686938, "grad_norm": 0.32195112109184265, "learning_rate": 5.7655256068064576e-05, "loss": 0.6058, "step": 4273 }, { "epoch": 0.6530924093670015, "grad_norm": 0.27744877338409424, "learning_rate": 5.7610043141307345e-05, "loss": 0.75, "step": 4274 }, { "epoch": 0.6532452152653092, "grad_norm": 0.34689977765083313, "learning_rate": 5.7564840775651994e-05, "loss": 0.6277, "step": 4275 }, { "epoch": 0.6533980211636169, "grad_norm": 0.3278833329677582, "learning_rate": 5.7519648982360395e-05, "loss": 0.7029, "step": 4276 }, { "epoch": 0.6535508270619246, "grad_norm": 0.3406006395816803, "learning_rate": 5.7474467772691606e-05, "loss": 0.577, "step": 4277 }, { "epoch": 0.6537036329602323, "grad_norm": 0.32342788577079773, "learning_rate": 5.7429297157902264e-05, "loss": 0.6111, "step": 4278 }, { "epoch": 0.65385643885854, "grad_norm": 0.27723947167396545, "learning_rate": 5.7384137149246175e-05, "loss": 0.7135, "step": 4279 }, { "epoch": 0.6540092447568476, "grad_norm": 0.2640012502670288, "learning_rate": 5.733898775797455e-05, "loss": 0.62, "step": 4280 }, { "epoch": 0.6541620506551553, "grad_norm": 0.2819145619869232, "learning_rate": 5.729384899533602e-05, "loss": 0.5432, "step": 4281 }, { "epoch": 0.654314856553463, "grad_norm": 0.36311617493629456, "learning_rate": 5.724872087257657e-05, "loss": 0.5153, "step": 4282 }, { "epoch": 0.6544676624517707, "grad_norm": 0.3131016790866852, "learning_rate": 5.7203603400939445e-05, "loss": 0.5944, "step": 4283 }, { "epoch": 0.6546204683500784, "grad_norm": 0.2309597134590149, "learning_rate": 5.715849659166525e-05, "loss": 0.6252, "step": 4284 }, { "epoch": 0.6547732742483859, "grad_norm": 0.44529998302459717, "learning_rate": 5.7113400455992e-05, "loss": 0.8177, "step": 4285 }, { "epoch": 0.6549260801466936, "grad_norm": 0.3021618127822876, "learning_rate": 5.706831500515507e-05, "loss": 0.7102, "step": 4286 }, { "epoch": 0.6550788860450013, "grad_norm": 0.30636557936668396, "learning_rate": 5.7023240250387075e-05, "loss": 0.8765, "step": 4287 }, { "epoch": 0.655231691943309, "grad_norm": 0.27565455436706543, "learning_rate": 5.697817620291799e-05, "loss": 0.6337, "step": 4288 }, { "epoch": 0.6553844978416167, "grad_norm": 0.30019816756248474, "learning_rate": 5.693312287397515e-05, "loss": 0.825, "step": 4289 }, { "epoch": 0.6555373037399244, "grad_norm": 0.33282437920570374, "learning_rate": 5.688808027478328e-05, "loss": 0.6767, "step": 4290 }, { "epoch": 0.655690109638232, "grad_norm": 0.31007322669029236, "learning_rate": 5.6843048416564314e-05, "loss": 0.8461, "step": 4291 }, { "epoch": 0.6558429155365397, "grad_norm": 0.37334969639778137, "learning_rate": 5.679802731053754e-05, "loss": 0.867, "step": 4292 }, { "epoch": 0.6559957214348474, "grad_norm": 0.3802035450935364, "learning_rate": 5.6753016967919633e-05, "loss": 0.7248, "step": 4293 }, { "epoch": 0.6561485273331551, "grad_norm": 0.29199114441871643, "learning_rate": 5.6708017399924485e-05, "loss": 0.7837, "step": 4294 }, { "epoch": 0.6563013332314628, "grad_norm": 0.27255427837371826, "learning_rate": 5.6663028617763415e-05, "loss": 0.6914, "step": 4295 }, { "epoch": 0.6564541391297705, "grad_norm": 0.28946343064308167, "learning_rate": 5.6618050632645e-05, "loss": 0.7849, "step": 4296 }, { "epoch": 0.656606945028078, "grad_norm": 0.28791841864585876, "learning_rate": 5.6573083455775136e-05, "loss": 0.6734, "step": 4297 }, { "epoch": 0.6567597509263857, "grad_norm": 0.3184029161930084, "learning_rate": 5.652812709835694e-05, "loss": 0.6667, "step": 4298 }, { "epoch": 0.6569125568246934, "grad_norm": 0.48238903284072876, "learning_rate": 5.648318157159096e-05, "loss": 0.7216, "step": 4299 }, { "epoch": 0.6570653627230011, "grad_norm": 0.3703603148460388, "learning_rate": 5.643824688667505e-05, "loss": 0.6124, "step": 4300 }, { "epoch": 0.6572181686213088, "grad_norm": 0.3254699110984802, "learning_rate": 5.639332305480426e-05, "loss": 0.7546, "step": 4301 }, { "epoch": 0.6573709745196165, "grad_norm": 0.2918962240219116, "learning_rate": 5.634841008717093e-05, "loss": 0.5583, "step": 4302 }, { "epoch": 0.6575237804179241, "grad_norm": 0.28097614645957947, "learning_rate": 5.630350799496482e-05, "loss": 0.6999, "step": 4303 }, { "epoch": 0.6576765863162318, "grad_norm": 0.30386725068092346, "learning_rate": 5.625861678937294e-05, "loss": 0.7967, "step": 4304 }, { "epoch": 0.6578293922145395, "grad_norm": 0.2628733515739441, "learning_rate": 5.62137364815795e-05, "loss": 0.785, "step": 4305 }, { "epoch": 0.6579821981128472, "grad_norm": 0.2997375726699829, "learning_rate": 5.616886708276603e-05, "loss": 0.6496, "step": 4306 }, { "epoch": 0.6581350040111549, "grad_norm": 0.37791678309440613, "learning_rate": 5.612400860411139e-05, "loss": 0.7869, "step": 4307 }, { "epoch": 0.6582878099094625, "grad_norm": 0.2886675298213959, "learning_rate": 5.607916105679174e-05, "loss": 0.671, "step": 4308 }, { "epoch": 0.6584406158077701, "grad_norm": 0.28003209829330444, "learning_rate": 5.6034324451980425e-05, "loss": 0.7855, "step": 4309 }, { "epoch": 0.6585934217060778, "grad_norm": 0.3257627487182617, "learning_rate": 5.5989498800848094e-05, "loss": 0.8834, "step": 4310 }, { "epoch": 0.6587462276043855, "grad_norm": 0.29753580689430237, "learning_rate": 5.594468411456273e-05, "loss": 0.7202, "step": 4311 }, { "epoch": 0.6588990335026932, "grad_norm": 0.29642051458358765, "learning_rate": 5.5899880404289465e-05, "loss": 0.7634, "step": 4312 }, { "epoch": 0.6590518394010009, "grad_norm": 0.2864471673965454, "learning_rate": 5.585508768119085e-05, "loss": 0.7543, "step": 4313 }, { "epoch": 0.6592046452993086, "grad_norm": 0.3022642135620117, "learning_rate": 5.581030595642653e-05, "loss": 0.8052, "step": 4314 }, { "epoch": 0.6593574511976162, "grad_norm": 0.31377336382865906, "learning_rate": 5.5765535241153596e-05, "loss": 0.8731, "step": 4315 }, { "epoch": 0.6595102570959239, "grad_norm": 0.2930757701396942, "learning_rate": 5.5720775546526205e-05, "loss": 0.7746, "step": 4316 }, { "epoch": 0.6596630629942316, "grad_norm": 0.3031260371208191, "learning_rate": 5.567602688369593e-05, "loss": 0.7174, "step": 4317 }, { "epoch": 0.6598158688925393, "grad_norm": 0.3256378471851349, "learning_rate": 5.5631289263811495e-05, "loss": 0.7988, "step": 4318 }, { "epoch": 0.659968674790847, "grad_norm": 0.2856435477733612, "learning_rate": 5.558656269801884e-05, "loss": 0.7675, "step": 4319 }, { "epoch": 0.6601214806891545, "grad_norm": 0.2758930027484894, "learning_rate": 5.5541847197461296e-05, "loss": 0.7446, "step": 4320 }, { "epoch": 0.6602742865874622, "grad_norm": 0.2451760172843933, "learning_rate": 5.549714277327931e-05, "loss": 0.6915, "step": 4321 }, { "epoch": 0.6604270924857699, "grad_norm": 0.3307189643383026, "learning_rate": 5.545244943661072e-05, "loss": 0.7638, "step": 4322 }, { "epoch": 0.6605798983840776, "grad_norm": 0.35473012924194336, "learning_rate": 5.5407767198590335e-05, "loss": 0.6032, "step": 4323 }, { "epoch": 0.6607327042823853, "grad_norm": 0.2760302722454071, "learning_rate": 5.536309607035043e-05, "loss": 0.6474, "step": 4324 }, { "epoch": 0.660885510180693, "grad_norm": 0.45210763812065125, "learning_rate": 5.5318436063020485e-05, "loss": 0.823, "step": 4325 }, { "epoch": 0.6610383160790007, "grad_norm": 0.30650877952575684, "learning_rate": 5.527378718772713e-05, "loss": 0.7758, "step": 4326 }, { "epoch": 0.6611911219773083, "grad_norm": 0.2780720591545105, "learning_rate": 5.522914945559421e-05, "loss": 0.6157, "step": 4327 }, { "epoch": 0.661343927875616, "grad_norm": 0.2897791862487793, "learning_rate": 5.518452287774289e-05, "loss": 0.772, "step": 4328 }, { "epoch": 0.6614967337739237, "grad_norm": 0.3310716152191162, "learning_rate": 5.513990746529154e-05, "loss": 0.8176, "step": 4329 }, { "epoch": 0.6616495396722314, "grad_norm": 0.29766175150871277, "learning_rate": 5.509530322935565e-05, "loss": 0.7393, "step": 4330 }, { "epoch": 0.6618023455705391, "grad_norm": 0.2601233124732971, "learning_rate": 5.505071018104804e-05, "loss": 0.6172, "step": 4331 }, { "epoch": 0.6619551514688466, "grad_norm": 0.3184444010257721, "learning_rate": 5.500612833147869e-05, "loss": 0.7818, "step": 4332 }, { "epoch": 0.6621079573671543, "grad_norm": 0.31976786255836487, "learning_rate": 5.4961557691754727e-05, "loss": 0.8395, "step": 4333 }, { "epoch": 0.662260763265462, "grad_norm": 0.26618340611457825, "learning_rate": 5.49169982729806e-05, "loss": 0.7842, "step": 4334 }, { "epoch": 0.6624135691637697, "grad_norm": 0.29280707240104675, "learning_rate": 5.487245008625796e-05, "loss": 0.6204, "step": 4335 }, { "epoch": 0.6625663750620774, "grad_norm": 0.29666972160339355, "learning_rate": 5.4827913142685586e-05, "loss": 0.6752, "step": 4336 }, { "epoch": 0.6627191809603851, "grad_norm": 0.2703108787536621, "learning_rate": 5.47833874533594e-05, "loss": 0.6249, "step": 4337 }, { "epoch": 0.6628719868586928, "grad_norm": 0.2508685886859894, "learning_rate": 5.473887302937268e-05, "loss": 0.7276, "step": 4338 }, { "epoch": 0.6630247927570004, "grad_norm": 0.28797996044158936, "learning_rate": 5.469436988181585e-05, "loss": 0.6227, "step": 4339 }, { "epoch": 0.6631775986553081, "grad_norm": 0.27974840998649597, "learning_rate": 5.464987802177646e-05, "loss": 0.5932, "step": 4340 }, { "epoch": 0.6633304045536158, "grad_norm": 0.4193362295627594, "learning_rate": 5.460539746033925e-05, "loss": 0.6765, "step": 4341 }, { "epoch": 0.6634832104519235, "grad_norm": 0.32927194237709045, "learning_rate": 5.4560928208586205e-05, "loss": 0.6033, "step": 4342 }, { "epoch": 0.6636360163502312, "grad_norm": 0.3113420009613037, "learning_rate": 5.45164702775965e-05, "loss": 0.7376, "step": 4343 }, { "epoch": 0.6637888222485387, "grad_norm": 0.2748812735080719, "learning_rate": 5.447202367844644e-05, "loss": 0.7156, "step": 4344 }, { "epoch": 0.6639416281468464, "grad_norm": 0.2942165732383728, "learning_rate": 5.4427588422209455e-05, "loss": 0.6998, "step": 4345 }, { "epoch": 0.6640944340451541, "grad_norm": 0.32953986525535583, "learning_rate": 5.438316451995626e-05, "loss": 0.5686, "step": 4346 }, { "epoch": 0.6642472399434618, "grad_norm": 0.2765321731567383, "learning_rate": 5.4338751982754766e-05, "loss": 0.6983, "step": 4347 }, { "epoch": 0.6644000458417695, "grad_norm": 0.31935787200927734, "learning_rate": 5.429435082166992e-05, "loss": 0.6046, "step": 4348 }, { "epoch": 0.6645528517400772, "grad_norm": 0.2649092972278595, "learning_rate": 5.424996104776385e-05, "loss": 0.8986, "step": 4349 }, { "epoch": 0.6647056576383849, "grad_norm": 0.8897063136100769, "learning_rate": 5.4205582672096e-05, "loss": 0.5864, "step": 4350 }, { "epoch": 0.6648584635366925, "grad_norm": 0.2820856273174286, "learning_rate": 5.416121570572278e-05, "loss": 0.6672, "step": 4351 }, { "epoch": 0.6650112694350002, "grad_norm": 0.3262161612510681, "learning_rate": 5.4116860159697926e-05, "loss": 0.6761, "step": 4352 }, { "epoch": 0.6651640753333079, "grad_norm": 0.2937242090702057, "learning_rate": 5.407251604507215e-05, "loss": 0.6514, "step": 4353 }, { "epoch": 0.6653168812316156, "grad_norm": 0.30296847224235535, "learning_rate": 5.402818337289353e-05, "loss": 0.7588, "step": 4354 }, { "epoch": 0.6654696871299232, "grad_norm": 0.29046115279197693, "learning_rate": 5.398386215420708e-05, "loss": 0.7606, "step": 4355 }, { "epoch": 0.6656224930282308, "grad_norm": 0.3020663261413574, "learning_rate": 5.393955240005511e-05, "loss": 0.7264, "step": 4356 }, { "epoch": 0.6657752989265385, "grad_norm": 0.26482436060905457, "learning_rate": 5.389525412147709e-05, "loss": 0.7413, "step": 4357 }, { "epoch": 0.6659281048248462, "grad_norm": 0.3416441082954407, "learning_rate": 5.3850967329509416e-05, "loss": 0.7522, "step": 4358 }, { "epoch": 0.6660809107231539, "grad_norm": 0.8151885271072388, "learning_rate": 5.380669203518585e-05, "loss": 0.6949, "step": 4359 }, { "epoch": 0.6662337166214616, "grad_norm": 0.4770559072494507, "learning_rate": 5.376242824953719e-05, "loss": 0.8184, "step": 4360 }, { "epoch": 0.6663865225197693, "grad_norm": 0.7262836694717407, "learning_rate": 5.371817598359146e-05, "loss": 0.7664, "step": 4361 }, { "epoch": 0.666539328418077, "grad_norm": 0.3330950140953064, "learning_rate": 5.3673935248373666e-05, "loss": 0.8146, "step": 4362 }, { "epoch": 0.6666921343163846, "grad_norm": 0.24412184953689575, "learning_rate": 5.3629706054906006e-05, "loss": 0.821, "step": 4363 }, { "epoch": 0.6668449402146923, "grad_norm": 0.2575673460960388, "learning_rate": 5.358548841420787e-05, "loss": 0.6553, "step": 4364 }, { "epoch": 0.6668449402146923, "eval_loss": 0.7003983855247498, "eval_runtime": 1444.4156, "eval_samples_per_second": 7.721, "eval_steps_per_second": 3.86, "step": 4364 }, { "epoch": 0.666997746113, "grad_norm": 0.30834752321243286, "learning_rate": 5.354128233729564e-05, "loss": 0.7385, "step": 4365 }, { "epoch": 0.6671505520113077, "grad_norm": 0.28052300214767456, "learning_rate": 5.349708783518297e-05, "loss": 0.9207, "step": 4366 }, { "epoch": 0.6673033579096153, "grad_norm": 0.39452987909317017, "learning_rate": 5.345290491888047e-05, "loss": 0.7037, "step": 4367 }, { "epoch": 0.6674561638079229, "grad_norm": 0.28399553894996643, "learning_rate": 5.3408733599396034e-05, "loss": 0.833, "step": 4368 }, { "epoch": 0.6676089697062306, "grad_norm": 0.3832983672618866, "learning_rate": 5.336457388773447e-05, "loss": 0.7027, "step": 4369 }, { "epoch": 0.6677617756045383, "grad_norm": 0.3385736346244812, "learning_rate": 5.33204257948979e-05, "loss": 0.6905, "step": 4370 }, { "epoch": 0.667914581502846, "grad_norm": 0.39011090993881226, "learning_rate": 5.32762893318854e-05, "loss": 0.7341, "step": 4371 }, { "epoch": 0.6680673874011537, "grad_norm": 0.35168904066085815, "learning_rate": 5.323216450969316e-05, "loss": 0.6786, "step": 4372 }, { "epoch": 0.6682201932994614, "grad_norm": 0.2878551483154297, "learning_rate": 5.318805133931456e-05, "loss": 0.461, "step": 4373 }, { "epoch": 0.668372999197769, "grad_norm": 0.295841783285141, "learning_rate": 5.314394983174005e-05, "loss": 0.5517, "step": 4374 }, { "epoch": 0.6685258050960767, "grad_norm": 0.2754735052585602, "learning_rate": 5.3099859997957126e-05, "loss": 0.6457, "step": 4375 }, { "epoch": 0.6686786109943844, "grad_norm": 0.27244895696640015, "learning_rate": 5.305578184895035e-05, "loss": 0.6681, "step": 4376 }, { "epoch": 0.6688314168926921, "grad_norm": 0.28593409061431885, "learning_rate": 5.301171539570146e-05, "loss": 0.6394, "step": 4377 }, { "epoch": 0.6689842227909998, "grad_norm": 0.26024940609931946, "learning_rate": 5.296766064918929e-05, "loss": 0.6228, "step": 4378 }, { "epoch": 0.6691370286893074, "grad_norm": 0.26800084114074707, "learning_rate": 5.292361762038967e-05, "loss": 0.4603, "step": 4379 }, { "epoch": 0.669289834587615, "grad_norm": 0.2919711768627167, "learning_rate": 5.28795863202755e-05, "loss": 0.7626, "step": 4380 }, { "epoch": 0.6694426404859227, "grad_norm": 0.2848813831806183, "learning_rate": 5.2835566759816865e-05, "loss": 0.6784, "step": 4381 }, { "epoch": 0.6695954463842304, "grad_norm": 0.2870771586894989, "learning_rate": 5.2791558949980915e-05, "loss": 0.7467, "step": 4382 }, { "epoch": 0.6697482522825381, "grad_norm": 0.33729174733161926, "learning_rate": 5.274756290173175e-05, "loss": 0.7281, "step": 4383 }, { "epoch": 0.6699010581808458, "grad_norm": 0.3717024326324463, "learning_rate": 5.2703578626030614e-05, "loss": 0.7451, "step": 4384 }, { "epoch": 0.6700538640791535, "grad_norm": 0.28417110443115234, "learning_rate": 5.265960613383585e-05, "loss": 0.5677, "step": 4385 }, { "epoch": 0.6702066699774611, "grad_norm": 0.44522276520729065, "learning_rate": 5.261564543610287e-05, "loss": 0.8297, "step": 4386 }, { "epoch": 0.6703594758757688, "grad_norm": 0.29529058933258057, "learning_rate": 5.257169654378405e-05, "loss": 0.4472, "step": 4387 }, { "epoch": 0.6705122817740765, "grad_norm": 0.27961522340774536, "learning_rate": 5.25277594678289e-05, "loss": 0.7851, "step": 4388 }, { "epoch": 0.6706650876723842, "grad_norm": 0.31498804688453674, "learning_rate": 5.248383421918401e-05, "loss": 0.6458, "step": 4389 }, { "epoch": 0.6708178935706919, "grad_norm": 0.3034273386001587, "learning_rate": 5.243992080879292e-05, "loss": 0.6193, "step": 4390 }, { "epoch": 0.6709706994689995, "grad_norm": 0.3019751310348511, "learning_rate": 5.239601924759634e-05, "loss": 0.7018, "step": 4391 }, { "epoch": 0.6711235053673071, "grad_norm": 0.3216398060321808, "learning_rate": 5.2352129546532e-05, "loss": 0.9789, "step": 4392 }, { "epoch": 0.6712763112656148, "grad_norm": 0.32603368163108826, "learning_rate": 5.2308251716534614e-05, "loss": 0.789, "step": 4393 }, { "epoch": 0.6714291171639225, "grad_norm": 0.28429511189460754, "learning_rate": 5.226438576853594e-05, "loss": 0.789, "step": 4394 }, { "epoch": 0.6715819230622302, "grad_norm": 0.2872902452945709, "learning_rate": 5.222053171346486e-05, "loss": 0.676, "step": 4395 }, { "epoch": 0.6717347289605379, "grad_norm": 0.31137508153915405, "learning_rate": 5.217668956224725e-05, "loss": 0.6885, "step": 4396 }, { "epoch": 0.6718875348588456, "grad_norm": 0.344843327999115, "learning_rate": 5.2132859325806003e-05, "loss": 0.6596, "step": 4397 }, { "epoch": 0.6720403407571532, "grad_norm": 0.26653701066970825, "learning_rate": 5.2089041015061e-05, "loss": 0.8741, "step": 4398 }, { "epoch": 0.6721931466554609, "grad_norm": 0.3506641387939453, "learning_rate": 5.2045234640929266e-05, "loss": 0.7127, "step": 4399 }, { "epoch": 0.6723459525537686, "grad_norm": 0.2936185300350189, "learning_rate": 5.2001440214324804e-05, "loss": 0.6775, "step": 4400 }, { "epoch": 0.6724987584520763, "grad_norm": 0.3009645938873291, "learning_rate": 5.1957657746158616e-05, "loss": 0.6648, "step": 4401 }, { "epoch": 0.672651564350384, "grad_norm": 0.26885971426963806, "learning_rate": 5.1913887247338664e-05, "loss": 0.8003, "step": 4402 }, { "epoch": 0.6728043702486916, "grad_norm": 0.304047554731369, "learning_rate": 5.1870128728770105e-05, "loss": 0.5644, "step": 4403 }, { "epoch": 0.6729571761469992, "grad_norm": 0.37616226077079773, "learning_rate": 5.182638220135492e-05, "loss": 0.7497, "step": 4404 }, { "epoch": 0.6731099820453069, "grad_norm": 0.27434805035591125, "learning_rate": 5.178264767599227e-05, "loss": 0.6505, "step": 4405 }, { "epoch": 0.6732627879436146, "grad_norm": 0.32058754563331604, "learning_rate": 5.1738925163578165e-05, "loss": 0.7122, "step": 4406 }, { "epoch": 0.6734155938419223, "grad_norm": 0.3557422459125519, "learning_rate": 5.169521467500578e-05, "loss": 0.4908, "step": 4407 }, { "epoch": 0.67356839974023, "grad_norm": 0.8077619075775146, "learning_rate": 5.165151622116513e-05, "loss": 0.6526, "step": 4408 }, { "epoch": 0.6737212056385377, "grad_norm": 0.26014748215675354, "learning_rate": 5.160782981294341e-05, "loss": 0.5546, "step": 4409 }, { "epoch": 0.6738740115368453, "grad_norm": 0.4278123676776886, "learning_rate": 5.156415546122467e-05, "loss": 0.7425, "step": 4410 }, { "epoch": 0.674026817435153, "grad_norm": 0.41125205159187317, "learning_rate": 5.1520493176889987e-05, "loss": 0.8237, "step": 4411 }, { "epoch": 0.6741796233334607, "grad_norm": 0.24250277876853943, "learning_rate": 5.147684297081747e-05, "loss": 0.652, "step": 4412 }, { "epoch": 0.6743324292317684, "grad_norm": 0.29083165526390076, "learning_rate": 5.143320485388226e-05, "loss": 0.6756, "step": 4413 }, { "epoch": 0.674485235130076, "grad_norm": 0.34704911708831787, "learning_rate": 5.1389578836956365e-05, "loss": 0.8026, "step": 4414 }, { "epoch": 0.6746380410283837, "grad_norm": 0.28725364804267883, "learning_rate": 5.134596493090882e-05, "loss": 0.8781, "step": 4415 }, { "epoch": 0.6747908469266913, "grad_norm": 0.34252092242240906, "learning_rate": 5.13023631466057e-05, "loss": 0.6341, "step": 4416 }, { "epoch": 0.674943652824999, "grad_norm": 0.2834259271621704, "learning_rate": 5.1258773494910025e-05, "loss": 0.7033, "step": 4417 }, { "epoch": 0.6750964587233067, "grad_norm": 0.2758314609527588, "learning_rate": 5.121519598668188e-05, "loss": 0.729, "step": 4418 }, { "epoch": 0.6752492646216144, "grad_norm": 0.2702345848083496, "learning_rate": 5.1171630632778035e-05, "loss": 0.6454, "step": 4419 }, { "epoch": 0.6754020705199221, "grad_norm": 0.38108593225479126, "learning_rate": 5.112807744405257e-05, "loss": 0.6539, "step": 4420 }, { "epoch": 0.6755548764182298, "grad_norm": 0.3102193772792816, "learning_rate": 5.108453643135638e-05, "loss": 0.6399, "step": 4421 }, { "epoch": 0.6757076823165374, "grad_norm": 0.2749772369861603, "learning_rate": 5.104100760553731e-05, "loss": 0.7171, "step": 4422 }, { "epoch": 0.6758604882148451, "grad_norm": 0.25176766514778137, "learning_rate": 5.099749097744024e-05, "loss": 0.6431, "step": 4423 }, { "epoch": 0.6760132941131528, "grad_norm": 0.3452298045158386, "learning_rate": 5.095398655790694e-05, "loss": 0.8327, "step": 4424 }, { "epoch": 0.6761661000114605, "grad_norm": 0.2980501651763916, "learning_rate": 5.091049435777622e-05, "loss": 0.8754, "step": 4425 }, { "epoch": 0.6763189059097681, "grad_norm": 0.2884484529495239, "learning_rate": 5.0867014387883706e-05, "loss": 0.8527, "step": 4426 }, { "epoch": 0.6764717118080757, "grad_norm": 0.3645104467868805, "learning_rate": 5.082354665906217e-05, "loss": 0.6624, "step": 4427 }, { "epoch": 0.6766245177063834, "grad_norm": 0.5415614247322083, "learning_rate": 5.078009118214119e-05, "loss": 0.5252, "step": 4428 }, { "epoch": 0.6767773236046911, "grad_norm": 0.3443562984466553, "learning_rate": 5.073664796794728e-05, "loss": 0.7448, "step": 4429 }, { "epoch": 0.6769301295029988, "grad_norm": 0.3706625699996948, "learning_rate": 5.069321702730401e-05, "loss": 0.9034, "step": 4430 }, { "epoch": 0.6770829354013065, "grad_norm": 0.2914830446243286, "learning_rate": 5.064979837103185e-05, "loss": 0.712, "step": 4431 }, { "epoch": 0.6772357412996142, "grad_norm": 0.3097488284111023, "learning_rate": 5.060639200994819e-05, "loss": 0.6135, "step": 4432 }, { "epoch": 0.6773885471979219, "grad_norm": 0.3045434057712555, "learning_rate": 5.056299795486728e-05, "loss": 0.6765, "step": 4433 }, { "epoch": 0.6775413530962295, "grad_norm": 0.3066057562828064, "learning_rate": 5.0519616216600453e-05, "loss": 0.7716, "step": 4434 }, { "epoch": 0.6776941589945372, "grad_norm": 0.2661169767379761, "learning_rate": 5.047624680595593e-05, "loss": 0.6888, "step": 4435 }, { "epoch": 0.6778469648928449, "grad_norm": 0.35913899540901184, "learning_rate": 5.043288973373881e-05, "loss": 0.9291, "step": 4436 }, { "epoch": 0.6779997707911526, "grad_norm": 0.26369667053222656, "learning_rate": 5.038954501075108e-05, "loss": 0.6952, "step": 4437 }, { "epoch": 0.6781525766894602, "grad_norm": 0.30785226821899414, "learning_rate": 5.034621264779178e-05, "loss": 0.7973, "step": 4438 }, { "epoch": 0.6783053825877678, "grad_norm": 0.2791886627674103, "learning_rate": 5.030289265565682e-05, "loss": 0.5642, "step": 4439 }, { "epoch": 0.6784581884860755, "grad_norm": 0.2842103838920593, "learning_rate": 5.025958504513899e-05, "loss": 0.7388, "step": 4440 }, { "epoch": 0.6786109943843832, "grad_norm": 0.3138188421726227, "learning_rate": 5.0216289827027986e-05, "loss": 0.773, "step": 4441 }, { "epoch": 0.6787638002826909, "grad_norm": 0.4475540220737457, "learning_rate": 5.017300701211049e-05, "loss": 0.89, "step": 4442 }, { "epoch": 0.6789166061809986, "grad_norm": 0.3245783746242523, "learning_rate": 5.012973661117002e-05, "loss": 0.8021, "step": 4443 }, { "epoch": 0.6790694120793063, "grad_norm": 0.29661089181900024, "learning_rate": 5.008647863498709e-05, "loss": 0.8557, "step": 4444 }, { "epoch": 0.679222217977614, "grad_norm": 0.38732847571372986, "learning_rate": 5.0043233094338985e-05, "loss": 0.793, "step": 4445 }, { "epoch": 0.6793750238759216, "grad_norm": 0.3176628053188324, "learning_rate": 5.000000000000002e-05, "loss": 0.6918, "step": 4446 }, { "epoch": 0.6795278297742293, "grad_norm": 0.2673543691635132, "learning_rate": 4.995677936274132e-05, "loss": 0.7953, "step": 4447 }, { "epoch": 0.679680635672537, "grad_norm": 0.2867792546749115, "learning_rate": 4.9913571193331e-05, "loss": 0.6188, "step": 4448 }, { "epoch": 0.6798334415708447, "grad_norm": 0.27831536531448364, "learning_rate": 4.987037550253398e-05, "loss": 0.6003, "step": 4449 }, { "epoch": 0.6799862474691523, "grad_norm": 0.2510976493358612, "learning_rate": 4.982719230111208e-05, "loss": 0.7919, "step": 4450 }, { "epoch": 0.68013905336746, "grad_norm": 0.29773804545402527, "learning_rate": 4.978402159982404e-05, "loss": 0.6, "step": 4451 }, { "epoch": 0.6802918592657676, "grad_norm": 0.26814860105514526, "learning_rate": 4.97408634094255e-05, "loss": 0.7553, "step": 4452 }, { "epoch": 0.6804446651640753, "grad_norm": 0.30513063073158264, "learning_rate": 4.9697717740669025e-05, "loss": 0.7529, "step": 4453 }, { "epoch": 0.680597471062383, "grad_norm": 0.27793049812316895, "learning_rate": 4.9654584604303845e-05, "loss": 0.6122, "step": 4454 }, { "epoch": 0.6807502769606907, "grad_norm": 0.26808398962020874, "learning_rate": 4.961146401107632e-05, "loss": 0.5882, "step": 4455 }, { "epoch": 0.6809030828589984, "grad_norm": 0.4149441123008728, "learning_rate": 4.956835597172954e-05, "loss": 0.8469, "step": 4456 }, { "epoch": 0.681055888757306, "grad_norm": 0.31907710433006287, "learning_rate": 4.952526049700358e-05, "loss": 0.6695, "step": 4457 }, { "epoch": 0.6812086946556137, "grad_norm": 0.2895703613758087, "learning_rate": 4.948217759763527e-05, "loss": 0.618, "step": 4458 }, { "epoch": 0.6813615005539214, "grad_norm": 0.3641390800476074, "learning_rate": 4.943910728435831e-05, "loss": 0.7025, "step": 4459 }, { "epoch": 0.6815143064522291, "grad_norm": 0.26010552048683167, "learning_rate": 4.939604956790339e-05, "loss": 0.5716, "step": 4460 }, { "epoch": 0.6816671123505368, "grad_norm": 0.28951773047447205, "learning_rate": 4.935300445899791e-05, "loss": 0.4312, "step": 4461 }, { "epoch": 0.6818199182488444, "grad_norm": 0.29047438502311707, "learning_rate": 4.930997196836625e-05, "loss": 0.7299, "step": 4462 }, { "epoch": 0.681972724147152, "grad_norm": 0.2965889871120453, "learning_rate": 4.926695210672955e-05, "loss": 0.6235, "step": 4463 }, { "epoch": 0.6821255300454597, "grad_norm": 0.3306009769439697, "learning_rate": 4.922394488480588e-05, "loss": 0.6667, "step": 4464 }, { "epoch": 0.6822783359437674, "grad_norm": 0.4301811754703522, "learning_rate": 4.918095031331011e-05, "loss": 0.588, "step": 4465 }, { "epoch": 0.6824311418420751, "grad_norm": 0.3095620572566986, "learning_rate": 4.913796840295399e-05, "loss": 0.7026, "step": 4466 }, { "epoch": 0.6825839477403828, "grad_norm": 0.27729034423828125, "learning_rate": 4.909499916444611e-05, "loss": 0.5636, "step": 4467 }, { "epoch": 0.6827367536386905, "grad_norm": 0.5523043870925903, "learning_rate": 4.905204260849183e-05, "loss": 0.8391, "step": 4468 }, { "epoch": 0.6828895595369981, "grad_norm": 0.3127119541168213, "learning_rate": 4.900909874579347e-05, "loss": 0.6599, "step": 4469 }, { "epoch": 0.6830423654353058, "grad_norm": 0.2766704261302948, "learning_rate": 4.896616758705017e-05, "loss": 0.6034, "step": 4470 }, { "epoch": 0.6831951713336135, "grad_norm": 0.31232303380966187, "learning_rate": 4.8923249142957816e-05, "loss": 0.8211, "step": 4471 }, { "epoch": 0.6833479772319212, "grad_norm": 0.2655163109302521, "learning_rate": 4.888034342420916e-05, "loss": 0.5255, "step": 4472 }, { "epoch": 0.6835007831302288, "grad_norm": 0.3622485101222992, "learning_rate": 4.8837450441493824e-05, "loss": 0.7362, "step": 4473 }, { "epoch": 0.6836535890285365, "grad_norm": 0.2688015401363373, "learning_rate": 4.879457020549828e-05, "loss": 0.7041, "step": 4474 }, { "epoch": 0.6838063949268441, "grad_norm": 0.28353452682495117, "learning_rate": 4.8751702726905733e-05, "loss": 0.7001, "step": 4475 }, { "epoch": 0.6839592008251518, "grad_norm": 0.36507824063301086, "learning_rate": 4.870884801639622e-05, "loss": 0.8537, "step": 4476 }, { "epoch": 0.6841120067234595, "grad_norm": 0.3329671621322632, "learning_rate": 4.866600608464669e-05, "loss": 0.7158, "step": 4477 }, { "epoch": 0.6842648126217672, "grad_norm": 0.2636788785457611, "learning_rate": 4.862317694233085e-05, "loss": 0.7174, "step": 4478 }, { "epoch": 0.6844176185200749, "grad_norm": 0.3977915346622467, "learning_rate": 4.858036060011922e-05, "loss": 0.8131, "step": 4479 }, { "epoch": 0.6845704244183826, "grad_norm": 0.3472137749195099, "learning_rate": 4.8537557068679075e-05, "loss": 0.7707, "step": 4480 }, { "epoch": 0.6847232303166902, "grad_norm": 0.27570462226867676, "learning_rate": 4.849476635867464e-05, "loss": 0.5611, "step": 4481 }, { "epoch": 0.6848760362149979, "grad_norm": 0.2932675778865814, "learning_rate": 4.845198848076678e-05, "loss": 0.7531, "step": 4482 }, { "epoch": 0.6850288421133056, "grad_norm": 0.28090453147888184, "learning_rate": 4.840922344561328e-05, "loss": 0.7064, "step": 4483 }, { "epoch": 0.6851816480116133, "grad_norm": 0.4638606607913971, "learning_rate": 4.8366471263868726e-05, "loss": 0.6788, "step": 4484 }, { "epoch": 0.6853344539099209, "grad_norm": 0.3164824843406677, "learning_rate": 4.8323731946184446e-05, "loss": 0.8831, "step": 4485 }, { "epoch": 0.6854872598082286, "grad_norm": 0.5579379200935364, "learning_rate": 4.828100550320852e-05, "loss": 0.5889, "step": 4486 }, { "epoch": 0.6856400657065362, "grad_norm": 0.2748773694038391, "learning_rate": 4.823829194558593e-05, "loss": 0.5735, "step": 4487 }, { "epoch": 0.6857928716048439, "grad_norm": 0.3306643068790436, "learning_rate": 4.8195591283958483e-05, "loss": 0.7205, "step": 4488 }, { "epoch": 0.6859456775031516, "grad_norm": 0.4027121365070343, "learning_rate": 4.815290352896453e-05, "loss": 0.8095, "step": 4489 }, { "epoch": 0.6860984834014593, "grad_norm": 0.3824899196624756, "learning_rate": 4.8110228691239453e-05, "loss": 0.6471, "step": 4490 }, { "epoch": 0.686251289299767, "grad_norm": 0.2688082456588745, "learning_rate": 4.806756678141532e-05, "loss": 0.5867, "step": 4491 }, { "epoch": 0.6864040951980747, "grad_norm": 0.3712558448314667, "learning_rate": 4.8024917810121015e-05, "loss": 0.7572, "step": 4492 }, { "epoch": 0.6865569010963823, "grad_norm": 0.26351073384284973, "learning_rate": 4.7982281787982165e-05, "loss": 0.6777, "step": 4493 }, { "epoch": 0.68670970699469, "grad_norm": 0.5376992225646973, "learning_rate": 4.7939658725621104e-05, "loss": 0.7894, "step": 4494 }, { "epoch": 0.6868625128929977, "grad_norm": 0.34455183148384094, "learning_rate": 4.789704863365707e-05, "loss": 0.4973, "step": 4495 }, { "epoch": 0.6870153187913054, "grad_norm": 0.28120651841163635, "learning_rate": 4.7854451522706044e-05, "loss": 0.6821, "step": 4496 }, { "epoch": 0.687168124689613, "grad_norm": 0.36648836731910706, "learning_rate": 4.7811867403380696e-05, "loss": 0.6997, "step": 4497 }, { "epoch": 0.6873209305879207, "grad_norm": 0.3456977605819702, "learning_rate": 4.776929628629047e-05, "loss": 0.8019, "step": 4498 }, { "epoch": 0.6874737364862283, "grad_norm": 0.32706472277641296, "learning_rate": 4.7726738182041674e-05, "loss": 0.761, "step": 4499 }, { "epoch": 0.687626542384536, "grad_norm": 0.2870117723941803, "learning_rate": 4.768419310123723e-05, "loss": 0.6598, "step": 4500 }, { "epoch": 0.6877793482828437, "grad_norm": 0.4577259421348572, "learning_rate": 4.7641661054476946e-05, "loss": 0.7088, "step": 4501 }, { "epoch": 0.6879321541811514, "grad_norm": 0.260759562253952, "learning_rate": 4.759914205235728e-05, "loss": 0.7125, "step": 4502 }, { "epoch": 0.6880849600794591, "grad_norm": 0.3623206317424774, "learning_rate": 4.755663610547154e-05, "loss": 0.6856, "step": 4503 }, { "epoch": 0.6882377659777668, "grad_norm": 0.3873855471611023, "learning_rate": 4.751414322440966e-05, "loss": 0.545, "step": 4504 }, { "epoch": 0.6883905718760744, "grad_norm": 0.4338493049144745, "learning_rate": 4.747166341975844e-05, "loss": 0.8404, "step": 4505 }, { "epoch": 0.6885433777743821, "grad_norm": 0.23762832581996918, "learning_rate": 4.742919670210135e-05, "loss": 0.7745, "step": 4506 }, { "epoch": 0.6886961836726898, "grad_norm": 0.3028179109096527, "learning_rate": 4.738674308201858e-05, "loss": 0.6844, "step": 4507 }, { "epoch": 0.6888489895709975, "grad_norm": 0.3360441327095032, "learning_rate": 4.7344302570087115e-05, "loss": 0.773, "step": 4508 }, { "epoch": 0.6890017954693051, "grad_norm": 0.27509480714797974, "learning_rate": 4.730187517688069e-05, "loss": 0.6513, "step": 4509 }, { "epoch": 0.6891546013676128, "grad_norm": 0.4068647027015686, "learning_rate": 4.725946091296972e-05, "loss": 0.6546, "step": 4510 }, { "epoch": 0.6893074072659204, "grad_norm": 0.31606605648994446, "learning_rate": 4.72170597889213e-05, "loss": 0.5298, "step": 4511 }, { "epoch": 0.6894602131642281, "grad_norm": 0.2957019805908203, "learning_rate": 4.717467181529937e-05, "loss": 0.7674, "step": 4512 }, { "epoch": 0.6896130190625358, "grad_norm": 0.30565783381462097, "learning_rate": 4.713229700266455e-05, "loss": 0.5802, "step": 4513 }, { "epoch": 0.6897658249608435, "grad_norm": 0.30693966150283813, "learning_rate": 4.7089935361574154e-05, "loss": 0.5424, "step": 4514 }, { "epoch": 0.6899186308591512, "grad_norm": 0.2552562654018402, "learning_rate": 4.704758690258218e-05, "loss": 0.719, "step": 4515 }, { "epoch": 0.6900714367574589, "grad_norm": 0.2818084955215454, "learning_rate": 4.700525163623944e-05, "loss": 0.7768, "step": 4516 }, { "epoch": 0.6902242426557665, "grad_norm": 0.2802093029022217, "learning_rate": 4.696292957309345e-05, "loss": 0.6998, "step": 4517 }, { "epoch": 0.6903770485540742, "grad_norm": 0.4027109444141388, "learning_rate": 4.69206207236883e-05, "loss": 0.8152, "step": 4518 }, { "epoch": 0.6905298544523819, "grad_norm": 0.38950568437576294, "learning_rate": 4.687832509856498e-05, "loss": 0.6509, "step": 4519 }, { "epoch": 0.6906826603506895, "grad_norm": 0.2784578204154968, "learning_rate": 4.6836042708261044e-05, "loss": 0.7362, "step": 4520 }, { "epoch": 0.6908354662489972, "grad_norm": 0.3080911338329315, "learning_rate": 4.679377356331076e-05, "loss": 0.6629, "step": 4521 }, { "epoch": 0.6909882721473048, "grad_norm": 0.3341425359249115, "learning_rate": 4.675151767424516e-05, "loss": 0.6944, "step": 4522 }, { "epoch": 0.6911410780456125, "grad_norm": 0.3041728734970093, "learning_rate": 4.670927505159199e-05, "loss": 0.7363, "step": 4523 }, { "epoch": 0.6912938839439202, "grad_norm": 0.33875536918640137, "learning_rate": 4.666704570587561e-05, "loss": 0.6821, "step": 4524 }, { "epoch": 0.6914466898422279, "grad_norm": 0.34854626655578613, "learning_rate": 4.662482964761707e-05, "loss": 0.6976, "step": 4525 }, { "epoch": 0.6915994957405356, "grad_norm": 0.3705041706562042, "learning_rate": 4.6582626887334166e-05, "loss": 0.7212, "step": 4526 }, { "epoch": 0.6917523016388433, "grad_norm": 0.26057520508766174, "learning_rate": 4.654043743554143e-05, "loss": 0.6315, "step": 4527 }, { "epoch": 0.691905107537151, "grad_norm": 0.2733753025531769, "learning_rate": 4.649826130274993e-05, "loss": 0.7938, "step": 4528 }, { "epoch": 0.6920579134354586, "grad_norm": 0.3895609676837921, "learning_rate": 4.6456098499467504e-05, "loss": 0.693, "step": 4529 }, { "epoch": 0.6922107193337663, "grad_norm": 0.2385978400707245, "learning_rate": 4.6413949036198665e-05, "loss": 0.7292, "step": 4530 }, { "epoch": 0.692363525232074, "grad_norm": 0.31826111674308777, "learning_rate": 4.6371812923444645e-05, "loss": 0.6661, "step": 4531 }, { "epoch": 0.6925163311303816, "grad_norm": 0.2785007357597351, "learning_rate": 4.632969017170328e-05, "loss": 0.7982, "step": 4532 }, { "epoch": 0.6926691370286893, "grad_norm": 0.32651248574256897, "learning_rate": 4.628758079146904e-05, "loss": 0.6782, "step": 4533 }, { "epoch": 0.692821942926997, "grad_norm": 0.3232291638851166, "learning_rate": 4.6245484793233174e-05, "loss": 0.7127, "step": 4534 }, { "epoch": 0.6929747488253046, "grad_norm": 0.3334408700466156, "learning_rate": 4.620340218748358e-05, "loss": 0.77, "step": 4535 }, { "epoch": 0.6931275547236123, "grad_norm": 0.28033608198165894, "learning_rate": 4.6161332984704745e-05, "loss": 0.6162, "step": 4536 }, { "epoch": 0.69328036062192, "grad_norm": 0.30538347363471985, "learning_rate": 4.611927719537783e-05, "loss": 0.7848, "step": 4537 }, { "epoch": 0.6934331665202277, "grad_norm": 0.3362586796283722, "learning_rate": 4.6077234829980744e-05, "loss": 0.6955, "step": 4538 }, { "epoch": 0.6935859724185354, "grad_norm": 0.2605217695236206, "learning_rate": 4.603520589898792e-05, "loss": 0.8394, "step": 4539 }, { "epoch": 0.693738778316843, "grad_norm": 0.27571675181388855, "learning_rate": 4.59931904128706e-05, "loss": 0.5412, "step": 4540 }, { "epoch": 0.6938915842151507, "grad_norm": 0.28749173879623413, "learning_rate": 4.59511883820965e-05, "loss": 0.6878, "step": 4541 }, { "epoch": 0.6940443901134584, "grad_norm": 0.49778303503990173, "learning_rate": 4.590919981713016e-05, "loss": 0.9041, "step": 4542 }, { "epoch": 0.6941971960117661, "grad_norm": 0.23859632015228271, "learning_rate": 4.586722472843259e-05, "loss": 0.7782, "step": 4543 }, { "epoch": 0.6943500019100737, "grad_norm": 0.3211337625980377, "learning_rate": 4.582526312646158e-05, "loss": 0.7629, "step": 4544 }, { "epoch": 0.6945028078083814, "grad_norm": 0.3579085171222687, "learning_rate": 4.578331502167157e-05, "loss": 0.7138, "step": 4545 }, { "epoch": 0.694655613706689, "grad_norm": 0.2770020365715027, "learning_rate": 4.5741380424513446e-05, "loss": 0.5127, "step": 4546 }, { "epoch": 0.6948084196049967, "grad_norm": 0.35301145911216736, "learning_rate": 4.5699459345434937e-05, "loss": 0.854, "step": 4547 }, { "epoch": 0.6949612255033044, "grad_norm": 0.48538297414779663, "learning_rate": 4.5657551794880316e-05, "loss": 0.7938, "step": 4548 }, { "epoch": 0.6951140314016121, "grad_norm": 0.2835939824581146, "learning_rate": 4.561565778329057e-05, "loss": 0.7409, "step": 4549 }, { "epoch": 0.6952668372999198, "grad_norm": 0.28189727663993835, "learning_rate": 4.557377732110309e-05, "loss": 0.6853, "step": 4550 }, { "epoch": 0.6954196431982275, "grad_norm": 0.2865130603313446, "learning_rate": 4.553191041875214e-05, "loss": 0.8017, "step": 4551 }, { "epoch": 0.6955724490965352, "grad_norm": 0.31105712056159973, "learning_rate": 4.549005708666852e-05, "loss": 0.7171, "step": 4552 }, { "epoch": 0.6957252549948428, "grad_norm": 0.29811492562294006, "learning_rate": 4.544821733527958e-05, "loss": 0.7542, "step": 4553 }, { "epoch": 0.6958780608931505, "grad_norm": 0.29602208733558655, "learning_rate": 4.54063911750094e-05, "loss": 0.8067, "step": 4554 }, { "epoch": 0.6960308667914582, "grad_norm": 0.27338042855262756, "learning_rate": 4.536457861627854e-05, "loss": 0.6514, "step": 4555 }, { "epoch": 0.6961836726897658, "grad_norm": 0.3065601885318756, "learning_rate": 4.5322779669504344e-05, "loss": 0.6071, "step": 4556 }, { "epoch": 0.6963364785880735, "grad_norm": 0.253862202167511, "learning_rate": 4.528099434510058e-05, "loss": 0.5923, "step": 4557 }, { "epoch": 0.6964892844863811, "grad_norm": 0.2711423337459564, "learning_rate": 4.5239222653477786e-05, "loss": 0.616, "step": 4558 }, { "epoch": 0.6966420903846888, "grad_norm": 0.31371334195137024, "learning_rate": 4.5197464605043e-05, "loss": 0.7606, "step": 4559 }, { "epoch": 0.6967948962829965, "grad_norm": 0.3264442980289459, "learning_rate": 4.515572021019984e-05, "loss": 0.8191, "step": 4560 }, { "epoch": 0.6969477021813042, "grad_norm": 0.3145497739315033, "learning_rate": 4.511398947934861e-05, "loss": 0.6609, "step": 4561 }, { "epoch": 0.6971005080796119, "grad_norm": 0.4659073054790497, "learning_rate": 4.507227242288621e-05, "loss": 0.9295, "step": 4562 }, { "epoch": 0.6972533139779196, "grad_norm": 0.36418986320495605, "learning_rate": 4.503056905120606e-05, "loss": 0.6698, "step": 4563 }, { "epoch": 0.6974061198762272, "grad_norm": 0.27688685059547424, "learning_rate": 4.4988879374698165e-05, "loss": 0.7545, "step": 4564 }, { "epoch": 0.6975589257745349, "grad_norm": 0.4654596149921417, "learning_rate": 4.49472034037492e-05, "loss": 0.7939, "step": 4565 }, { "epoch": 0.6977117316728426, "grad_norm": 0.27792832255363464, "learning_rate": 4.4905541148742426e-05, "loss": 0.5877, "step": 4566 }, { "epoch": 0.6978645375711503, "grad_norm": 0.36160188913345337, "learning_rate": 4.486389262005759e-05, "loss": 0.7599, "step": 4567 }, { "epoch": 0.6980173434694579, "grad_norm": 0.28020909428596497, "learning_rate": 4.4822257828071046e-05, "loss": 0.4777, "step": 4568 }, { "epoch": 0.6981701493677656, "grad_norm": 0.27438125014305115, "learning_rate": 4.478063678315578e-05, "loss": 0.5457, "step": 4569 }, { "epoch": 0.6983229552660732, "grad_norm": 0.3062969744205475, "learning_rate": 4.473902949568138e-05, "loss": 0.7137, "step": 4570 }, { "epoch": 0.6984757611643809, "grad_norm": 0.29249826073646545, "learning_rate": 4.469743597601391e-05, "loss": 0.6558, "step": 4571 }, { "epoch": 0.6986285670626886, "grad_norm": 0.404161661863327, "learning_rate": 4.465585623451601e-05, "loss": 0.6618, "step": 4572 }, { "epoch": 0.6987813729609963, "grad_norm": 0.3219901919364929, "learning_rate": 4.4614290281546945e-05, "loss": 0.7573, "step": 4573 }, { "epoch": 0.698934178859304, "grad_norm": 0.30580899119377136, "learning_rate": 4.457273812746257e-05, "loss": 0.8471, "step": 4574 }, { "epoch": 0.6990869847576117, "grad_norm": 0.46826639771461487, "learning_rate": 4.453119978261524e-05, "loss": 0.7451, "step": 4575 }, { "epoch": 0.6992397906559193, "grad_norm": 0.3270047605037689, "learning_rate": 4.448967525735381e-05, "loss": 0.6547, "step": 4576 }, { "epoch": 0.699392596554227, "grad_norm": 0.27389249205589294, "learning_rate": 4.444816456202388e-05, "loss": 0.8253, "step": 4577 }, { "epoch": 0.6995454024525347, "grad_norm": 0.5950169563293457, "learning_rate": 4.4406667706967375e-05, "loss": 0.7164, "step": 4578 }, { "epoch": 0.6996982083508423, "grad_norm": 0.28673499822616577, "learning_rate": 4.4365184702522956e-05, "loss": 0.8878, "step": 4579 }, { "epoch": 0.69985101424915, "grad_norm": 0.2988281846046448, "learning_rate": 4.432371555902579e-05, "loss": 0.6067, "step": 4580 }, { "epoch": 0.7000038201474577, "grad_norm": 0.2832471430301666, "learning_rate": 4.428226028680754e-05, "loss": 0.6567, "step": 4581 }, { "epoch": 0.7001566260457653, "grad_norm": 0.31266599893569946, "learning_rate": 4.424081889619639e-05, "loss": 0.6556, "step": 4582 }, { "epoch": 0.700309431944073, "grad_norm": 0.3293362855911255, "learning_rate": 4.4199391397517154e-05, "loss": 0.7416, "step": 4583 }, { "epoch": 0.7004622378423807, "grad_norm": 0.3653784394264221, "learning_rate": 4.415797780109118e-05, "loss": 0.8261, "step": 4584 }, { "epoch": 0.7006150437406884, "grad_norm": 0.5519762635231018, "learning_rate": 4.4116578117236296e-05, "loss": 0.5055, "step": 4585 }, { "epoch": 0.7007678496389961, "grad_norm": 0.26612091064453125, "learning_rate": 4.407519235626683e-05, "loss": 0.6856, "step": 4586 }, { "epoch": 0.7009206555373038, "grad_norm": 0.3700180947780609, "learning_rate": 4.403382052849374e-05, "loss": 0.7334, "step": 4587 }, { "epoch": 0.7010734614356114, "grad_norm": 0.8454820513725281, "learning_rate": 4.399246264422452e-05, "loss": 0.5672, "step": 4588 }, { "epoch": 0.7012262673339191, "grad_norm": 0.2447136491537094, "learning_rate": 4.395111871376308e-05, "loss": 0.6859, "step": 4589 }, { "epoch": 0.7013790732322268, "grad_norm": 0.48516905307769775, "learning_rate": 4.39097887474099e-05, "loss": 0.6208, "step": 4590 }, { "epoch": 0.7015318791305344, "grad_norm": 0.3230639398097992, "learning_rate": 4.3868472755462043e-05, "loss": 0.8115, "step": 4591 }, { "epoch": 0.7016846850288421, "grad_norm": 0.2862485647201538, "learning_rate": 4.3827170748212985e-05, "loss": 0.6882, "step": 4592 }, { "epoch": 0.7018374909271498, "grad_norm": 0.2857116758823395, "learning_rate": 4.3785882735952844e-05, "loss": 0.5287, "step": 4593 }, { "epoch": 0.7019902968254574, "grad_norm": 0.31349948048591614, "learning_rate": 4.3744608728968104e-05, "loss": 0.7222, "step": 4594 }, { "epoch": 0.7021431027237651, "grad_norm": 0.3246481120586395, "learning_rate": 4.3703348737541914e-05, "loss": 0.7359, "step": 4595 }, { "epoch": 0.7022959086220728, "grad_norm": 0.2840207517147064, "learning_rate": 4.3662102771953785e-05, "loss": 0.7063, "step": 4596 }, { "epoch": 0.7024487145203805, "grad_norm": 0.4355444610118866, "learning_rate": 4.362087084247988e-05, "loss": 0.7508, "step": 4597 }, { "epoch": 0.7026015204186882, "grad_norm": 0.30774471163749695, "learning_rate": 4.3579652959392736e-05, "loss": 0.9731, "step": 4598 }, { "epoch": 0.7027543263169959, "grad_norm": 0.3269573748111725, "learning_rate": 4.3538449132961415e-05, "loss": 0.5729, "step": 4599 }, { "epoch": 0.7029071322153035, "grad_norm": 0.4266010820865631, "learning_rate": 4.3497259373451536e-05, "loss": 0.4978, "step": 4600 }, { "epoch": 0.7030599381136112, "grad_norm": 0.2678755521774292, "learning_rate": 4.345608369112523e-05, "loss": 0.8741, "step": 4601 }, { "epoch": 0.7032127440119189, "grad_norm": 0.558524489402771, "learning_rate": 4.3414922096241025e-05, "loss": 0.5434, "step": 4602 }, { "epoch": 0.7033655499102265, "grad_norm": 0.43573448061943054, "learning_rate": 4.3373774599053966e-05, "loss": 0.6227, "step": 4603 }, { "epoch": 0.7035183558085342, "grad_norm": 0.2877878248691559, "learning_rate": 4.3332641209815615e-05, "loss": 0.5567, "step": 4604 }, { "epoch": 0.7036711617068419, "grad_norm": 0.28066501021385193, "learning_rate": 4.329152193877404e-05, "loss": 0.6789, "step": 4605 }, { "epoch": 0.7038239676051495, "grad_norm": 0.2897006571292877, "learning_rate": 4.325041679617381e-05, "loss": 0.6965, "step": 4606 }, { "epoch": 0.7039767735034572, "grad_norm": 0.40210720896720886, "learning_rate": 4.3209325792255796e-05, "loss": 0.7664, "step": 4607 }, { "epoch": 0.7041295794017649, "grad_norm": 0.5120120048522949, "learning_rate": 4.316824893725755e-05, "loss": 0.618, "step": 4608 }, { "epoch": 0.7042823853000726, "grad_norm": 0.30149465799331665, "learning_rate": 4.3127186241413055e-05, "loss": 0.5665, "step": 4609 }, { "epoch": 0.7044351911983803, "grad_norm": 0.3012610673904419, "learning_rate": 4.308613771495267e-05, "loss": 0.8612, "step": 4610 }, { "epoch": 0.704587997096688, "grad_norm": 0.29444053769111633, "learning_rate": 4.3045103368103355e-05, "loss": 0.7114, "step": 4611 }, { "epoch": 0.7047408029949956, "grad_norm": 0.3533160388469696, "learning_rate": 4.300408321108842e-05, "loss": 0.8258, "step": 4612 }, { "epoch": 0.7048936088933033, "grad_norm": 0.25161516666412354, "learning_rate": 4.296307725412774e-05, "loss": 0.5679, "step": 4613 }, { "epoch": 0.705046414791611, "grad_norm": 0.2681884169578552, "learning_rate": 4.292208550743755e-05, "loss": 0.8816, "step": 4614 }, { "epoch": 0.7051992206899186, "grad_norm": 0.2852892279624939, "learning_rate": 4.288110798123066e-05, "loss": 0.6218, "step": 4615 }, { "epoch": 0.7053520265882263, "grad_norm": 0.27947360277175903, "learning_rate": 4.2840144685716245e-05, "loss": 0.6808, "step": 4616 }, { "epoch": 0.705504832486534, "grad_norm": 0.25709283351898193, "learning_rate": 4.2799195631099944e-05, "loss": 0.7561, "step": 4617 }, { "epoch": 0.7056576383848416, "grad_norm": 0.33592861890792847, "learning_rate": 4.275826082758388e-05, "loss": 0.7795, "step": 4618 }, { "epoch": 0.7058104442831493, "grad_norm": 0.3069939613342285, "learning_rate": 4.271734028536667e-05, "loss": 0.5641, "step": 4619 }, { "epoch": 0.705963250181457, "grad_norm": 0.2989427447319031, "learning_rate": 4.2676434014643285e-05, "loss": 0.7034, "step": 4620 }, { "epoch": 0.7061160560797647, "grad_norm": 0.2989204227924347, "learning_rate": 4.2635542025605146e-05, "loss": 0.6892, "step": 4621 }, { "epoch": 0.7062688619780724, "grad_norm": 0.29351404309272766, "learning_rate": 4.259466432844017e-05, "loss": 0.6761, "step": 4622 }, { "epoch": 0.70642166787638, "grad_norm": 0.27882349491119385, "learning_rate": 4.255380093333274e-05, "loss": 0.6945, "step": 4623 }, { "epoch": 0.7065744737746877, "grad_norm": 0.28001776337623596, "learning_rate": 4.25129518504636e-05, "loss": 0.7171, "step": 4624 }, { "epoch": 0.7067272796729954, "grad_norm": 0.3076488673686981, "learning_rate": 4.247211709000991e-05, "loss": 0.6851, "step": 4625 }, { "epoch": 0.7068800855713031, "grad_norm": 0.43300819396972656, "learning_rate": 4.243129666214534e-05, "loss": 0.6699, "step": 4626 }, { "epoch": 0.7070328914696107, "grad_norm": 0.31589996814727783, "learning_rate": 4.239049057703999e-05, "loss": 0.5801, "step": 4627 }, { "epoch": 0.7071856973679184, "grad_norm": 0.28026047348976135, "learning_rate": 4.234969884486033e-05, "loss": 0.7313, "step": 4628 }, { "epoch": 0.707338503266226, "grad_norm": 0.31301337480545044, "learning_rate": 4.230892147576924e-05, "loss": 0.7132, "step": 4629 }, { "epoch": 0.7074913091645337, "grad_norm": 0.3458845913410187, "learning_rate": 4.226815847992611e-05, "loss": 0.6742, "step": 4630 }, { "epoch": 0.7076441150628414, "grad_norm": 0.2839435636997223, "learning_rate": 4.2227409867486665e-05, "loss": 0.7291, "step": 4631 }, { "epoch": 0.7077969209611491, "grad_norm": 0.3225105404853821, "learning_rate": 4.2186675648603125e-05, "loss": 0.7466, "step": 4632 }, { "epoch": 0.7079497268594568, "grad_norm": 0.2577323913574219, "learning_rate": 4.2145955833424e-05, "loss": 0.7672, "step": 4633 }, { "epoch": 0.7081025327577645, "grad_norm": 0.3790148198604584, "learning_rate": 4.210525043209439e-05, "loss": 0.637, "step": 4634 }, { "epoch": 0.7082553386560722, "grad_norm": 0.30979880690574646, "learning_rate": 4.20645594547556e-05, "loss": 0.8716, "step": 4635 }, { "epoch": 0.7084081445543798, "grad_norm": 0.28678464889526367, "learning_rate": 4.202388291154555e-05, "loss": 0.801, "step": 4636 }, { "epoch": 0.7085609504526875, "grad_norm": 0.2768631875514984, "learning_rate": 4.19832208125984e-05, "loss": 0.8203, "step": 4637 }, { "epoch": 0.7087137563509951, "grad_norm": 0.2772439122200012, "learning_rate": 4.1942573168044743e-05, "loss": 0.7469, "step": 4638 }, { "epoch": 0.7088665622493028, "grad_norm": 0.3146333694458008, "learning_rate": 4.1901939988011626e-05, "loss": 0.8942, "step": 4639 }, { "epoch": 0.7090193681476105, "grad_norm": 0.26574763655662537, "learning_rate": 4.186132128262248e-05, "loss": 0.6315, "step": 4640 }, { "epoch": 0.7091721740459181, "grad_norm": 0.31069281697273254, "learning_rate": 4.182071706199717e-05, "loss": 0.6383, "step": 4641 }, { "epoch": 0.7093249799442258, "grad_norm": 0.359215646982193, "learning_rate": 4.1780127336251776e-05, "loss": 0.7897, "step": 4642 }, { "epoch": 0.7094777858425335, "grad_norm": 0.3164921700954437, "learning_rate": 4.1739552115498924e-05, "loss": 0.8371, "step": 4643 }, { "epoch": 0.7096305917408412, "grad_norm": 0.4171659052371979, "learning_rate": 4.169899140984763e-05, "loss": 0.8449, "step": 4644 }, { "epoch": 0.7097833976391489, "grad_norm": 0.3271016776561737, "learning_rate": 4.165844522940325e-05, "loss": 0.7758, "step": 4645 }, { "epoch": 0.7099362035374566, "grad_norm": 0.33073553442955017, "learning_rate": 4.161791358426752e-05, "loss": 0.7159, "step": 4646 }, { "epoch": 0.7100890094357643, "grad_norm": 0.23120705783367157, "learning_rate": 4.157739648453851e-05, "loss": 0.799, "step": 4647 }, { "epoch": 0.7102418153340719, "grad_norm": 0.2860707938671112, "learning_rate": 4.15368939403108e-05, "loss": 0.7074, "step": 4648 }, { "epoch": 0.7103946212323796, "grad_norm": 0.29551446437835693, "learning_rate": 4.1496405961675155e-05, "loss": 0.5792, "step": 4649 }, { "epoch": 0.7105474271306872, "grad_norm": 0.2846836447715759, "learning_rate": 4.1455932558718915e-05, "loss": 0.7368, "step": 4650 }, { "epoch": 0.7107002330289949, "grad_norm": 0.31006723642349243, "learning_rate": 4.14154737415256e-05, "loss": 0.7059, "step": 4651 }, { "epoch": 0.7108530389273026, "grad_norm": 0.25622233748435974, "learning_rate": 4.137502952017528e-05, "loss": 0.652, "step": 4652 }, { "epoch": 0.7110058448256102, "grad_norm": 0.34584179520606995, "learning_rate": 4.1334599904744195e-05, "loss": 0.5198, "step": 4653 }, { "epoch": 0.7111586507239179, "grad_norm": 0.28006860613822937, "learning_rate": 4.1294184905305146e-05, "loss": 0.6968, "step": 4654 }, { "epoch": 0.7113114566222256, "grad_norm": 0.28837811946868896, "learning_rate": 4.125378453192712e-05, "loss": 0.7734, "step": 4655 }, { "epoch": 0.7114642625205333, "grad_norm": 0.5305294990539551, "learning_rate": 4.121339879467552e-05, "loss": 0.6769, "step": 4656 }, { "epoch": 0.711617068418841, "grad_norm": 0.3161180913448334, "learning_rate": 4.117302770361213e-05, "loss": 0.6046, "step": 4657 }, { "epoch": 0.7117698743171487, "grad_norm": 0.31682288646698, "learning_rate": 4.113267126879513e-05, "loss": 0.6814, "step": 4658 }, { "epoch": 0.7119226802154563, "grad_norm": 0.25436070561408997, "learning_rate": 4.109232950027893e-05, "loss": 0.6177, "step": 4659 }, { "epoch": 0.712075486113764, "grad_norm": 0.3298552632331848, "learning_rate": 4.105200240811431e-05, "loss": 0.7724, "step": 4660 }, { "epoch": 0.7122282920120717, "grad_norm": 0.33188796043395996, "learning_rate": 4.101169000234847e-05, "loss": 0.5912, "step": 4661 }, { "epoch": 0.7123810979103793, "grad_norm": 0.27348458766937256, "learning_rate": 4.0971392293024946e-05, "loss": 0.7854, "step": 4662 }, { "epoch": 0.712533903808687, "grad_norm": 0.30659839510917664, "learning_rate": 4.093110929018352e-05, "loss": 0.7274, "step": 4663 }, { "epoch": 0.7126867097069947, "grad_norm": 0.5424551963806152, "learning_rate": 4.0890841003860346e-05, "loss": 0.5983, "step": 4664 }, { "epoch": 0.7128395156053023, "grad_norm": 0.29840460419654846, "learning_rate": 4.085058744408796e-05, "loss": 0.836, "step": 4665 }, { "epoch": 0.71299232150361, "grad_norm": 0.25009292364120483, "learning_rate": 4.081034862089523e-05, "loss": 0.6681, "step": 4666 }, { "epoch": 0.7131451274019177, "grad_norm": 0.2948574125766754, "learning_rate": 4.07701245443073e-05, "loss": 0.7458, "step": 4667 }, { "epoch": 0.7132979333002254, "grad_norm": 0.2596202790737152, "learning_rate": 4.072991522434559e-05, "loss": 0.62, "step": 4668 }, { "epoch": 0.7134507391985331, "grad_norm": 0.29064345359802246, "learning_rate": 4.068972067102803e-05, "loss": 0.7136, "step": 4669 }, { "epoch": 0.7136035450968408, "grad_norm": 0.28021040558815, "learning_rate": 4.0649540894368666e-05, "loss": 0.5538, "step": 4670 }, { "epoch": 0.7137563509951484, "grad_norm": 0.2738005220890045, "learning_rate": 4.0609375904377975e-05, "loss": 0.6811, "step": 4671 }, { "epoch": 0.7139091568934561, "grad_norm": 0.23311470448970795, "learning_rate": 4.056922571106277e-05, "loss": 0.5184, "step": 4672 }, { "epoch": 0.7140619627917638, "grad_norm": 0.389201819896698, "learning_rate": 4.0529090324426125e-05, "loss": 0.7657, "step": 4673 }, { "epoch": 0.7142147686900714, "grad_norm": 0.27108216285705566, "learning_rate": 4.048896975446736e-05, "loss": 0.6359, "step": 4674 }, { "epoch": 0.7143675745883791, "grad_norm": 0.2944895625114441, "learning_rate": 4.044886401118223e-05, "loss": 0.6726, "step": 4675 }, { "epoch": 0.7145203804866868, "grad_norm": 0.27678659558296204, "learning_rate": 4.040877310456278e-05, "loss": 0.7396, "step": 4676 }, { "epoch": 0.7146731863849944, "grad_norm": 0.3424450755119324, "learning_rate": 4.036869704459729e-05, "loss": 0.6605, "step": 4677 }, { "epoch": 0.7148259922833021, "grad_norm": 0.9500914216041565, "learning_rate": 4.0328635841270346e-05, "loss": 0.593, "step": 4678 }, { "epoch": 0.7149787981816098, "grad_norm": 0.41044652462005615, "learning_rate": 4.0288589504562865e-05, "loss": 0.7482, "step": 4679 }, { "epoch": 0.7151316040799175, "grad_norm": 0.36469408869743347, "learning_rate": 4.024855804445213e-05, "loss": 0.7855, "step": 4680 }, { "epoch": 0.7152844099782252, "grad_norm": 0.24194401502609253, "learning_rate": 4.0208541470911584e-05, "loss": 0.6371, "step": 4681 }, { "epoch": 0.7154372158765329, "grad_norm": 0.2713262736797333, "learning_rate": 4.0168539793911e-05, "loss": 0.6222, "step": 4682 }, { "epoch": 0.7155900217748405, "grad_norm": 0.3250422477722168, "learning_rate": 4.012855302341647e-05, "loss": 0.8088, "step": 4683 }, { "epoch": 0.7157428276731482, "grad_norm": 0.3172820806503296, "learning_rate": 4.0088581169390424e-05, "loss": 0.7694, "step": 4684 }, { "epoch": 0.7158956335714558, "grad_norm": 0.3846489191055298, "learning_rate": 4.0048624241791464e-05, "loss": 0.8115, "step": 4685 }, { "epoch": 0.7160484394697635, "grad_norm": 0.2700871527194977, "learning_rate": 4.0008682250574504e-05, "loss": 0.7215, "step": 4686 }, { "epoch": 0.7162012453680712, "grad_norm": 0.33228370547294617, "learning_rate": 3.99687552056908e-05, "loss": 0.8418, "step": 4687 }, { "epoch": 0.7163540512663789, "grad_norm": 0.2891543209552765, "learning_rate": 3.992884311708779e-05, "loss": 0.52, "step": 4688 }, { "epoch": 0.7165068571646865, "grad_norm": 0.31329476833343506, "learning_rate": 3.9888945994709306e-05, "loss": 0.7719, "step": 4689 }, { "epoch": 0.7166596630629942, "grad_norm": 0.28452637791633606, "learning_rate": 3.9849063848495295e-05, "loss": 0.4992, "step": 4690 }, { "epoch": 0.7168124689613019, "grad_norm": 0.3185611069202423, "learning_rate": 3.9809196688382145e-05, "loss": 0.8112, "step": 4691 }, { "epoch": 0.7169652748596096, "grad_norm": 0.2967831790447235, "learning_rate": 3.9769344524302355e-05, "loss": 0.6697, "step": 4692 }, { "epoch": 0.7171180807579173, "grad_norm": 0.3396419584751129, "learning_rate": 3.972950736618482e-05, "loss": 0.8737, "step": 4693 }, { "epoch": 0.717270886656225, "grad_norm": 0.2848491966724396, "learning_rate": 3.968968522395459e-05, "loss": 0.7481, "step": 4694 }, { "epoch": 0.7174236925545326, "grad_norm": 0.3522728681564331, "learning_rate": 3.9649878107533e-05, "loss": 0.6715, "step": 4695 }, { "epoch": 0.7175764984528403, "grad_norm": 0.2862434685230255, "learning_rate": 3.961008602683768e-05, "loss": 0.4666, "step": 4696 }, { "epoch": 0.7177293043511479, "grad_norm": 0.32041534781455994, "learning_rate": 3.9570308991782534e-05, "loss": 0.9037, "step": 4697 }, { "epoch": 0.7178821102494556, "grad_norm": 0.29981473088264465, "learning_rate": 3.953054701227764e-05, "loss": 0.7015, "step": 4698 }, { "epoch": 0.7180349161477633, "grad_norm": 0.33403995633125305, "learning_rate": 3.949080009822933e-05, "loss": 0.5762, "step": 4699 }, { "epoch": 0.718187722046071, "grad_norm": 0.27824243903160095, "learning_rate": 3.9451068259540244e-05, "loss": 0.8935, "step": 4700 }, { "epoch": 0.7183405279443786, "grad_norm": 0.2845570743083954, "learning_rate": 3.941135150610929e-05, "loss": 0.7272, "step": 4701 }, { "epoch": 0.7184933338426863, "grad_norm": 0.34788352251052856, "learning_rate": 3.937164984783149e-05, "loss": 0.7927, "step": 4702 }, { "epoch": 0.718646139740994, "grad_norm": 0.3194750249385834, "learning_rate": 3.933196329459818e-05, "loss": 0.809, "step": 4703 }, { "epoch": 0.7187989456393017, "grad_norm": 0.3060329258441925, "learning_rate": 3.9292291856296945e-05, "loss": 0.7795, "step": 4704 }, { "epoch": 0.7189517515376094, "grad_norm": 0.26089486479759216, "learning_rate": 3.9252635542811645e-05, "loss": 0.5469, "step": 4705 }, { "epoch": 0.7191045574359171, "grad_norm": 0.32387828826904297, "learning_rate": 3.9212994364022224e-05, "loss": 0.713, "step": 4706 }, { "epoch": 0.7192573633342247, "grad_norm": 0.2914409339427948, "learning_rate": 3.917336832980504e-05, "loss": 0.663, "step": 4707 }, { "epoch": 0.7194101692325324, "grad_norm": 0.2582574486732483, "learning_rate": 3.913375745003254e-05, "loss": 0.6321, "step": 4708 }, { "epoch": 0.71956297513084, "grad_norm": 0.28708699345588684, "learning_rate": 3.909416173457341e-05, "loss": 0.642, "step": 4709 }, { "epoch": 0.7197157810291477, "grad_norm": 0.34295615553855896, "learning_rate": 3.905458119329262e-05, "loss": 0.672, "step": 4710 }, { "epoch": 0.7198685869274554, "grad_norm": 0.2741999924182892, "learning_rate": 3.9015015836051375e-05, "loss": 0.6434, "step": 4711 }, { "epoch": 0.720021392825763, "grad_norm": 0.27946770191192627, "learning_rate": 3.897546567270701e-05, "loss": 0.8582, "step": 4712 }, { "epoch": 0.7201741987240707, "grad_norm": 0.33194059133529663, "learning_rate": 3.893593071311309e-05, "loss": 0.6621, "step": 4713 }, { "epoch": 0.7203270046223784, "grad_norm": 0.335144966840744, "learning_rate": 3.8896410967119434e-05, "loss": 0.7997, "step": 4714 }, { "epoch": 0.7204798105206861, "grad_norm": 0.31013959646224976, "learning_rate": 3.8856906444572114e-05, "loss": 0.832, "step": 4715 }, { "epoch": 0.7206326164189938, "grad_norm": 0.3294447660446167, "learning_rate": 3.8817417155313295e-05, "loss": 0.7865, "step": 4716 }, { "epoch": 0.7207854223173015, "grad_norm": 0.4748566448688507, "learning_rate": 3.877794310918138e-05, "loss": 0.7879, "step": 4717 }, { "epoch": 0.7209382282156092, "grad_norm": 0.3431771695613861, "learning_rate": 3.873848431601102e-05, "loss": 0.6674, "step": 4718 }, { "epoch": 0.7210910341139168, "grad_norm": 0.2861068844795227, "learning_rate": 3.869904078563309e-05, "loss": 0.6783, "step": 4719 }, { "epoch": 0.7212438400122245, "grad_norm": 0.4855247735977173, "learning_rate": 3.8659612527874576e-05, "loss": 0.5126, "step": 4720 }, { "epoch": 0.7213966459105321, "grad_norm": 0.25908955931663513, "learning_rate": 3.8620199552558654e-05, "loss": 0.6448, "step": 4721 }, { "epoch": 0.7215494518088398, "grad_norm": 0.34213340282440186, "learning_rate": 3.8580801869504776e-05, "loss": 0.5608, "step": 4722 }, { "epoch": 0.7217022577071475, "grad_norm": 0.2678838074207306, "learning_rate": 3.8541419488528585e-05, "loss": 0.8313, "step": 4723 }, { "epoch": 0.7218550636054551, "grad_norm": 0.3597886562347412, "learning_rate": 3.8502052419441826e-05, "loss": 0.5639, "step": 4724 }, { "epoch": 0.7220078695037628, "grad_norm": 0.27323228120803833, "learning_rate": 3.846270067205244e-05, "loss": 0.7698, "step": 4725 }, { "epoch": 0.7221606754020705, "grad_norm": 0.38855069875717163, "learning_rate": 3.842336425616466e-05, "loss": 0.8291, "step": 4726 }, { "epoch": 0.7223134813003782, "grad_norm": 0.3744564354419708, "learning_rate": 3.838404318157875e-05, "loss": 0.6755, "step": 4727 }, { "epoch": 0.7224662871986859, "grad_norm": 0.448920339345932, "learning_rate": 3.834473745809131e-05, "loss": 0.6632, "step": 4728 }, { "epoch": 0.7226190930969936, "grad_norm": 0.40930065512657166, "learning_rate": 3.830544709549493e-05, "loss": 0.805, "step": 4729 }, { "epoch": 0.7227718989953013, "grad_norm": 0.33985939621925354, "learning_rate": 3.826617210357857e-05, "loss": 0.698, "step": 4730 }, { "epoch": 0.7229247048936089, "grad_norm": 0.3228268325328827, "learning_rate": 3.822691249212719e-05, "loss": 0.586, "step": 4731 }, { "epoch": 0.7230775107919166, "grad_norm": 0.2895890772342682, "learning_rate": 3.818766827092202e-05, "loss": 0.6855, "step": 4732 }, { "epoch": 0.7232303166902242, "grad_norm": 0.36268943548202515, "learning_rate": 3.8148439449740494e-05, "loss": 0.5884, "step": 4733 }, { "epoch": 0.7233831225885319, "grad_norm": 0.2500065863132477, "learning_rate": 3.810922603835602e-05, "loss": 0.6736, "step": 4734 }, { "epoch": 0.7235359284868396, "grad_norm": 0.34311750531196594, "learning_rate": 3.807002804653835e-05, "loss": 0.9017, "step": 4735 }, { "epoch": 0.7236887343851472, "grad_norm": 0.3456333577632904, "learning_rate": 3.803084548405335e-05, "loss": 0.5246, "step": 4736 }, { "epoch": 0.7238415402834549, "grad_norm": 0.2903348207473755, "learning_rate": 3.799167836066306e-05, "loss": 0.5095, "step": 4737 }, { "epoch": 0.7239943461817626, "grad_norm": 0.3143153488636017, "learning_rate": 3.7952526686125545e-05, "loss": 0.732, "step": 4738 }, { "epoch": 0.7241471520800703, "grad_norm": 0.3224197328090668, "learning_rate": 3.791339047019515e-05, "loss": 0.6884, "step": 4739 }, { "epoch": 0.724299957978378, "grad_norm": 0.297520250082016, "learning_rate": 3.7874269722622394e-05, "loss": 0.7355, "step": 4740 }, { "epoch": 0.7244527638766857, "grad_norm": 0.3008733093738556, "learning_rate": 3.7835164453153806e-05, "loss": 0.6028, "step": 4741 }, { "epoch": 0.7246055697749934, "grad_norm": 0.34514373540878296, "learning_rate": 3.779607467153219e-05, "loss": 0.8412, "step": 4742 }, { "epoch": 0.724758375673301, "grad_norm": 0.31843021512031555, "learning_rate": 3.775700038749639e-05, "loss": 0.722, "step": 4743 }, { "epoch": 0.7249111815716086, "grad_norm": 0.23146043717861176, "learning_rate": 3.7717941610781485e-05, "loss": 0.6464, "step": 4744 }, { "epoch": 0.7250639874699163, "grad_norm": 0.2926734387874603, "learning_rate": 3.7678898351118586e-05, "loss": 0.6496, "step": 4745 }, { "epoch": 0.725216793368224, "grad_norm": 0.33022475242614746, "learning_rate": 3.763987061823506e-05, "loss": 0.6851, "step": 4746 }, { "epoch": 0.7253695992665317, "grad_norm": 0.5083408951759338, "learning_rate": 3.760085842185431e-05, "loss": 0.6626, "step": 4747 }, { "epoch": 0.7255224051648393, "grad_norm": 0.2732957601547241, "learning_rate": 3.756186177169585e-05, "loss": 0.5641, "step": 4748 }, { "epoch": 0.725675211063147, "grad_norm": 0.27765700221061707, "learning_rate": 3.7522880677475415e-05, "loss": 0.7062, "step": 4749 }, { "epoch": 0.7258280169614547, "grad_norm": 0.2967795431613922, "learning_rate": 3.748391514890484e-05, "loss": 0.7701, "step": 4750 }, { "epoch": 0.7259808228597624, "grad_norm": 0.284739226102829, "learning_rate": 3.744496519569203e-05, "loss": 0.838, "step": 4751 }, { "epoch": 0.7261336287580701, "grad_norm": 0.28099575638771057, "learning_rate": 3.740603082754101e-05, "loss": 0.7573, "step": 4752 }, { "epoch": 0.7262864346563778, "grad_norm": 0.35599344968795776, "learning_rate": 3.7367112054151964e-05, "loss": 0.6278, "step": 4753 }, { "epoch": 0.7264392405546855, "grad_norm": 0.2718389630317688, "learning_rate": 3.732820888522124e-05, "loss": 0.6404, "step": 4754 }, { "epoch": 0.7265920464529931, "grad_norm": 0.2878887951374054, "learning_rate": 3.728932133044119e-05, "loss": 0.6494, "step": 4755 }, { "epoch": 0.7267448523513007, "grad_norm": 0.6028104424476624, "learning_rate": 3.725044939950029e-05, "loss": 0.7728, "step": 4756 }, { "epoch": 0.7268976582496084, "grad_norm": 0.28222134709358215, "learning_rate": 3.7211593102083186e-05, "loss": 0.7582, "step": 4757 }, { "epoch": 0.7270504641479161, "grad_norm": 0.29730039834976196, "learning_rate": 3.717275244787063e-05, "loss": 0.8627, "step": 4758 }, { "epoch": 0.7272032700462238, "grad_norm": 0.32104724645614624, "learning_rate": 3.713392744653942e-05, "loss": 0.7272, "step": 4759 }, { "epoch": 0.7273560759445314, "grad_norm": 0.2982363998889923, "learning_rate": 3.709511810776244e-05, "loss": 0.7101, "step": 4760 }, { "epoch": 0.7275088818428391, "grad_norm": 0.25882184505462646, "learning_rate": 3.7056324441208734e-05, "loss": 0.7863, "step": 4761 }, { "epoch": 0.7276616877411468, "grad_norm": 0.29439249634742737, "learning_rate": 3.7017546456543476e-05, "loss": 0.7806, "step": 4762 }, { "epoch": 0.7278144936394545, "grad_norm": 0.33841472864151, "learning_rate": 3.697878416342781e-05, "loss": 0.6692, "step": 4763 }, { "epoch": 0.7279672995377622, "grad_norm": 0.42538225650787354, "learning_rate": 3.694003757151904e-05, "loss": 0.5909, "step": 4764 }, { "epoch": 0.7281201054360699, "grad_norm": 0.30907660722732544, "learning_rate": 3.690130669047059e-05, "loss": 0.7845, "step": 4765 }, { "epoch": 0.7282729113343775, "grad_norm": 0.369582861661911, "learning_rate": 3.686259152993189e-05, "loss": 0.5686, "step": 4766 }, { "epoch": 0.7284257172326852, "grad_norm": 0.3360534906387329, "learning_rate": 3.6823892099548506e-05, "loss": 0.6306, "step": 4767 }, { "epoch": 0.7285785231309928, "grad_norm": 0.35224616527557373, "learning_rate": 3.6785208408962133e-05, "loss": 0.5248, "step": 4768 }, { "epoch": 0.7287313290293005, "grad_norm": 0.3557858467102051, "learning_rate": 3.674654046781044e-05, "loss": 0.8301, "step": 4769 }, { "epoch": 0.7288841349276082, "grad_norm": 0.3504233956336975, "learning_rate": 3.67078882857272e-05, "loss": 0.6455, "step": 4770 }, { "epoch": 0.7290369408259159, "grad_norm": 0.34271594882011414, "learning_rate": 3.666925187234229e-05, "loss": 0.9036, "step": 4771 }, { "epoch": 0.7291897467242235, "grad_norm": 0.26382726430892944, "learning_rate": 3.66306312372817e-05, "loss": 0.7683, "step": 4772 }, { "epoch": 0.7293425526225312, "grad_norm": 0.2812560498714447, "learning_rate": 3.6592026390167413e-05, "loss": 0.6227, "step": 4773 }, { "epoch": 0.7294953585208389, "grad_norm": 0.30799320340156555, "learning_rate": 3.6553437340617436e-05, "loss": 0.7421, "step": 4774 }, { "epoch": 0.7296481644191466, "grad_norm": 0.28414100408554077, "learning_rate": 3.651486409824597e-05, "loss": 0.6298, "step": 4775 }, { "epoch": 0.7298009703174543, "grad_norm": 0.34525686502456665, "learning_rate": 3.647630667266323e-05, "loss": 0.6816, "step": 4776 }, { "epoch": 0.729953776215762, "grad_norm": 0.28054291009902954, "learning_rate": 3.643776507347546e-05, "loss": 0.7858, "step": 4777 }, { "epoch": 0.7301065821140696, "grad_norm": 0.25552770495414734, "learning_rate": 3.639923931028493e-05, "loss": 0.6176, "step": 4778 }, { "epoch": 0.7302593880123773, "grad_norm": 0.2665732800960541, "learning_rate": 3.636072939269008e-05, "loss": 0.6894, "step": 4779 }, { "epoch": 0.7304121939106849, "grad_norm": 0.3016633987426758, "learning_rate": 3.632223533028525e-05, "loss": 0.8169, "step": 4780 }, { "epoch": 0.7305649998089926, "grad_norm": 0.31519678235054016, "learning_rate": 3.6283757132661e-05, "loss": 0.6808, "step": 4781 }, { "epoch": 0.7307178057073003, "grad_norm": 0.27059051394462585, "learning_rate": 3.624529480940379e-05, "loss": 0.6657, "step": 4782 }, { "epoch": 0.730870611605608, "grad_norm": 0.2791256308555603, "learning_rate": 3.6206848370096225e-05, "loss": 0.7948, "step": 4783 }, { "epoch": 0.7310234175039156, "grad_norm": 0.3773775100708008, "learning_rate": 3.616841782431687e-05, "loss": 0.7192, "step": 4784 }, { "epoch": 0.7311762234022233, "grad_norm": 0.287503719329834, "learning_rate": 3.6130003181640425e-05, "loss": 0.6652, "step": 4785 }, { "epoch": 0.731329029300531, "grad_norm": 0.3499451279640198, "learning_rate": 3.6091604451637516e-05, "loss": 0.6854, "step": 4786 }, { "epoch": 0.7314818351988387, "grad_norm": 0.331950306892395, "learning_rate": 3.605322164387493e-05, "loss": 0.9495, "step": 4787 }, { "epoch": 0.7316346410971464, "grad_norm": 0.2893081605434418, "learning_rate": 3.601485476791534e-05, "loss": 0.6678, "step": 4788 }, { "epoch": 0.7317874469954541, "grad_norm": 0.34236064553260803, "learning_rate": 3.597650383331762e-05, "loss": 0.6551, "step": 4789 }, { "epoch": 0.7319402528937617, "grad_norm": 0.33083659410476685, "learning_rate": 3.5938168849636544e-05, "loss": 0.8684, "step": 4790 }, { "epoch": 0.7320930587920694, "grad_norm": 0.23866380751132965, "learning_rate": 3.589984982642291e-05, "loss": 0.5983, "step": 4791 }, { "epoch": 0.732245864690377, "grad_norm": 0.28487899899482727, "learning_rate": 3.586154677322363e-05, "loss": 0.6288, "step": 4792 }, { "epoch": 0.7323986705886847, "grad_norm": 0.25790512561798096, "learning_rate": 3.582325969958157e-05, "loss": 0.6921, "step": 4793 }, { "epoch": 0.7325514764869924, "grad_norm": 0.5388302803039551, "learning_rate": 3.578498861503571e-05, "loss": 0.8078, "step": 4794 }, { "epoch": 0.7327042823853, "grad_norm": 0.37389835715293884, "learning_rate": 3.5746733529120826e-05, "loss": 0.7492, "step": 4795 }, { "epoch": 0.7328570882836077, "grad_norm": 0.26483863592147827, "learning_rate": 3.5708494451367936e-05, "loss": 0.8071, "step": 4796 }, { "epoch": 0.7330098941819154, "grad_norm": 0.35608604550361633, "learning_rate": 3.5670271391304e-05, "loss": 0.5763, "step": 4797 }, { "epoch": 0.7331627000802231, "grad_norm": 0.33788082003593445, "learning_rate": 3.563206435845196e-05, "loss": 0.657, "step": 4798 }, { "epoch": 0.7333155059785308, "grad_norm": 0.32307055592536926, "learning_rate": 3.559387336233071e-05, "loss": 0.8049, "step": 4799 }, { "epoch": 0.7334683118768385, "grad_norm": 0.28910204768180847, "learning_rate": 3.5555698412455284e-05, "loss": 0.8353, "step": 4800 }, { "epoch": 0.7336211177751462, "grad_norm": 0.26586541533470154, "learning_rate": 3.5517539518336676e-05, "loss": 0.7005, "step": 4801 }, { "epoch": 0.7337739236734538, "grad_norm": 0.27746787667274475, "learning_rate": 3.547939668948177e-05, "loss": 0.7271, "step": 4802 }, { "epoch": 0.7339267295717614, "grad_norm": 0.5220523476600647, "learning_rate": 3.544126993539362e-05, "loss": 0.7498, "step": 4803 }, { "epoch": 0.7340795354700691, "grad_norm": 0.39568111300468445, "learning_rate": 3.540315926557114e-05, "loss": 0.5222, "step": 4804 }, { "epoch": 0.7342323413683768, "grad_norm": 0.270342081785202, "learning_rate": 3.5365064689509254e-05, "loss": 0.4921, "step": 4805 }, { "epoch": 0.7343851472666845, "grad_norm": 0.29624781012535095, "learning_rate": 3.5326986216698944e-05, "loss": 0.7302, "step": 4806 }, { "epoch": 0.7345379531649922, "grad_norm": 0.2960861027240753, "learning_rate": 3.5288923856627164e-05, "loss": 0.7035, "step": 4807 }, { "epoch": 0.7346907590632998, "grad_norm": 0.3095923066139221, "learning_rate": 3.52508776187768e-05, "loss": 0.6334, "step": 4808 }, { "epoch": 0.7348435649616075, "grad_norm": 0.26498642563819885, "learning_rate": 3.5212847512626736e-05, "loss": 0.7783, "step": 4809 }, { "epoch": 0.7349963708599152, "grad_norm": 0.27664878964424133, "learning_rate": 3.517483354765187e-05, "loss": 0.5974, "step": 4810 }, { "epoch": 0.7351491767582229, "grad_norm": 0.6068941354751587, "learning_rate": 3.5136835733323105e-05, "loss": 0.9139, "step": 4811 }, { "epoch": 0.7353019826565306, "grad_norm": 0.36670371890068054, "learning_rate": 3.509885407910724e-05, "loss": 0.55, "step": 4812 }, { "epoch": 0.7354547885548383, "grad_norm": 0.30022528767585754, "learning_rate": 3.506088859446704e-05, "loss": 0.6745, "step": 4813 }, { "epoch": 0.7356075944531459, "grad_norm": 0.2669506371021271, "learning_rate": 3.5022939288861335e-05, "loss": 0.6979, "step": 4814 }, { "epoch": 0.7357604003514535, "grad_norm": 0.2814632058143616, "learning_rate": 3.4985006171744916e-05, "loss": 0.6519, "step": 4815 }, { "epoch": 0.7359132062497612, "grad_norm": 0.4638700485229492, "learning_rate": 3.4947089252568446e-05, "loss": 0.9276, "step": 4816 }, { "epoch": 0.7360660121480689, "grad_norm": 0.2916383743286133, "learning_rate": 3.490918854077859e-05, "loss": 0.8922, "step": 4817 }, { "epoch": 0.7362188180463766, "grad_norm": 0.29278457164764404, "learning_rate": 3.487130404581806e-05, "loss": 0.532, "step": 4818 }, { "epoch": 0.7363716239446843, "grad_norm": 0.27625879645347595, "learning_rate": 3.483343577712538e-05, "loss": 0.7354, "step": 4819 }, { "epoch": 0.7365244298429919, "grad_norm": 0.38489770889282227, "learning_rate": 3.47955837441352e-05, "loss": 0.7309, "step": 4820 }, { "epoch": 0.7366772357412996, "grad_norm": 0.30396920442581177, "learning_rate": 3.475774795627794e-05, "loss": 0.7055, "step": 4821 }, { "epoch": 0.7368300416396073, "grad_norm": 0.29432806372642517, "learning_rate": 3.4719928422980155e-05, "loss": 0.6346, "step": 4822 }, { "epoch": 0.736982847537915, "grad_norm": 0.4341113269329071, "learning_rate": 3.468212515366419e-05, "loss": 0.5119, "step": 4823 }, { "epoch": 0.7371356534362227, "grad_norm": 0.2815232276916504, "learning_rate": 3.464433815774848e-05, "loss": 0.7706, "step": 4824 }, { "epoch": 0.7372884593345304, "grad_norm": 0.28113171458244324, "learning_rate": 3.460656744464729e-05, "loss": 0.8289, "step": 4825 }, { "epoch": 0.737441265232838, "grad_norm": 0.4249742925167084, "learning_rate": 3.4568813023770905e-05, "loss": 0.7503, "step": 4826 }, { "epoch": 0.7375940711311456, "grad_norm": 0.285725861787796, "learning_rate": 3.4531074904525486e-05, "loss": 0.8374, "step": 4827 }, { "epoch": 0.7377468770294533, "grad_norm": 0.29470476508140564, "learning_rate": 3.44933530963132e-05, "loss": 0.6421, "step": 4828 }, { "epoch": 0.737899682927761, "grad_norm": 0.2831245958805084, "learning_rate": 3.445564760853216e-05, "loss": 0.5626, "step": 4829 }, { "epoch": 0.7380524888260687, "grad_norm": 0.333756685256958, "learning_rate": 3.441795845057627e-05, "loss": 0.6658, "step": 4830 }, { "epoch": 0.7382052947243763, "grad_norm": 0.25924742221832275, "learning_rate": 3.438028563183552e-05, "loss": 0.7106, "step": 4831 }, { "epoch": 0.738358100622684, "grad_norm": 0.33355987071990967, "learning_rate": 3.434262916169577e-05, "loss": 0.6727, "step": 4832 }, { "epoch": 0.7385109065209917, "grad_norm": 0.856724739074707, "learning_rate": 3.430498904953886e-05, "loss": 0.7553, "step": 4833 }, { "epoch": 0.7386637124192994, "grad_norm": 0.27116596698760986, "learning_rate": 3.426736530474247e-05, "loss": 0.6955, "step": 4834 }, { "epoch": 0.7388165183176071, "grad_norm": 0.31083372235298157, "learning_rate": 3.4229757936680195e-05, "loss": 0.5857, "step": 4835 }, { "epoch": 0.7389693242159148, "grad_norm": 0.29667478799819946, "learning_rate": 3.419216695472168e-05, "loss": 0.6607, "step": 4836 }, { "epoch": 0.7391221301142225, "grad_norm": 0.4294913709163666, "learning_rate": 3.415459236823233e-05, "loss": 0.4775, "step": 4837 }, { "epoch": 0.7392749360125301, "grad_norm": 0.27344828844070435, "learning_rate": 3.4117034186573594e-05, "loss": 0.6111, "step": 4838 }, { "epoch": 0.7394277419108377, "grad_norm": 0.3142082691192627, "learning_rate": 3.407949241910272e-05, "loss": 0.6906, "step": 4839 }, { "epoch": 0.7395805478091454, "grad_norm": 0.2933219373226166, "learning_rate": 3.4041967075172995e-05, "loss": 0.6802, "step": 4840 }, { "epoch": 0.7397333537074531, "grad_norm": 0.30935943126678467, "learning_rate": 3.400445816413348e-05, "loss": 0.7207, "step": 4841 }, { "epoch": 0.7398861596057608, "grad_norm": 0.33251291513442993, "learning_rate": 3.396696569532926e-05, "loss": 0.7258, "step": 4842 }, { "epoch": 0.7400389655040684, "grad_norm": 0.32766956090927124, "learning_rate": 3.3929489678101236e-05, "loss": 0.6056, "step": 4843 }, { "epoch": 0.7401917714023761, "grad_norm": 0.29472458362579346, "learning_rate": 3.38920301217862e-05, "loss": 0.7408, "step": 4844 }, { "epoch": 0.7403445773006838, "grad_norm": 0.3219550549983978, "learning_rate": 3.385458703571696e-05, "loss": 0.7757, "step": 4845 }, { "epoch": 0.7404973831989915, "grad_norm": 0.42171233892440796, "learning_rate": 3.381716042922213e-05, "loss": 0.5873, "step": 4846 }, { "epoch": 0.7406501890972992, "grad_norm": 0.4623895287513733, "learning_rate": 3.3779750311626235e-05, "loss": 0.708, "step": 4847 }, { "epoch": 0.7408029949956069, "grad_norm": 0.3930194675922394, "learning_rate": 3.374235669224965e-05, "loss": 0.6904, "step": 4848 }, { "epoch": 0.7409558008939146, "grad_norm": 0.31731662154197693, "learning_rate": 3.37049795804087e-05, "loss": 0.8618, "step": 4849 }, { "epoch": 0.7411086067922221, "grad_norm": 0.35052576661109924, "learning_rate": 3.3667618985415625e-05, "loss": 0.8385, "step": 4850 }, { "epoch": 0.7412614126905298, "grad_norm": 0.544321596622467, "learning_rate": 3.3630274916578483e-05, "loss": 0.6843, "step": 4851 }, { "epoch": 0.7414142185888375, "grad_norm": 0.2999391555786133, "learning_rate": 3.359294738320118e-05, "loss": 0.623, "step": 4852 }, { "epoch": 0.7415670244871452, "grad_norm": 0.29683953523635864, "learning_rate": 3.35556363945836e-05, "loss": 0.6822, "step": 4853 }, { "epoch": 0.7417198303854529, "grad_norm": 0.43165406584739685, "learning_rate": 3.3518341960021504e-05, "loss": 0.7974, "step": 4854 }, { "epoch": 0.7418726362837605, "grad_norm": 0.3263550102710724, "learning_rate": 3.348106408880643e-05, "loss": 0.7315, "step": 4855 }, { "epoch": 0.7420254421820682, "grad_norm": 0.2833004891872406, "learning_rate": 3.344380279022584e-05, "loss": 0.5614, "step": 4856 }, { "epoch": 0.7421782480803759, "grad_norm": 0.3153781592845917, "learning_rate": 3.340655807356313e-05, "loss": 0.8439, "step": 4857 }, { "epoch": 0.7423310539786836, "grad_norm": 0.28415146470069885, "learning_rate": 3.336932994809744e-05, "loss": 0.7368, "step": 4858 }, { "epoch": 0.7424838598769913, "grad_norm": 0.33800607919692993, "learning_rate": 3.333211842310391e-05, "loss": 0.6789, "step": 4859 }, { "epoch": 0.742636665775299, "grad_norm": 0.30534127354621887, "learning_rate": 3.329492350785342e-05, "loss": 0.868, "step": 4860 }, { "epoch": 0.7427894716736066, "grad_norm": 0.28079915046691895, "learning_rate": 3.325774521161282e-05, "loss": 0.6768, "step": 4861 }, { "epoch": 0.7429422775719142, "grad_norm": 0.36504265666007996, "learning_rate": 3.3220583543644724e-05, "loss": 0.6346, "step": 4862 }, { "epoch": 0.7430950834702219, "grad_norm": 0.3121730387210846, "learning_rate": 3.3183438513207676e-05, "loss": 0.5665, "step": 4863 }, { "epoch": 0.7432478893685296, "grad_norm": 0.28789499402046204, "learning_rate": 3.314631012955608e-05, "loss": 0.8213, "step": 4864 }, { "epoch": 0.7434006952668373, "grad_norm": 0.4110698103904724, "learning_rate": 3.310919840194013e-05, "loss": 0.6911, "step": 4865 }, { "epoch": 0.743553501165145, "grad_norm": 0.4067875146865845, "learning_rate": 3.3072103339605866e-05, "loss": 0.7366, "step": 4866 }, { "epoch": 0.7437063070634526, "grad_norm": 0.27583467960357666, "learning_rate": 3.3035024951795246e-05, "loss": 0.639, "step": 4867 }, { "epoch": 0.7438591129617603, "grad_norm": 0.2784540355205536, "learning_rate": 3.2997963247746075e-05, "loss": 0.5332, "step": 4868 }, { "epoch": 0.744011918860068, "grad_norm": 0.4741950035095215, "learning_rate": 3.2960918236691926e-05, "loss": 0.6251, "step": 4869 }, { "epoch": 0.7441647247583757, "grad_norm": 0.31669479608535767, "learning_rate": 3.2923889927862227e-05, "loss": 0.7696, "step": 4870 }, { "epoch": 0.7443175306566834, "grad_norm": 0.4130287170410156, "learning_rate": 3.2886878330482296e-05, "loss": 0.9864, "step": 4871 }, { "epoch": 0.7444703365549911, "grad_norm": 0.25285977125167847, "learning_rate": 3.28498834537733e-05, "loss": 0.6078, "step": 4872 }, { "epoch": 0.7446231424532987, "grad_norm": 0.25762438774108887, "learning_rate": 3.281290530695217e-05, "loss": 0.804, "step": 4873 }, { "epoch": 0.7447759483516063, "grad_norm": 0.2802187204360962, "learning_rate": 3.2775943899231654e-05, "loss": 0.6797, "step": 4874 }, { "epoch": 0.744928754249914, "grad_norm": 0.3005053997039795, "learning_rate": 3.273899923982047e-05, "loss": 0.6974, "step": 4875 }, { "epoch": 0.7450815601482217, "grad_norm": 0.2399023026227951, "learning_rate": 3.270207133792297e-05, "loss": 0.6692, "step": 4876 }, { "epoch": 0.7452343660465294, "grad_norm": 0.24347561597824097, "learning_rate": 3.266516020273952e-05, "loss": 0.6747, "step": 4877 }, { "epoch": 0.7453871719448371, "grad_norm": 0.31855449080467224, "learning_rate": 3.262826584346616e-05, "loss": 0.6217, "step": 4878 }, { "epoch": 0.7455399778431447, "grad_norm": 0.2576698958873749, "learning_rate": 3.259138826929484e-05, "loss": 0.9534, "step": 4879 }, { "epoch": 0.7456927837414524, "grad_norm": 0.3313029706478119, "learning_rate": 3.255452748941327e-05, "loss": 0.7103, "step": 4880 }, { "epoch": 0.7458455896397601, "grad_norm": 0.44755053520202637, "learning_rate": 3.251768351300506e-05, "loss": 0.6426, "step": 4881 }, { "epoch": 0.7459983955380678, "grad_norm": 0.25972044467926025, "learning_rate": 3.248085634924952e-05, "loss": 0.683, "step": 4882 }, { "epoch": 0.7461512014363755, "grad_norm": 0.27888035774230957, "learning_rate": 3.2444046007321836e-05, "loss": 0.6486, "step": 4883 }, { "epoch": 0.7463040073346832, "grad_norm": 0.3077915906906128, "learning_rate": 3.2407252496393006e-05, "loss": 0.6959, "step": 4884 }, { "epoch": 0.7464568132329908, "grad_norm": 0.28220564126968384, "learning_rate": 3.2370475825629844e-05, "loss": 0.5924, "step": 4885 }, { "epoch": 0.7466096191312984, "grad_norm": 0.3948167860507965, "learning_rate": 3.233371600419495e-05, "loss": 0.6198, "step": 4886 }, { "epoch": 0.7467624250296061, "grad_norm": 0.30583393573760986, "learning_rate": 3.229697304124666e-05, "loss": 0.8064, "step": 4887 }, { "epoch": 0.7469152309279138, "grad_norm": 0.38143858313560486, "learning_rate": 3.226024694593922e-05, "loss": 0.7402, "step": 4888 }, { "epoch": 0.7470680368262215, "grad_norm": 0.23864120244979858, "learning_rate": 3.222353772742267e-05, "loss": 0.6484, "step": 4889 }, { "epoch": 0.7472208427245292, "grad_norm": 0.2785201668739319, "learning_rate": 3.2186845394842766e-05, "loss": 0.5658, "step": 4890 }, { "epoch": 0.7473736486228368, "grad_norm": 0.2768336534500122, "learning_rate": 3.215016995734105e-05, "loss": 0.6543, "step": 4891 }, { "epoch": 0.7475264545211445, "grad_norm": 0.2642340064048767, "learning_rate": 3.211351142405494e-05, "loss": 0.8071, "step": 4892 }, { "epoch": 0.7476792604194522, "grad_norm": 0.30296286940574646, "learning_rate": 3.207686980411765e-05, "loss": 0.7364, "step": 4893 }, { "epoch": 0.7478320663177599, "grad_norm": 0.2418510764837265, "learning_rate": 3.204024510665804e-05, "loss": 0.7106, "step": 4894 }, { "epoch": 0.7479848722160676, "grad_norm": 0.28688812255859375, "learning_rate": 3.200363734080093e-05, "loss": 0.7859, "step": 4895 }, { "epoch": 0.7481376781143753, "grad_norm": 0.29675769805908203, "learning_rate": 3.19670465156668e-05, "loss": 0.6998, "step": 4896 }, { "epoch": 0.7482904840126829, "grad_norm": 0.26952260732650757, "learning_rate": 3.19304726403719e-05, "loss": 0.6482, "step": 4897 }, { "epoch": 0.7484432899109905, "grad_norm": 0.32863569259643555, "learning_rate": 3.189391572402836e-05, "loss": 0.7665, "step": 4898 }, { "epoch": 0.7485960958092982, "grad_norm": 0.2946239113807678, "learning_rate": 3.185737577574405e-05, "loss": 0.788, "step": 4899 }, { "epoch": 0.7487489017076059, "grad_norm": 0.37878304719924927, "learning_rate": 3.182085280462256e-05, "loss": 0.98, "step": 4900 }, { "epoch": 0.7489017076059136, "grad_norm": 0.9184777736663818, "learning_rate": 3.178434681976324e-05, "loss": 0.8426, "step": 4901 }, { "epoch": 0.7490545135042213, "grad_norm": 0.33207786083221436, "learning_rate": 3.1747857830261306e-05, "loss": 0.8065, "step": 4902 }, { "epoch": 0.7492073194025289, "grad_norm": 0.27969595789909363, "learning_rate": 3.171138584520769e-05, "loss": 0.6431, "step": 4903 }, { "epoch": 0.7493601253008366, "grad_norm": 0.39739498496055603, "learning_rate": 3.167493087368906e-05, "loss": 0.7623, "step": 4904 }, { "epoch": 0.7495129311991443, "grad_norm": 0.23574230074882507, "learning_rate": 3.163849292478783e-05, "loss": 0.5813, "step": 4905 }, { "epoch": 0.749665737097452, "grad_norm": 0.3343149721622467, "learning_rate": 3.160207200758226e-05, "loss": 0.5977, "step": 4906 }, { "epoch": 0.7498185429957597, "grad_norm": 0.27852675318717957, "learning_rate": 3.156566813114632e-05, "loss": 0.8232, "step": 4907 }, { "epoch": 0.7499713488940674, "grad_norm": 0.25333601236343384, "learning_rate": 3.152928130454972e-05, "loss": 0.7445, "step": 4908 }, { "epoch": 0.7501241547923749, "grad_norm": 0.30783987045288086, "learning_rate": 3.1492911536857886e-05, "loss": 0.6641, "step": 4909 }, { "epoch": 0.7502769606906826, "grad_norm": 0.26670366525650024, "learning_rate": 3.1456558837132065e-05, "loss": 0.6984, "step": 4910 }, { "epoch": 0.7504297665889903, "grad_norm": 0.2689191401004791, "learning_rate": 3.142022321442929e-05, "loss": 0.731, "step": 4911 }, { "epoch": 0.750582572487298, "grad_norm": 0.28377169370651245, "learning_rate": 3.138390467780221e-05, "loss": 0.5752, "step": 4912 }, { "epoch": 0.7507353783856057, "grad_norm": 0.29742759466171265, "learning_rate": 3.134760323629928e-05, "loss": 0.7233, "step": 4913 }, { "epoch": 0.7508881842839134, "grad_norm": 0.33162474632263184, "learning_rate": 3.131131889896475e-05, "loss": 0.863, "step": 4914 }, { "epoch": 0.751040990182221, "grad_norm": 0.4513492286205292, "learning_rate": 3.127505167483848e-05, "loss": 0.5617, "step": 4915 }, { "epoch": 0.7511937960805287, "grad_norm": 0.325539231300354, "learning_rate": 3.1238801572956246e-05, "loss": 0.703, "step": 4916 }, { "epoch": 0.7513466019788364, "grad_norm": 0.3028418719768524, "learning_rate": 3.120256860234936e-05, "loss": 0.7182, "step": 4917 }, { "epoch": 0.7514994078771441, "grad_norm": 0.33599185943603516, "learning_rate": 3.116635277204503e-05, "loss": 0.6111, "step": 4918 }, { "epoch": 0.7516522137754518, "grad_norm": 0.3110974431037903, "learning_rate": 3.1130154091066074e-05, "loss": 0.7623, "step": 4919 }, { "epoch": 0.7518050196737595, "grad_norm": 0.2833666205406189, "learning_rate": 3.109397256843114e-05, "loss": 0.7789, "step": 4920 }, { "epoch": 0.751957825572067, "grad_norm": 0.9882724285125732, "learning_rate": 3.1057808213154535e-05, "loss": 0.5325, "step": 4921 }, { "epoch": 0.7521106314703747, "grad_norm": 0.2542802691459656, "learning_rate": 3.102166103424626e-05, "loss": 0.7007, "step": 4922 }, { "epoch": 0.7522634373686824, "grad_norm": 0.27196669578552246, "learning_rate": 3.0985531040712125e-05, "loss": 0.7287, "step": 4923 }, { "epoch": 0.7524162432669901, "grad_norm": 0.27083972096443176, "learning_rate": 3.0949418241553605e-05, "loss": 0.6532, "step": 4924 }, { "epoch": 0.7525690491652978, "grad_norm": 0.3268282413482666, "learning_rate": 3.091332264576796e-05, "loss": 0.7181, "step": 4925 }, { "epoch": 0.7527218550636054, "grad_norm": 0.4242473840713501, "learning_rate": 3.0877244262347995e-05, "loss": 0.7455, "step": 4926 }, { "epoch": 0.7528746609619131, "grad_norm": 0.3833047151565552, "learning_rate": 3.084118310028238e-05, "loss": 0.6763, "step": 4927 }, { "epoch": 0.7530274668602208, "grad_norm": 0.47573891282081604, "learning_rate": 3.0805139168555485e-05, "loss": 0.6871, "step": 4928 }, { "epoch": 0.7531802727585285, "grad_norm": 0.339206725358963, "learning_rate": 3.076911247614731e-05, "loss": 0.765, "step": 4929 }, { "epoch": 0.7533330786568362, "grad_norm": 0.2713732421398163, "learning_rate": 3.073310303203364e-05, "loss": 0.6879, "step": 4930 }, { "epoch": 0.7534858845551439, "grad_norm": 0.38381505012512207, "learning_rate": 3.069711084518588e-05, "loss": 0.8672, "step": 4931 }, { "epoch": 0.7536386904534516, "grad_norm": 0.3012462258338928, "learning_rate": 3.066113592457124e-05, "loss": 0.8056, "step": 4932 }, { "epoch": 0.7537914963517591, "grad_norm": 0.295955091714859, "learning_rate": 3.0625178279152514e-05, "loss": 0.6531, "step": 4933 }, { "epoch": 0.7539443022500668, "grad_norm": 0.2634066641330719, "learning_rate": 3.058923791788829e-05, "loss": 0.6273, "step": 4934 }, { "epoch": 0.7540971081483745, "grad_norm": 0.33165842294692993, "learning_rate": 3.055331484973276e-05, "loss": 0.6211, "step": 4935 }, { "epoch": 0.7542499140466822, "grad_norm": 0.37168869376182556, "learning_rate": 3.0517409083635906e-05, "loss": 0.8095, "step": 4936 }, { "epoch": 0.7544027199449899, "grad_norm": 0.30196696519851685, "learning_rate": 3.0481520628543303e-05, "loss": 0.6351, "step": 4937 }, { "epoch": 0.7545555258432975, "grad_norm": 0.2617061138153076, "learning_rate": 3.044564949339631e-05, "loss": 0.599, "step": 4938 }, { "epoch": 0.7547083317416052, "grad_norm": 0.28290948271751404, "learning_rate": 3.040979568713189e-05, "loss": 0.761, "step": 4939 }, { "epoch": 0.7548611376399129, "grad_norm": 0.2870292663574219, "learning_rate": 3.037395921868269e-05, "loss": 0.7592, "step": 4940 }, { "epoch": 0.7550139435382206, "grad_norm": 0.30262330174446106, "learning_rate": 3.0338140096977086e-05, "loss": 0.5503, "step": 4941 }, { "epoch": 0.7551667494365283, "grad_norm": 0.3084609806537628, "learning_rate": 3.030233833093915e-05, "loss": 0.7879, "step": 4942 }, { "epoch": 0.755319555334836, "grad_norm": 0.4206237494945526, "learning_rate": 3.0266553929488563e-05, "loss": 0.6484, "step": 4943 }, { "epoch": 0.7554723612331437, "grad_norm": 0.2730026841163635, "learning_rate": 3.0230786901540677e-05, "loss": 0.7605, "step": 4944 }, { "epoch": 0.7556251671314512, "grad_norm": 0.29598739743232727, "learning_rate": 3.0195037256006563e-05, "loss": 0.7792, "step": 4945 }, { "epoch": 0.7557779730297589, "grad_norm": 0.31024444103240967, "learning_rate": 3.0159305001793004e-05, "loss": 0.6362, "step": 4946 }, { "epoch": 0.7559307789280666, "grad_norm": 0.2978576421737671, "learning_rate": 3.012359014780234e-05, "loss": 0.5886, "step": 4947 }, { "epoch": 0.7560835848263743, "grad_norm": 0.2910394072532654, "learning_rate": 3.0087892702932584e-05, "loss": 0.575, "step": 4948 }, { "epoch": 0.756236390724682, "grad_norm": 0.3117895722389221, "learning_rate": 3.0052212676077517e-05, "loss": 0.565, "step": 4949 }, { "epoch": 0.7563891966229896, "grad_norm": 0.27814364433288574, "learning_rate": 3.0016550076126527e-05, "loss": 0.5543, "step": 4950 }, { "epoch": 0.7565420025212973, "grad_norm": 0.27082252502441406, "learning_rate": 2.9980904911964637e-05, "loss": 0.5369, "step": 4951 }, { "epoch": 0.756694808419605, "grad_norm": 0.2814607322216034, "learning_rate": 2.9945277192472486e-05, "loss": 0.779, "step": 4952 }, { "epoch": 0.7568476143179127, "grad_norm": 0.47501417994499207, "learning_rate": 2.9909666926526515e-05, "loss": 0.6097, "step": 4953 }, { "epoch": 0.7570004202162204, "grad_norm": 0.31489819288253784, "learning_rate": 2.987407412299863e-05, "loss": 0.7388, "step": 4954 }, { "epoch": 0.7571532261145281, "grad_norm": 0.26809048652648926, "learning_rate": 2.983849879075652e-05, "loss": 0.5423, "step": 4955 }, { "epoch": 0.7573060320128358, "grad_norm": 0.30268001556396484, "learning_rate": 2.9802940938663526e-05, "loss": 0.688, "step": 4956 }, { "epoch": 0.7574588379111433, "grad_norm": 0.289115846157074, "learning_rate": 2.976740057557854e-05, "loss": 0.6581, "step": 4957 }, { "epoch": 0.757611643809451, "grad_norm": 0.2887480854988098, "learning_rate": 2.9731877710356117e-05, "loss": 0.6738, "step": 4958 }, { "epoch": 0.7577644497077587, "grad_norm": 0.2813575863838196, "learning_rate": 2.9696372351846515e-05, "loss": 0.6847, "step": 4959 }, { "epoch": 0.7579172556060664, "grad_norm": 0.4002649188041687, "learning_rate": 2.9660884508895635e-05, "loss": 0.6783, "step": 4960 }, { "epoch": 0.7580700615043741, "grad_norm": 0.3163740634918213, "learning_rate": 2.9625414190344923e-05, "loss": 0.6138, "step": 4961 }, { "epoch": 0.7582228674026817, "grad_norm": 0.338833212852478, "learning_rate": 2.9589961405031507e-05, "loss": 0.5459, "step": 4962 }, { "epoch": 0.7583756733009894, "grad_norm": 0.28364452719688416, "learning_rate": 2.9554526161788166e-05, "loss": 0.6336, "step": 4963 }, { "epoch": 0.7585284791992971, "grad_norm": 0.26455238461494446, "learning_rate": 2.9519108469443313e-05, "loss": 0.6763, "step": 4964 }, { "epoch": 0.7586812850976048, "grad_norm": 0.27129480242729187, "learning_rate": 2.948370833682096e-05, "loss": 0.6584, "step": 4965 }, { "epoch": 0.7588340909959125, "grad_norm": 0.3427901864051819, "learning_rate": 2.9448325772740713e-05, "loss": 0.8716, "step": 4966 }, { "epoch": 0.7589868968942202, "grad_norm": 0.3092363476753235, "learning_rate": 2.9412960786017906e-05, "loss": 0.6354, "step": 4967 }, { "epoch": 0.7591397027925277, "grad_norm": 0.27754339575767517, "learning_rate": 2.9377613385463366e-05, "loss": 0.5946, "step": 4968 }, { "epoch": 0.7592925086908354, "grad_norm": 0.28938060998916626, "learning_rate": 2.9342283579883644e-05, "loss": 0.7985, "step": 4969 }, { "epoch": 0.7594453145891431, "grad_norm": 0.33980458974838257, "learning_rate": 2.930697137808084e-05, "loss": 0.7286, "step": 4970 }, { "epoch": 0.7595981204874508, "grad_norm": 0.40130120515823364, "learning_rate": 2.927167678885272e-05, "loss": 0.8292, "step": 4971 }, { "epoch": 0.7597509263857585, "grad_norm": 0.2771167457103729, "learning_rate": 2.9236399820992587e-05, "loss": 0.6023, "step": 4972 }, { "epoch": 0.7599037322840662, "grad_norm": 0.3820517361164093, "learning_rate": 2.9201140483289468e-05, "loss": 0.6311, "step": 4973 }, { "epoch": 0.7600565381823738, "grad_norm": 0.2943771183490753, "learning_rate": 2.9165898784527858e-05, "loss": 0.513, "step": 4974 }, { "epoch": 0.7602093440806815, "grad_norm": 0.38422003388404846, "learning_rate": 2.9130674733488006e-05, "loss": 0.9081, "step": 4975 }, { "epoch": 0.7603621499789892, "grad_norm": 0.6306685209274292, "learning_rate": 2.909546833894561e-05, "loss": 0.7427, "step": 4976 }, { "epoch": 0.7605149558772969, "grad_norm": 0.2573539614677429, "learning_rate": 2.9060279609672126e-05, "loss": 0.4403, "step": 4977 }, { "epoch": 0.7606677617756046, "grad_norm": 0.2672884166240692, "learning_rate": 2.902510855443449e-05, "loss": 0.5494, "step": 4978 }, { "epoch": 0.7608205676739123, "grad_norm": 0.2918144762516022, "learning_rate": 2.8989955181995243e-05, "loss": 0.5941, "step": 4979 }, { "epoch": 0.7609733735722198, "grad_norm": 0.27063295245170593, "learning_rate": 2.8954819501112584e-05, "loss": 0.6396, "step": 4980 }, { "epoch": 0.7611261794705275, "grad_norm": 0.2645648717880249, "learning_rate": 2.891970152054031e-05, "loss": 0.9103, "step": 4981 }, { "epoch": 0.7612789853688352, "grad_norm": 0.31377100944519043, "learning_rate": 2.888460124902774e-05, "loss": 0.6627, "step": 4982 }, { "epoch": 0.7614317912671429, "grad_norm": 0.2497723251581192, "learning_rate": 2.8849518695319776e-05, "loss": 0.7019, "step": 4983 }, { "epoch": 0.7615845971654506, "grad_norm": 0.5569624304771423, "learning_rate": 2.8814453868156978e-05, "loss": 0.6643, "step": 4984 }, { "epoch": 0.7617374030637583, "grad_norm": 0.4809087812900543, "learning_rate": 2.8779406776275475e-05, "loss": 0.7912, "step": 4985 }, { "epoch": 0.7618902089620659, "grad_norm": 0.31673797965049744, "learning_rate": 2.8744377428406933e-05, "loss": 0.5688, "step": 4986 }, { "epoch": 0.7620430148603736, "grad_norm": 0.30070045590400696, "learning_rate": 2.870936583327858e-05, "loss": 0.6979, "step": 4987 }, { "epoch": 0.7621958207586813, "grad_norm": 0.3410513699054718, "learning_rate": 2.8674371999613314e-05, "loss": 0.6147, "step": 4988 }, { "epoch": 0.762348626656989, "grad_norm": 0.3067401051521301, "learning_rate": 2.8639395936129553e-05, "loss": 0.8445, "step": 4989 }, { "epoch": 0.7625014325552967, "grad_norm": 0.26025477051734924, "learning_rate": 2.860443765154126e-05, "loss": 0.6062, "step": 4990 }, { "epoch": 0.7626542384536044, "grad_norm": 0.25668859481811523, "learning_rate": 2.8569497154558034e-05, "loss": 0.6773, "step": 4991 }, { "epoch": 0.7628070443519119, "grad_norm": 0.33201903104782104, "learning_rate": 2.8534574453885e-05, "loss": 0.5587, "step": 4992 }, { "epoch": 0.7629598502502196, "grad_norm": 0.3629027009010315, "learning_rate": 2.8499669558222796e-05, "loss": 0.6214, "step": 4993 }, { "epoch": 0.7631126561485273, "grad_norm": 0.3354741632938385, "learning_rate": 2.8464782476267737e-05, "loss": 0.736, "step": 4994 }, { "epoch": 0.763265462046835, "grad_norm": 0.29513174295425415, "learning_rate": 2.8429913216711678e-05, "loss": 0.8077, "step": 4995 }, { "epoch": 0.7634182679451427, "grad_norm": 0.2917438745498657, "learning_rate": 2.839506178824196e-05, "loss": 0.7234, "step": 4996 }, { "epoch": 0.7635710738434504, "grad_norm": 0.2602333426475525, "learning_rate": 2.8360228199541494e-05, "loss": 0.5751, "step": 4997 }, { "epoch": 0.763723879741758, "grad_norm": 0.3598625659942627, "learning_rate": 2.8325412459288814e-05, "loss": 0.6023, "step": 4998 }, { "epoch": 0.7638766856400657, "grad_norm": 0.30226266384124756, "learning_rate": 2.8290614576157992e-05, "loss": 0.7538, "step": 4999 }, { "epoch": 0.7640294915383734, "grad_norm": 0.2980852425098419, "learning_rate": 2.8255834558818607e-05, "loss": 0.7214, "step": 5000 }, { "epoch": 0.7641822974366811, "grad_norm": 0.30146270990371704, "learning_rate": 2.8221072415935766e-05, "loss": 0.6857, "step": 5001 }, { "epoch": 0.7643351033349888, "grad_norm": 0.29795554280281067, "learning_rate": 2.8186328156170217e-05, "loss": 0.9127, "step": 5002 }, { "epoch": 0.7644879092332965, "grad_norm": 0.3890931010246277, "learning_rate": 2.8151601788178207e-05, "loss": 0.6883, "step": 5003 }, { "epoch": 0.764640715131604, "grad_norm": 0.27632567286491394, "learning_rate": 2.8116893320611494e-05, "loss": 0.697, "step": 5004 }, { "epoch": 0.7647935210299117, "grad_norm": 0.2864340841770172, "learning_rate": 2.8082202762117382e-05, "loss": 0.5696, "step": 5005 }, { "epoch": 0.7649463269282194, "grad_norm": 0.29483935236930847, "learning_rate": 2.8047530121338795e-05, "loss": 0.7518, "step": 5006 }, { "epoch": 0.7650991328265271, "grad_norm": 0.29760611057281494, "learning_rate": 2.801287540691404e-05, "loss": 0.8129, "step": 5007 }, { "epoch": 0.7652519387248348, "grad_norm": 0.2790490984916687, "learning_rate": 2.797823862747715e-05, "loss": 0.7121, "step": 5008 }, { "epoch": 0.7654047446231425, "grad_norm": 0.3155994117259979, "learning_rate": 2.7943619791657494e-05, "loss": 0.7546, "step": 5009 }, { "epoch": 0.7655575505214501, "grad_norm": 0.34615352749824524, "learning_rate": 2.7909018908080153e-05, "loss": 0.5962, "step": 5010 }, { "epoch": 0.7657103564197578, "grad_norm": 0.2994769513607025, "learning_rate": 2.7874435985365555e-05, "loss": 0.9641, "step": 5011 }, { "epoch": 0.7658631623180655, "grad_norm": 0.29071369767189026, "learning_rate": 2.7839871032129828e-05, "loss": 0.5396, "step": 5012 }, { "epoch": 0.7660159682163732, "grad_norm": 0.31707438826560974, "learning_rate": 2.7805324056984482e-05, "loss": 0.9124, "step": 5013 }, { "epoch": 0.7661687741146809, "grad_norm": 0.43815508484840393, "learning_rate": 2.777079506853665e-05, "loss": 0.5922, "step": 5014 }, { "epoch": 0.7663215800129884, "grad_norm": 0.341488242149353, "learning_rate": 2.7736284075388884e-05, "loss": 0.8211, "step": 5015 }, { "epoch": 0.7664743859112961, "grad_norm": 0.4834541082382202, "learning_rate": 2.770179108613935e-05, "loss": 0.4068, "step": 5016 }, { "epoch": 0.7666271918096038, "grad_norm": 0.3897079825401306, "learning_rate": 2.7667316109381734e-05, "loss": 0.7649, "step": 5017 }, { "epoch": 0.7667799977079115, "grad_norm": 0.36329084634780884, "learning_rate": 2.763285915370507e-05, "loss": 0.7393, "step": 5018 }, { "epoch": 0.7669328036062192, "grad_norm": 0.3158819377422333, "learning_rate": 2.759842022769408e-05, "loss": 0.7657, "step": 5019 }, { "epoch": 0.7670856095045269, "grad_norm": 0.38043487071990967, "learning_rate": 2.7563999339928938e-05, "loss": 0.8129, "step": 5020 }, { "epoch": 0.7672384154028346, "grad_norm": 0.2957688271999359, "learning_rate": 2.7529596498985334e-05, "loss": 0.6722, "step": 5021 }, { "epoch": 0.7673912213011422, "grad_norm": 0.2666127681732178, "learning_rate": 2.7495211713434443e-05, "loss": 0.5102, "step": 5022 }, { "epoch": 0.7675440271994499, "grad_norm": 0.2595529854297638, "learning_rate": 2.7460844991842893e-05, "loss": 0.7852, "step": 5023 }, { "epoch": 0.7676968330977576, "grad_norm": 0.33117806911468506, "learning_rate": 2.7426496342772934e-05, "loss": 0.6446, "step": 5024 }, { "epoch": 0.7678496389960653, "grad_norm": 0.8229051232337952, "learning_rate": 2.7392165774782175e-05, "loss": 0.9172, "step": 5025 }, { "epoch": 0.768002444894373, "grad_norm": 0.36411499977111816, "learning_rate": 2.7357853296423865e-05, "loss": 0.8698, "step": 5026 }, { "epoch": 0.7681552507926805, "grad_norm": 0.260728657245636, "learning_rate": 2.7323558916246593e-05, "loss": 0.733, "step": 5027 }, { "epoch": 0.7683080566909882, "grad_norm": 0.3510059416294098, "learning_rate": 2.7289282642794588e-05, "loss": 0.704, "step": 5028 }, { "epoch": 0.7684608625892959, "grad_norm": 0.35938236117362976, "learning_rate": 2.725502448460743e-05, "loss": 0.6985, "step": 5029 }, { "epoch": 0.7686136684876036, "grad_norm": 0.31864234805107117, "learning_rate": 2.7220784450220304e-05, "loss": 0.6877, "step": 5030 }, { "epoch": 0.7687664743859113, "grad_norm": 0.2729928493499756, "learning_rate": 2.7186562548163817e-05, "loss": 0.7003, "step": 5031 }, { "epoch": 0.768919280284219, "grad_norm": 0.33942142128944397, "learning_rate": 2.7152358786964026e-05, "loss": 0.6741, "step": 5032 }, { "epoch": 0.7690720861825266, "grad_norm": 0.32317203283309937, "learning_rate": 2.7118173175142537e-05, "loss": 0.6225, "step": 5033 }, { "epoch": 0.7692248920808343, "grad_norm": 0.3431759178638458, "learning_rate": 2.7084005721216456e-05, "loss": 0.6183, "step": 5034 }, { "epoch": 0.769377697979142, "grad_norm": 0.2686121165752411, "learning_rate": 2.7049856433698263e-05, "loss": 0.7735, "step": 5035 }, { "epoch": 0.7695305038774497, "grad_norm": 0.3046192228794098, "learning_rate": 2.701572532109595e-05, "loss": 0.8076, "step": 5036 }, { "epoch": 0.7696833097757574, "grad_norm": 0.3387291431427002, "learning_rate": 2.6981612391913026e-05, "loss": 0.7316, "step": 5037 }, { "epoch": 0.7698361156740651, "grad_norm": 0.2865094244480133, "learning_rate": 2.6947517654648467e-05, "loss": 0.5962, "step": 5038 }, { "epoch": 0.7699889215723726, "grad_norm": 0.3363531827926636, "learning_rate": 2.6913441117796666e-05, "loss": 0.7593, "step": 5039 }, { "epoch": 0.7701417274706803, "grad_norm": 0.32571524381637573, "learning_rate": 2.6879382789847486e-05, "loss": 0.9278, "step": 5040 }, { "epoch": 0.770294533368988, "grad_norm": 0.3032762408256531, "learning_rate": 2.6845342679286278e-05, "loss": 0.7615, "step": 5041 }, { "epoch": 0.7704473392672957, "grad_norm": 0.3438403606414795, "learning_rate": 2.6811320794593896e-05, "loss": 0.5469, "step": 5042 }, { "epoch": 0.7706001451656034, "grad_norm": 0.28308266401290894, "learning_rate": 2.6777317144246572e-05, "loss": 0.7271, "step": 5043 }, { "epoch": 0.7707529510639111, "grad_norm": 0.24329397082328796, "learning_rate": 2.6743331736716017e-05, "loss": 0.8853, "step": 5044 }, { "epoch": 0.7709057569622187, "grad_norm": 0.2939806580543518, "learning_rate": 2.670936458046941e-05, "loss": 0.843, "step": 5045 }, { "epoch": 0.7710585628605264, "grad_norm": 0.3209003508090973, "learning_rate": 2.6675415683969428e-05, "loss": 0.8808, "step": 5046 }, { "epoch": 0.7712113687588341, "grad_norm": 0.3447398245334625, "learning_rate": 2.6641485055674132e-05, "loss": 0.6298, "step": 5047 }, { "epoch": 0.7713641746571418, "grad_norm": 0.31295421719551086, "learning_rate": 2.660757270403701e-05, "loss": 0.559, "step": 5048 }, { "epoch": 0.7715169805554495, "grad_norm": 0.2551795244216919, "learning_rate": 2.6573678637507116e-05, "loss": 0.675, "step": 5049 }, { "epoch": 0.7716697864537572, "grad_norm": 1.657878041267395, "learning_rate": 2.6539802864528784e-05, "loss": 0.6751, "step": 5050 }, { "epoch": 0.7718225923520647, "grad_norm": 0.2639477252960205, "learning_rate": 2.6505945393541932e-05, "loss": 0.6769, "step": 5051 }, { "epoch": 0.7719753982503724, "grad_norm": 0.27711376547813416, "learning_rate": 2.6472106232981897e-05, "loss": 0.7162, "step": 5052 }, { "epoch": 0.7721282041486801, "grad_norm": 0.2852341830730438, "learning_rate": 2.643828539127937e-05, "loss": 0.5743, "step": 5053 }, { "epoch": 0.7722810100469878, "grad_norm": 0.30288180708885193, "learning_rate": 2.6404482876860527e-05, "loss": 0.6888, "step": 5054 }, { "epoch": 0.7724338159452955, "grad_norm": 0.25058987736701965, "learning_rate": 2.6370698698146977e-05, "loss": 0.5594, "step": 5055 }, { "epoch": 0.7725866218436032, "grad_norm": 0.29127037525177, "learning_rate": 2.633693286355583e-05, "loss": 0.7225, "step": 5056 }, { "epoch": 0.7727394277419108, "grad_norm": 0.35187846422195435, "learning_rate": 2.6303185381499507e-05, "loss": 0.5605, "step": 5057 }, { "epoch": 0.7728922336402185, "grad_norm": 0.36523064970970154, "learning_rate": 2.6269456260385893e-05, "loss": 0.7734, "step": 5058 }, { "epoch": 0.7730450395385262, "grad_norm": 0.34329739212989807, "learning_rate": 2.6235745508618338e-05, "loss": 0.689, "step": 5059 }, { "epoch": 0.7731978454368339, "grad_norm": 0.285354346036911, "learning_rate": 2.6202053134595618e-05, "loss": 0.6413, "step": 5060 }, { "epoch": 0.7733506513351416, "grad_norm": 0.2637817859649658, "learning_rate": 2.6168379146711884e-05, "loss": 0.8285, "step": 5061 }, { "epoch": 0.7735034572334493, "grad_norm": 0.31705915927886963, "learning_rate": 2.61347235533567e-05, "loss": 0.6822, "step": 5062 }, { "epoch": 0.7736562631317568, "grad_norm": 0.3138381540775299, "learning_rate": 2.6101086362915127e-05, "loss": 0.8156, "step": 5063 }, { "epoch": 0.7738090690300645, "grad_norm": 0.3055115342140198, "learning_rate": 2.6067467583767535e-05, "loss": 0.7352, "step": 5064 }, { "epoch": 0.7739618749283722, "grad_norm": 0.298575222492218, "learning_rate": 2.603386722428981e-05, "loss": 0.7935, "step": 5065 }, { "epoch": 0.7741146808266799, "grad_norm": 0.3727077841758728, "learning_rate": 2.6000285292853156e-05, "loss": 0.6423, "step": 5066 }, { "epoch": 0.7742674867249876, "grad_norm": 0.27079665660858154, "learning_rate": 2.5966721797824267e-05, "loss": 0.7292, "step": 5067 }, { "epoch": 0.7744202926232953, "grad_norm": 0.29036736488342285, "learning_rate": 2.593317674756517e-05, "loss": 0.7094, "step": 5068 }, { "epoch": 0.7745730985216029, "grad_norm": 0.2901022732257843, "learning_rate": 2.5899650150433375e-05, "loss": 0.6526, "step": 5069 }, { "epoch": 0.7747259044199106, "grad_norm": 0.3423730432987213, "learning_rate": 2.5866142014781726e-05, "loss": 0.6695, "step": 5070 }, { "epoch": 0.7748787103182183, "grad_norm": 0.3615645170211792, "learning_rate": 2.5832652348958475e-05, "loss": 0.7929, "step": 5071 }, { "epoch": 0.775031516216526, "grad_norm": 0.4355672299861908, "learning_rate": 2.5799181161307308e-05, "loss": 0.6221, "step": 5072 }, { "epoch": 0.7751843221148337, "grad_norm": 0.3562428653240204, "learning_rate": 2.5765728460167314e-05, "loss": 0.7955, "step": 5073 }, { "epoch": 0.7753371280131413, "grad_norm": 0.3044549524784088, "learning_rate": 2.5732294253872947e-05, "loss": 0.6761, "step": 5074 }, { "epoch": 0.7754899339114489, "grad_norm": 0.32677242159843445, "learning_rate": 2.5698878550754014e-05, "loss": 0.7494, "step": 5075 }, { "epoch": 0.7756427398097566, "grad_norm": 0.2794375419616699, "learning_rate": 2.566548135913579e-05, "loss": 0.7209, "step": 5076 }, { "epoch": 0.7757955457080643, "grad_norm": 0.31766951084136963, "learning_rate": 2.5632102687338932e-05, "loss": 0.812, "step": 5077 }, { "epoch": 0.775948351606372, "grad_norm": 0.34559720754623413, "learning_rate": 2.559874254367942e-05, "loss": 0.674, "step": 5078 }, { "epoch": 0.7761011575046797, "grad_norm": 0.29366979002952576, "learning_rate": 2.5565400936468643e-05, "loss": 0.6706, "step": 5079 }, { "epoch": 0.7762539634029874, "grad_norm": 0.2583487331867218, "learning_rate": 2.5532077874013392e-05, "loss": 0.7567, "step": 5080 }, { "epoch": 0.776406769301295, "grad_norm": 0.3665739893913269, "learning_rate": 2.549877336461587e-05, "loss": 0.6882, "step": 5081 }, { "epoch": 0.7765595751996027, "grad_norm": 0.2896324694156647, "learning_rate": 2.546548741657355e-05, "loss": 0.6605, "step": 5082 }, { "epoch": 0.7767123810979104, "grad_norm": 0.27397826313972473, "learning_rate": 2.5432220038179412e-05, "loss": 0.7353, "step": 5083 }, { "epoch": 0.7768651869962181, "grad_norm": 0.2787233293056488, "learning_rate": 2.539897123772168e-05, "loss": 0.7683, "step": 5084 }, { "epoch": 0.7770179928945258, "grad_norm": 0.35348227620124817, "learning_rate": 2.536574102348407e-05, "loss": 0.616, "step": 5085 }, { "epoch": 0.7771707987928334, "grad_norm": 0.384181946516037, "learning_rate": 2.5332529403745564e-05, "loss": 0.7344, "step": 5086 }, { "epoch": 0.777323604691141, "grad_norm": 0.3795936107635498, "learning_rate": 2.5299336386780603e-05, "loss": 0.5957, "step": 5087 }, { "epoch": 0.7774764105894487, "grad_norm": 0.33974671363830566, "learning_rate": 2.5266161980858937e-05, "loss": 0.6189, "step": 5088 }, { "epoch": 0.7776292164877564, "grad_norm": 1.0465130805969238, "learning_rate": 2.5233006194245634e-05, "loss": 0.8266, "step": 5089 }, { "epoch": 0.7777820223860641, "grad_norm": 0.25817668437957764, "learning_rate": 2.519986903520124e-05, "loss": 0.8808, "step": 5090 }, { "epoch": 0.7779348282843718, "grad_norm": 0.3180966079235077, "learning_rate": 2.516675051198161e-05, "loss": 0.7442, "step": 5091 }, { "epoch": 0.7780876341826795, "grad_norm": 0.2853553295135498, "learning_rate": 2.513365063283791e-05, "loss": 0.7022, "step": 5092 }, { "epoch": 0.7782404400809871, "grad_norm": 0.28600025177001953, "learning_rate": 2.5100569406016695e-05, "loss": 0.631, "step": 5093 }, { "epoch": 0.7783932459792948, "grad_norm": 0.3371172547340393, "learning_rate": 2.506750683975988e-05, "loss": 0.5755, "step": 5094 }, { "epoch": 0.7785460518776025, "grad_norm": 0.33092522621154785, "learning_rate": 2.5034462942304772e-05, "loss": 0.6777, "step": 5095 }, { "epoch": 0.7786988577759102, "grad_norm": 0.49018746614456177, "learning_rate": 2.5001437721883936e-05, "loss": 0.7151, "step": 5096 }, { "epoch": 0.7788516636742179, "grad_norm": 0.30702343583106995, "learning_rate": 2.4968431186725304e-05, "loss": 0.7647, "step": 5097 }, { "epoch": 0.7790044695725254, "grad_norm": 0.2928082346916199, "learning_rate": 2.4935443345052213e-05, "loss": 0.6836, "step": 5098 }, { "epoch": 0.7791572754708331, "grad_norm": 0.3457891047000885, "learning_rate": 2.4902474205083336e-05, "loss": 0.7501, "step": 5099 }, { "epoch": 0.7793100813691408, "grad_norm": 0.28967201709747314, "learning_rate": 2.486952377503261e-05, "loss": 0.7462, "step": 5100 }, { "epoch": 0.7794628872674485, "grad_norm": 0.2979332506656647, "learning_rate": 2.4836592063109355e-05, "loss": 0.7389, "step": 5101 }, { "epoch": 0.7796156931657562, "grad_norm": 0.2730976343154907, "learning_rate": 2.480367907751827e-05, "loss": 0.7138, "step": 5102 }, { "epoch": 0.7797684990640639, "grad_norm": 0.2908374071121216, "learning_rate": 2.4770784826459303e-05, "loss": 0.5659, "step": 5103 }, { "epoch": 0.7799213049623716, "grad_norm": 0.2820816934108734, "learning_rate": 2.473790931812783e-05, "loss": 0.6933, "step": 5104 }, { "epoch": 0.7800741108606792, "grad_norm": 0.2908737063407898, "learning_rate": 2.470505256071446e-05, "loss": 0.7285, "step": 5105 }, { "epoch": 0.7802269167589869, "grad_norm": 0.2963566482067108, "learning_rate": 2.4672214562405217e-05, "loss": 0.7999, "step": 5106 }, { "epoch": 0.7803797226572946, "grad_norm": 0.2846260368824005, "learning_rate": 2.4639395331381376e-05, "loss": 0.6746, "step": 5107 }, { "epoch": 0.7805325285556023, "grad_norm": 0.26171863079071045, "learning_rate": 2.4606594875819622e-05, "loss": 0.4587, "step": 5108 }, { "epoch": 0.78068533445391, "grad_norm": 0.26902881264686584, "learning_rate": 2.4573813203891883e-05, "loss": 0.879, "step": 5109 }, { "epoch": 0.7808381403522175, "grad_norm": 0.26719123125076294, "learning_rate": 2.4541050323765403e-05, "loss": 0.6603, "step": 5110 }, { "epoch": 0.7809909462505252, "grad_norm": 0.264125257730484, "learning_rate": 2.450830624360282e-05, "loss": 0.619, "step": 5111 }, { "epoch": 0.7811437521488329, "grad_norm": 0.265664666891098, "learning_rate": 2.447558097156204e-05, "loss": 0.527, "step": 5112 }, { "epoch": 0.7812965580471406, "grad_norm": 0.3076263964176178, "learning_rate": 2.4442874515796344e-05, "loss": 0.6551, "step": 5113 }, { "epoch": 0.7814493639454483, "grad_norm": 0.32857951521873474, "learning_rate": 2.4410186884454165e-05, "loss": 0.7661, "step": 5114 }, { "epoch": 0.781602169843756, "grad_norm": 0.3077498972415924, "learning_rate": 2.4377518085679396e-05, "loss": 0.7124, "step": 5115 }, { "epoch": 0.7817549757420637, "grad_norm": 0.3741486668586731, "learning_rate": 2.4344868127611243e-05, "loss": 0.7011, "step": 5116 }, { "epoch": 0.7819077816403713, "grad_norm": 0.2527276277542114, "learning_rate": 2.43122370183841e-05, "loss": 0.7102, "step": 5117 }, { "epoch": 0.782060587538679, "grad_norm": 0.44322469830513, "learning_rate": 2.4279624766127785e-05, "loss": 0.7646, "step": 5118 }, { "epoch": 0.7822133934369867, "grad_norm": 0.2980830669403076, "learning_rate": 2.424703137896731e-05, "loss": 0.6688, "step": 5119 }, { "epoch": 0.7823661993352944, "grad_norm": 0.25133106112480164, "learning_rate": 2.4214456865023117e-05, "loss": 0.794, "step": 5120 }, { "epoch": 0.7825190052336021, "grad_norm": 0.2756834328174591, "learning_rate": 2.4181901232410796e-05, "loss": 0.5819, "step": 5121 }, { "epoch": 0.7826718111319096, "grad_norm": 0.2971981465816498, "learning_rate": 2.414936448924139e-05, "loss": 0.6456, "step": 5122 }, { "epoch": 0.7828246170302173, "grad_norm": 0.28821277618408203, "learning_rate": 2.411684664362107e-05, "loss": 0.5152, "step": 5123 }, { "epoch": 0.782977422928525, "grad_norm": 0.3240285813808441, "learning_rate": 2.4084347703651466e-05, "loss": 0.7247, "step": 5124 }, { "epoch": 0.7831302288268327, "grad_norm": 0.2902561128139496, "learning_rate": 2.405186767742934e-05, "loss": 0.5942, "step": 5125 }, { "epoch": 0.7832830347251404, "grad_norm": 0.3167160451412201, "learning_rate": 2.401940657304689e-05, "loss": 0.6878, "step": 5126 }, { "epoch": 0.7834358406234481, "grad_norm": 0.2904062867164612, "learning_rate": 2.3986964398591483e-05, "loss": 0.6876, "step": 5127 }, { "epoch": 0.7835886465217557, "grad_norm": 0.2593238353729248, "learning_rate": 2.3954541162145804e-05, "loss": 0.6787, "step": 5128 }, { "epoch": 0.7837414524200634, "grad_norm": 0.265027791261673, "learning_rate": 2.392213687178785e-05, "loss": 0.6336, "step": 5129 }, { "epoch": 0.7838942583183711, "grad_norm": 0.3181680142879486, "learning_rate": 2.388975153559091e-05, "loss": 0.6047, "step": 5130 }, { "epoch": 0.7840470642166788, "grad_norm": 0.3099213242530823, "learning_rate": 2.385738516162348e-05, "loss": 0.7953, "step": 5131 }, { "epoch": 0.7841998701149865, "grad_norm": 0.3505837917327881, "learning_rate": 2.3825037757949355e-05, "loss": 0.8281, "step": 5132 }, { "epoch": 0.7843526760132941, "grad_norm": 0.27586525678634644, "learning_rate": 2.3792709332627637e-05, "loss": 0.6736, "step": 5133 }, { "epoch": 0.7845054819116017, "grad_norm": 0.2906564772129059, "learning_rate": 2.3760399893712714e-05, "loss": 0.7257, "step": 5134 }, { "epoch": 0.7846582878099094, "grad_norm": 0.29521262645721436, "learning_rate": 2.372810944925419e-05, "loss": 0.7035, "step": 5135 }, { "epoch": 0.7848110937082171, "grad_norm": 0.324818879365921, "learning_rate": 2.3695838007296913e-05, "loss": 0.598, "step": 5136 }, { "epoch": 0.7849638996065248, "grad_norm": 0.289086252450943, "learning_rate": 2.3663585575881086e-05, "loss": 0.7587, "step": 5137 }, { "epoch": 0.7851167055048325, "grad_norm": 0.2762129008769989, "learning_rate": 2.3631352163042154e-05, "loss": 0.557, "step": 5138 }, { "epoch": 0.7852695114031402, "grad_norm": 0.27317383885383606, "learning_rate": 2.3599137776810775e-05, "loss": 0.7014, "step": 5139 }, { "epoch": 0.7854223173014478, "grad_norm": 0.3378963768482208, "learning_rate": 2.356694242521287e-05, "loss": 0.6726, "step": 5140 }, { "epoch": 0.7855751231997555, "grad_norm": 0.29100221395492554, "learning_rate": 2.353476611626968e-05, "loss": 0.5299, "step": 5141 }, { "epoch": 0.7857279290980632, "grad_norm": 0.296665757894516, "learning_rate": 2.3502608857997622e-05, "loss": 0.5991, "step": 5142 }, { "epoch": 0.7858807349963709, "grad_norm": 0.29530707001686096, "learning_rate": 2.3470470658408427e-05, "loss": 0.6371, "step": 5143 }, { "epoch": 0.7860335408946786, "grad_norm": 0.26709187030792236, "learning_rate": 2.3438351525509085e-05, "loss": 0.6762, "step": 5144 }, { "epoch": 0.7861863467929862, "grad_norm": 0.31802254915237427, "learning_rate": 2.3406251467301788e-05, "loss": 0.7362, "step": 5145 }, { "epoch": 0.7863391526912938, "grad_norm": 0.3588809072971344, "learning_rate": 2.3374170491783953e-05, "loss": 0.4949, "step": 5146 }, { "epoch": 0.7864919585896015, "grad_norm": 0.2746977210044861, "learning_rate": 2.3342108606948343e-05, "loss": 0.6827, "step": 5147 }, { "epoch": 0.7866447644879092, "grad_norm": 0.2767939269542694, "learning_rate": 2.3310065820782935e-05, "loss": 0.6995, "step": 5148 }, { "epoch": 0.7867975703862169, "grad_norm": 0.2917032241821289, "learning_rate": 2.3278042141270806e-05, "loss": 0.7076, "step": 5149 }, { "epoch": 0.7869503762845246, "grad_norm": 0.5610830783843994, "learning_rate": 2.3246037576390466e-05, "loss": 0.6843, "step": 5150 }, { "epoch": 0.7871031821828323, "grad_norm": 0.2540992200374603, "learning_rate": 2.3214052134115572e-05, "loss": 0.6724, "step": 5151 }, { "epoch": 0.78725598808114, "grad_norm": 0.46241921186447144, "learning_rate": 2.3182085822415055e-05, "loss": 0.5017, "step": 5152 }, { "epoch": 0.7874087939794476, "grad_norm": 0.4499031901359558, "learning_rate": 2.315013864925304e-05, "loss": 0.5057, "step": 5153 }, { "epoch": 0.7875615998777553, "grad_norm": 0.27625760436058044, "learning_rate": 2.3118210622588843e-05, "loss": 0.6678, "step": 5154 }, { "epoch": 0.787714405776063, "grad_norm": 0.3061494827270508, "learning_rate": 2.3086301750377136e-05, "loss": 0.7484, "step": 5155 }, { "epoch": 0.7878672116743707, "grad_norm": 0.25919973850250244, "learning_rate": 2.3054412040567684e-05, "loss": 0.5866, "step": 5156 }, { "epoch": 0.7880200175726783, "grad_norm": 0.3416811525821686, "learning_rate": 2.30225415011056e-05, "loss": 0.5841, "step": 5157 }, { "epoch": 0.7881728234709859, "grad_norm": 0.26105600595474243, "learning_rate": 2.2990690139931116e-05, "loss": 0.7581, "step": 5158 }, { "epoch": 0.7883256293692936, "grad_norm": 0.2879030704498291, "learning_rate": 2.295885796497976e-05, "loss": 0.9003, "step": 5159 }, { "epoch": 0.7884784352676013, "grad_norm": 0.22672039270401, "learning_rate": 2.292704498418222e-05, "loss": 0.5145, "step": 5160 }, { "epoch": 0.788631241165909, "grad_norm": 0.27298426628112793, "learning_rate": 2.2895251205464484e-05, "loss": 0.8854, "step": 5161 }, { "epoch": 0.7887840470642167, "grad_norm": 0.2944872975349426, "learning_rate": 2.286347663674765e-05, "loss": 0.6355, "step": 5162 }, { "epoch": 0.7889368529625244, "grad_norm": 0.28508460521698, "learning_rate": 2.2831721285948126e-05, "loss": 0.6496, "step": 5163 }, { "epoch": 0.789089658860832, "grad_norm": 0.5533873438835144, "learning_rate": 2.2799985160977454e-05, "loss": 0.7379, "step": 5164 }, { "epoch": 0.7892424647591397, "grad_norm": 0.28607064485549927, "learning_rate": 2.2768268269742466e-05, "loss": 0.7115, "step": 5165 }, { "epoch": 0.7893952706574474, "grad_norm": 0.2833400368690491, "learning_rate": 2.2736570620145136e-05, "loss": 0.7425, "step": 5166 }, { "epoch": 0.7895480765557551, "grad_norm": 0.3185187578201294, "learning_rate": 2.270489222008265e-05, "loss": 0.721, "step": 5167 }, { "epoch": 0.7897008824540628, "grad_norm": 0.2721453607082367, "learning_rate": 2.267323307744742e-05, "loss": 0.6955, "step": 5168 }, { "epoch": 0.7898536883523704, "grad_norm": 0.3211742639541626, "learning_rate": 2.264159320012711e-05, "loss": 0.621, "step": 5169 }, { "epoch": 0.790006494250678, "grad_norm": 0.3255116641521454, "learning_rate": 2.260997259600448e-05, "loss": 0.7441, "step": 5170 }, { "epoch": 0.7901593001489857, "grad_norm": 0.3053962290287018, "learning_rate": 2.257837127295752e-05, "loss": 0.8264, "step": 5171 }, { "epoch": 0.7903121060472934, "grad_norm": 0.4836776554584503, "learning_rate": 2.2546789238859468e-05, "loss": 0.6754, "step": 5172 }, { "epoch": 0.7904649119456011, "grad_norm": 0.2527182698249817, "learning_rate": 2.2515226501578734e-05, "loss": 0.731, "step": 5173 }, { "epoch": 0.7906177178439088, "grad_norm": 0.32779669761657715, "learning_rate": 2.24836830689789e-05, "loss": 0.6997, "step": 5174 }, { "epoch": 0.7907705237422165, "grad_norm": 0.2647869884967804, "learning_rate": 2.2452158948918712e-05, "loss": 0.5513, "step": 5175 }, { "epoch": 0.7909233296405241, "grad_norm": 0.3325052857398987, "learning_rate": 2.2420654149252153e-05, "loss": 0.5717, "step": 5176 }, { "epoch": 0.7910761355388318, "grad_norm": 0.27766644954681396, "learning_rate": 2.238916867782843e-05, "loss": 0.7734, "step": 5177 }, { "epoch": 0.7912289414371395, "grad_norm": 0.295060396194458, "learning_rate": 2.235770254249182e-05, "loss": 0.6441, "step": 5178 }, { "epoch": 0.7913817473354472, "grad_norm": 0.5573975443840027, "learning_rate": 2.2326255751081892e-05, "loss": 0.6681, "step": 5179 }, { "epoch": 0.7915345532337549, "grad_norm": 0.3403951823711395, "learning_rate": 2.2294828311433346e-05, "loss": 0.6454, "step": 5180 }, { "epoch": 0.7916873591320625, "grad_norm": 0.25036436319351196, "learning_rate": 2.226342023137601e-05, "loss": 0.6293, "step": 5181 }, { "epoch": 0.7918401650303701, "grad_norm": 0.3675941526889801, "learning_rate": 2.2232031518734986e-05, "loss": 0.8306, "step": 5182 }, { "epoch": 0.7919929709286778, "grad_norm": 0.30091819167137146, "learning_rate": 2.2200662181330535e-05, "loss": 0.6478, "step": 5183 }, { "epoch": 0.7921457768269855, "grad_norm": 0.28308168053627014, "learning_rate": 2.2169312226978044e-05, "loss": 0.6683, "step": 5184 }, { "epoch": 0.7922985827252932, "grad_norm": 0.30706119537353516, "learning_rate": 2.2137981663488038e-05, "loss": 0.6971, "step": 5185 }, { "epoch": 0.7924513886236009, "grad_norm": 0.26795217394828796, "learning_rate": 2.2106670498666315e-05, "loss": 0.7442, "step": 5186 }, { "epoch": 0.7926041945219086, "grad_norm": 0.2965717017650604, "learning_rate": 2.207537874031381e-05, "loss": 0.521, "step": 5187 }, { "epoch": 0.7927570004202162, "grad_norm": 0.29789793491363525, "learning_rate": 2.204410639622657e-05, "loss": 0.7498, "step": 5188 }, { "epoch": 0.7929098063185239, "grad_norm": 0.2972559928894043, "learning_rate": 2.2012853474195826e-05, "loss": 0.6631, "step": 5189 }, { "epoch": 0.7930626122168316, "grad_norm": 0.2753361761569977, "learning_rate": 2.1981619982007985e-05, "loss": 0.6776, "step": 5190 }, { "epoch": 0.7932154181151393, "grad_norm": 1.3614482879638672, "learning_rate": 2.195040592744465e-05, "loss": 0.5634, "step": 5191 }, { "epoch": 0.7933682240134469, "grad_norm": 0.28750181198120117, "learning_rate": 2.1919211318282505e-05, "loss": 0.5691, "step": 5192 }, { "epoch": 0.7935210299117545, "grad_norm": 0.28820011019706726, "learning_rate": 2.1888036162293413e-05, "loss": 0.6766, "step": 5193 }, { "epoch": 0.7936738358100622, "grad_norm": 0.28162091970443726, "learning_rate": 2.185688046724441e-05, "loss": 0.7707, "step": 5194 }, { "epoch": 0.7938266417083699, "grad_norm": 0.3941137492656708, "learning_rate": 2.182574424089773e-05, "loss": 0.7717, "step": 5195 }, { "epoch": 0.7939794476066776, "grad_norm": 0.34859415888786316, "learning_rate": 2.1794627491010644e-05, "loss": 0.8792, "step": 5196 }, { "epoch": 0.7941322535049853, "grad_norm": 0.3332647383213043, "learning_rate": 2.1763530225335614e-05, "loss": 0.487, "step": 5197 }, { "epoch": 0.794285059403293, "grad_norm": 0.2961379885673523, "learning_rate": 2.1732452451620333e-05, "loss": 0.6603, "step": 5198 }, { "epoch": 0.7944378653016007, "grad_norm": 0.27676260471343994, "learning_rate": 2.1701394177607494e-05, "loss": 0.5361, "step": 5199 }, { "epoch": 0.7945906711999083, "grad_norm": 0.25243091583251953, "learning_rate": 2.167035541103506e-05, "loss": 0.7178, "step": 5200 }, { "epoch": 0.794743477098216, "grad_norm": 0.25789448618888855, "learning_rate": 2.1639336159636027e-05, "loss": 0.67, "step": 5201 }, { "epoch": 0.7948962829965237, "grad_norm": 0.2747963070869446, "learning_rate": 2.1608336431138655e-05, "loss": 0.641, "step": 5202 }, { "epoch": 0.7950490888948314, "grad_norm": 0.3560905158519745, "learning_rate": 2.1577356233266176e-05, "loss": 0.6971, "step": 5203 }, { "epoch": 0.795201894793139, "grad_norm": 0.30919522047042847, "learning_rate": 2.154639557373711e-05, "loss": 0.8019, "step": 5204 }, { "epoch": 0.7953547006914466, "grad_norm": 0.31300801038742065, "learning_rate": 2.151545446026507e-05, "loss": 0.5774, "step": 5205 }, { "epoch": 0.7955075065897543, "grad_norm": 0.2874794602394104, "learning_rate": 2.1484532900558685e-05, "loss": 0.8056, "step": 5206 }, { "epoch": 0.795660312488062, "grad_norm": 0.26195093989372253, "learning_rate": 2.1453630902321843e-05, "loss": 0.7185, "step": 5207 }, { "epoch": 0.7958131183863697, "grad_norm": 0.47928446531295776, "learning_rate": 2.142274847325353e-05, "loss": 0.8464, "step": 5208 }, { "epoch": 0.7959659242846774, "grad_norm": 0.32929208874702454, "learning_rate": 2.139188562104789e-05, "loss": 0.7058, "step": 5209 }, { "epoch": 0.7961187301829851, "grad_norm": 0.2897893488407135, "learning_rate": 2.1361042353394044e-05, "loss": 0.6669, "step": 5210 }, { "epoch": 0.7962715360812928, "grad_norm": 0.43749895691871643, "learning_rate": 2.1330218677976376e-05, "loss": 0.6892, "step": 5211 }, { "epoch": 0.7964243419796004, "grad_norm": 0.3396851420402527, "learning_rate": 2.1299414602474376e-05, "loss": 0.771, "step": 5212 }, { "epoch": 0.7965771478779081, "grad_norm": 0.2620655596256256, "learning_rate": 2.126863013456257e-05, "loss": 0.6952, "step": 5213 }, { "epoch": 0.7967299537762158, "grad_norm": 0.34103044867515564, "learning_rate": 2.1237865281910708e-05, "loss": 0.6764, "step": 5214 }, { "epoch": 0.7968827596745235, "grad_norm": 0.26926666498184204, "learning_rate": 2.120712005218354e-05, "loss": 0.8362, "step": 5215 }, { "epoch": 0.7970355655728311, "grad_norm": 0.29229509830474854, "learning_rate": 2.1176394453041016e-05, "loss": 0.6737, "step": 5216 }, { "epoch": 0.7971883714711387, "grad_norm": 0.25633910298347473, "learning_rate": 2.1145688492138127e-05, "loss": 0.6043, "step": 5217 }, { "epoch": 0.7973411773694464, "grad_norm": 0.2882971167564392, "learning_rate": 2.1115002177125064e-05, "loss": 0.6424, "step": 5218 }, { "epoch": 0.7974939832677541, "grad_norm": 0.49982550740242004, "learning_rate": 2.1084335515647024e-05, "loss": 0.7675, "step": 5219 }, { "epoch": 0.7976467891660618, "grad_norm": 0.25070077180862427, "learning_rate": 2.1053688515344327e-05, "loss": 0.6555, "step": 5220 }, { "epoch": 0.7977995950643695, "grad_norm": 0.3176601529121399, "learning_rate": 2.1023061183852433e-05, "loss": 0.6537, "step": 5221 }, { "epoch": 0.7979524009626772, "grad_norm": 0.2853238880634308, "learning_rate": 2.0992453528801924e-05, "loss": 0.7822, "step": 5222 }, { "epoch": 0.7981052068609849, "grad_norm": 0.2929050624370575, "learning_rate": 2.0961865557818417e-05, "loss": 0.8065, "step": 5223 }, { "epoch": 0.7982580127592925, "grad_norm": 0.2644808292388916, "learning_rate": 2.093129727852261e-05, "loss": 0.6393, "step": 5224 }, { "epoch": 0.7984108186576002, "grad_norm": 0.34935396909713745, "learning_rate": 2.0900748698530358e-05, "loss": 0.5423, "step": 5225 }, { "epoch": 0.7985636245559079, "grad_norm": 0.3045203983783722, "learning_rate": 2.087021982545263e-05, "loss": 0.8789, "step": 5226 }, { "epoch": 0.7987164304542156, "grad_norm": 0.3892831802368164, "learning_rate": 2.0839710666895386e-05, "loss": 0.7262, "step": 5227 }, { "epoch": 0.7988692363525232, "grad_norm": 0.39896148443222046, "learning_rate": 2.080922123045972e-05, "loss": 0.8121, "step": 5228 }, { "epoch": 0.7990220422508308, "grad_norm": 0.3847440481185913, "learning_rate": 2.0778751523741824e-05, "loss": 0.5827, "step": 5229 }, { "epoch": 0.7991748481491385, "grad_norm": 0.28219443559646606, "learning_rate": 2.0748301554333027e-05, "loss": 0.668, "step": 5230 }, { "epoch": 0.7993276540474462, "grad_norm": 0.2536992132663727, "learning_rate": 2.0717871329819628e-05, "loss": 0.5957, "step": 5231 }, { "epoch": 0.7994804599457539, "grad_norm": 0.3085779845714569, "learning_rate": 2.0687460857783048e-05, "loss": 0.7489, "step": 5232 }, { "epoch": 0.7996332658440616, "grad_norm": 0.332774817943573, "learning_rate": 2.065707014579983e-05, "loss": 0.9133, "step": 5233 }, { "epoch": 0.7997860717423693, "grad_norm": 0.2687940001487732, "learning_rate": 2.062669920144159e-05, "loss": 0.6992, "step": 5234 }, { "epoch": 0.799938877640677, "grad_norm": 0.3154837489128113, "learning_rate": 2.059634803227496e-05, "loss": 0.6829, "step": 5235 }, { "epoch": 0.8000916835389846, "grad_norm": 0.31086432933807373, "learning_rate": 2.0566016645861663e-05, "loss": 0.6192, "step": 5236 }, { "epoch": 0.8002444894372923, "grad_norm": 1.0907458066940308, "learning_rate": 2.053570504975856e-05, "loss": 0.638, "step": 5237 }, { "epoch": 0.8003972953356, "grad_norm": 0.5163177847862244, "learning_rate": 2.050541325151746e-05, "loss": 0.9716, "step": 5238 }, { "epoch": 0.8005501012339076, "grad_norm": 0.28363659977912903, "learning_rate": 2.0475141258685358e-05, "loss": 0.6881, "step": 5239 }, { "epoch": 0.8007029071322153, "grad_norm": 0.28859302401542664, "learning_rate": 2.0444889078804298e-05, "loss": 0.7552, "step": 5240 }, { "epoch": 0.8008557130305229, "grad_norm": 0.2968814969062805, "learning_rate": 2.0414656719411305e-05, "loss": 0.5776, "step": 5241 }, { "epoch": 0.8010085189288306, "grad_norm": 0.6331593990325928, "learning_rate": 2.038444418803851e-05, "loss": 0.7135, "step": 5242 }, { "epoch": 0.8011613248271383, "grad_norm": 0.3301532566547394, "learning_rate": 2.0354251492213138e-05, "loss": 0.7518, "step": 5243 }, { "epoch": 0.801314130725446, "grad_norm": 0.48997145891189575, "learning_rate": 2.0324078639457455e-05, "loss": 0.6749, "step": 5244 }, { "epoch": 0.8014669366237537, "grad_norm": 0.2547190189361572, "learning_rate": 2.029392563728877e-05, "loss": 0.8264, "step": 5245 }, { "epoch": 0.8016197425220614, "grad_norm": 0.23393574357032776, "learning_rate": 2.0263792493219413e-05, "loss": 0.7307, "step": 5246 }, { "epoch": 0.801772548420369, "grad_norm": 0.4022747874259949, "learning_rate": 2.023367921475683e-05, "loss": 0.7258, "step": 5247 }, { "epoch": 0.8019253543186767, "grad_norm": 0.3829197883605957, "learning_rate": 2.0203585809403525e-05, "loss": 0.7445, "step": 5248 }, { "epoch": 0.8020781602169844, "grad_norm": 0.3364792466163635, "learning_rate": 2.017351228465697e-05, "loss": 0.577, "step": 5249 }, { "epoch": 0.8022309661152921, "grad_norm": 0.28101688623428345, "learning_rate": 2.014345864800974e-05, "loss": 0.62, "step": 5250 }, { "epoch": 0.8023837720135997, "grad_norm": 0.3479030430316925, "learning_rate": 2.0113424906949465e-05, "loss": 0.48, "step": 5251 }, { "epoch": 0.8025365779119074, "grad_norm": 0.26711511611938477, "learning_rate": 2.0083411068958756e-05, "loss": 0.5663, "step": 5252 }, { "epoch": 0.802689383810215, "grad_norm": 0.3207715153694153, "learning_rate": 2.0053417141515373e-05, "loss": 0.6989, "step": 5253 }, { "epoch": 0.8028421897085227, "grad_norm": 0.261491060256958, "learning_rate": 2.0023443132092003e-05, "loss": 0.5608, "step": 5254 }, { "epoch": 0.8029949956068304, "grad_norm": 0.3126250207424164, "learning_rate": 1.9993489048156443e-05, "loss": 0.4752, "step": 5255 }, { "epoch": 0.8031478015051381, "grad_norm": 0.2783001661300659, "learning_rate": 1.9963554897171478e-05, "loss": 0.6302, "step": 5256 }, { "epoch": 0.8033006074034458, "grad_norm": 0.2911037504673004, "learning_rate": 1.9933640686594978e-05, "loss": 0.5396, "step": 5257 }, { "epoch": 0.8034534133017535, "grad_norm": 0.243194118142128, "learning_rate": 1.990374642387982e-05, "loss": 0.5593, "step": 5258 }, { "epoch": 0.8036062192000611, "grad_norm": 0.3205549418926239, "learning_rate": 1.9873872116473857e-05, "loss": 0.7169, "step": 5259 }, { "epoch": 0.8037590250983688, "grad_norm": 0.35936808586120605, "learning_rate": 1.9844017771820055e-05, "loss": 0.6554, "step": 5260 }, { "epoch": 0.8039118309966765, "grad_norm": 0.27080589532852173, "learning_rate": 1.981418339735641e-05, "loss": 0.6742, "step": 5261 }, { "epoch": 0.8040646368949842, "grad_norm": 0.3525456190109253, "learning_rate": 1.978436900051588e-05, "loss": 0.7487, "step": 5262 }, { "epoch": 0.8042174427932918, "grad_norm": 0.2958907186985016, "learning_rate": 1.9754574588726426e-05, "loss": 0.7423, "step": 5263 }, { "epoch": 0.8043702486915995, "grad_norm": 0.387239545583725, "learning_rate": 1.9724800169411107e-05, "loss": 0.7466, "step": 5264 }, { "epoch": 0.8045230545899071, "grad_norm": 0.31217509508132935, "learning_rate": 1.9695045749988017e-05, "loss": 0.6598, "step": 5265 }, { "epoch": 0.8046758604882148, "grad_norm": 0.25408506393432617, "learning_rate": 1.9665311337870173e-05, "loss": 0.7294, "step": 5266 }, { "epoch": 0.8048286663865225, "grad_norm": 0.34589096903800964, "learning_rate": 1.963559694046563e-05, "loss": 0.8124, "step": 5267 }, { "epoch": 0.8049814722848302, "grad_norm": 0.3010726571083069, "learning_rate": 1.9605902565177513e-05, "loss": 0.6591, "step": 5268 }, { "epoch": 0.8051342781831379, "grad_norm": 0.25482556223869324, "learning_rate": 1.9576228219403957e-05, "loss": 0.662, "step": 5269 }, { "epoch": 0.8052870840814456, "grad_norm": 0.3103812038898468, "learning_rate": 1.9546573910538036e-05, "loss": 0.6193, "step": 5270 }, { "epoch": 0.8054398899797532, "grad_norm": 0.3204786479473114, "learning_rate": 1.9516939645967857e-05, "loss": 0.724, "step": 5271 }, { "epoch": 0.8055926958780609, "grad_norm": 0.4453890025615692, "learning_rate": 1.9487325433076576e-05, "loss": 0.7314, "step": 5272 }, { "epoch": 0.8057455017763686, "grad_norm": 0.7212764024734497, "learning_rate": 1.945773127924234e-05, "loss": 0.8104, "step": 5273 }, { "epoch": 0.8058983076746763, "grad_norm": 0.31767502427101135, "learning_rate": 1.9428157191838238e-05, "loss": 0.5659, "step": 5274 }, { "epoch": 0.8060511135729839, "grad_norm": 0.29767361283302307, "learning_rate": 1.9398603178232455e-05, "loss": 0.7183, "step": 5275 }, { "epoch": 0.8062039194712916, "grad_norm": 0.26745566725730896, "learning_rate": 1.9369069245788106e-05, "loss": 0.6183, "step": 5276 }, { "epoch": 0.8063567253695992, "grad_norm": 0.2996903955936432, "learning_rate": 1.9339555401863297e-05, "loss": 0.5862, "step": 5277 }, { "epoch": 0.8065095312679069, "grad_norm": 0.2614487111568451, "learning_rate": 1.9310061653811173e-05, "loss": 0.7281, "step": 5278 }, { "epoch": 0.8066623371662146, "grad_norm": 0.2431805282831192, "learning_rate": 1.9280588008979884e-05, "loss": 0.5995, "step": 5279 }, { "epoch": 0.8068151430645223, "grad_norm": 0.3033202886581421, "learning_rate": 1.9251134474712506e-05, "loss": 0.6573, "step": 5280 }, { "epoch": 0.80696794896283, "grad_norm": 0.29538241028785706, "learning_rate": 1.922170105834713e-05, "loss": 0.7933, "step": 5281 }, { "epoch": 0.8071207548611377, "grad_norm": 0.2647150456905365, "learning_rate": 1.9192287767216867e-05, "loss": 0.5476, "step": 5282 }, { "epoch": 0.8072735607594453, "grad_norm": 0.2977331578731537, "learning_rate": 1.9162894608649805e-05, "loss": 0.482, "step": 5283 }, { "epoch": 0.807426366657753, "grad_norm": 0.28192129731178284, "learning_rate": 1.9133521589968985e-05, "loss": 0.6165, "step": 5284 }, { "epoch": 0.8075791725560607, "grad_norm": 0.3128628730773926, "learning_rate": 1.9104168718492423e-05, "loss": 0.7441, "step": 5285 }, { "epoch": 0.8077319784543684, "grad_norm": 0.35077306628227234, "learning_rate": 1.907483600153317e-05, "loss": 0.7481, "step": 5286 }, { "epoch": 0.807884784352676, "grad_norm": 0.3166959285736084, "learning_rate": 1.9045523446399237e-05, "loss": 0.7984, "step": 5287 }, { "epoch": 0.8080375902509837, "grad_norm": 0.27331990003585815, "learning_rate": 1.9016231060393596e-05, "loss": 0.6793, "step": 5288 }, { "epoch": 0.8081903961492913, "grad_norm": 0.2991531491279602, "learning_rate": 1.898695885081416e-05, "loss": 0.5654, "step": 5289 }, { "epoch": 0.808343202047599, "grad_norm": 0.2798959016799927, "learning_rate": 1.8957706824953915e-05, "loss": 0.6628, "step": 5290 }, { "epoch": 0.8084960079459067, "grad_norm": 0.27082306146621704, "learning_rate": 1.8928474990100687e-05, "loss": 0.7142, "step": 5291 }, { "epoch": 0.8086488138442144, "grad_norm": 0.28114885091781616, "learning_rate": 1.889926335353741e-05, "loss": 0.716, "step": 5292 }, { "epoch": 0.8088016197425221, "grad_norm": 0.3345818519592285, "learning_rate": 1.8870071922541877e-05, "loss": 0.8301, "step": 5293 }, { "epoch": 0.8089544256408298, "grad_norm": 0.3638163208961487, "learning_rate": 1.884090070438691e-05, "loss": 0.7258, "step": 5294 }, { "epoch": 0.8091072315391374, "grad_norm": 0.3682017922401428, "learning_rate": 1.881174970634024e-05, "loss": 0.6451, "step": 5295 }, { "epoch": 0.8092600374374451, "grad_norm": 0.2848983407020569, "learning_rate": 1.8782618935664653e-05, "loss": 0.8843, "step": 5296 }, { "epoch": 0.8094128433357528, "grad_norm": 0.30901724100112915, "learning_rate": 1.8753508399617793e-05, "loss": 0.638, "step": 5297 }, { "epoch": 0.8095656492340604, "grad_norm": 0.3024827837944031, "learning_rate": 1.872441810545228e-05, "loss": 0.7069, "step": 5298 }, { "epoch": 0.8097184551323681, "grad_norm": 0.28565528988838196, "learning_rate": 1.8695348060415762e-05, "loss": 0.4848, "step": 5299 }, { "epoch": 0.8098712610306757, "grad_norm": 0.45431607961654663, "learning_rate": 1.866629827175077e-05, "loss": 0.6773, "step": 5300 }, { "epoch": 0.8100240669289834, "grad_norm": 0.2928932011127472, "learning_rate": 1.8637268746694892e-05, "loss": 0.5875, "step": 5301 }, { "epoch": 0.8101768728272911, "grad_norm": 0.31244757771492004, "learning_rate": 1.8608259492480474e-05, "loss": 0.6565, "step": 5302 }, { "epoch": 0.8103296787255988, "grad_norm": 0.33375710248947144, "learning_rate": 1.857927051633498e-05, "loss": 0.6444, "step": 5303 }, { "epoch": 0.8104824846239065, "grad_norm": 0.2504745423793793, "learning_rate": 1.8550301825480763e-05, "loss": 0.7504, "step": 5304 }, { "epoch": 0.8106352905222142, "grad_norm": 0.3320654034614563, "learning_rate": 1.8521353427135168e-05, "loss": 0.7599, "step": 5305 }, { "epoch": 0.8107880964205219, "grad_norm": 0.32688429951667786, "learning_rate": 1.849242532851042e-05, "loss": 0.5366, "step": 5306 }, { "epoch": 0.8109409023188295, "grad_norm": 0.2687855064868927, "learning_rate": 1.846351753681368e-05, "loss": 0.6612, "step": 5307 }, { "epoch": 0.8110937082171372, "grad_norm": 0.30853578448295593, "learning_rate": 1.8434630059247126e-05, "loss": 0.6146, "step": 5308 }, { "epoch": 0.8112465141154449, "grad_norm": 0.2773032486438751, "learning_rate": 1.8405762903007793e-05, "loss": 0.7276, "step": 5309 }, { "epoch": 0.8113993200137525, "grad_norm": 0.3360896110534668, "learning_rate": 1.837691607528774e-05, "loss": 0.6333, "step": 5310 }, { "epoch": 0.8115521259120602, "grad_norm": 0.26231664419174194, "learning_rate": 1.834808958327385e-05, "loss": 0.7275, "step": 5311 }, { "epoch": 0.8117049318103678, "grad_norm": 0.2589069604873657, "learning_rate": 1.831928343414807e-05, "loss": 0.751, "step": 5312 }, { "epoch": 0.8118577377086755, "grad_norm": 0.3091123700141907, "learning_rate": 1.8290497635087146e-05, "loss": 0.827, "step": 5313 }, { "epoch": 0.8120105436069832, "grad_norm": 0.30030331015586853, "learning_rate": 1.8261732193262872e-05, "loss": 0.7596, "step": 5314 }, { "epoch": 0.8121633495052909, "grad_norm": 0.3518400192260742, "learning_rate": 1.8232987115841884e-05, "loss": 0.5488, "step": 5315 }, { "epoch": 0.8123161554035986, "grad_norm": 0.26434770226478577, "learning_rate": 1.8204262409985763e-05, "loss": 0.5582, "step": 5316 }, { "epoch": 0.8124689613019063, "grad_norm": 0.3209986984729767, "learning_rate": 1.817555808285105e-05, "loss": 0.6947, "step": 5317 }, { "epoch": 0.812621767200214, "grad_norm": 0.26500552892684937, "learning_rate": 1.814687414158921e-05, "loss": 0.6549, "step": 5318 }, { "epoch": 0.8127745730985216, "grad_norm": 0.4316971004009247, "learning_rate": 1.8118210593346586e-05, "loss": 0.6712, "step": 5319 }, { "epoch": 0.8129273789968293, "grad_norm": 0.3341822028160095, "learning_rate": 1.808956744526443e-05, "loss": 0.7109, "step": 5320 }, { "epoch": 0.813080184895137, "grad_norm": 0.26364752650260925, "learning_rate": 1.8060944704478965e-05, "loss": 0.5573, "step": 5321 }, { "epoch": 0.8132329907934446, "grad_norm": 0.3340471088886261, "learning_rate": 1.8032342378121347e-05, "loss": 0.4768, "step": 5322 }, { "epoch": 0.8133857966917523, "grad_norm": 0.31111812591552734, "learning_rate": 1.8003760473317555e-05, "loss": 0.9573, "step": 5323 }, { "epoch": 0.81353860259006, "grad_norm": 0.32566016912460327, "learning_rate": 1.7975198997188526e-05, "loss": 0.8372, "step": 5324 }, { "epoch": 0.8136914084883676, "grad_norm": 0.2471184879541397, "learning_rate": 1.7946657956850133e-05, "loss": 0.615, "step": 5325 }, { "epoch": 0.8138442143866753, "grad_norm": 0.2883491516113281, "learning_rate": 1.7918137359413157e-05, "loss": 0.6954, "step": 5326 }, { "epoch": 0.813997020284983, "grad_norm": 0.2613636553287506, "learning_rate": 1.7889637211983246e-05, "loss": 0.6137, "step": 5327 }, { "epoch": 0.8141498261832907, "grad_norm": 0.3412512242794037, "learning_rate": 1.786115752166094e-05, "loss": 0.5779, "step": 5328 }, { "epoch": 0.8143026320815984, "grad_norm": 0.34528636932373047, "learning_rate": 1.7832698295541773e-05, "loss": 0.7299, "step": 5329 }, { "epoch": 0.814455437979906, "grad_norm": 0.27255862951278687, "learning_rate": 1.780425954071606e-05, "loss": 0.7939, "step": 5330 }, { "epoch": 0.8146082438782137, "grad_norm": 0.29479607939720154, "learning_rate": 1.7775841264269145e-05, "loss": 0.673, "step": 5331 }, { "epoch": 0.8147610497765214, "grad_norm": 0.2675367593765259, "learning_rate": 1.7747443473281133e-05, "loss": 0.5236, "step": 5332 }, { "epoch": 0.8149138556748291, "grad_norm": 0.47365444898605347, "learning_rate": 1.771906617482717e-05, "loss": 0.5629, "step": 5333 }, { "epoch": 0.8150666615731367, "grad_norm": 0.284067839384079, "learning_rate": 1.7690709375977154e-05, "loss": 0.4462, "step": 5334 }, { "epoch": 0.8152194674714444, "grad_norm": 0.28673845529556274, "learning_rate": 1.7662373083795968e-05, "loss": 0.6392, "step": 5335 }, { "epoch": 0.815372273369752, "grad_norm": 0.562435507774353, "learning_rate": 1.763405730534342e-05, "loss": 0.5871, "step": 5336 }, { "epoch": 0.8155250792680597, "grad_norm": 0.3012368679046631, "learning_rate": 1.7605762047674046e-05, "loss": 0.6446, "step": 5337 }, { "epoch": 0.8156778851663674, "grad_norm": 0.28177013993263245, "learning_rate": 1.7577487317837414e-05, "loss": 0.723, "step": 5338 }, { "epoch": 0.8158306910646751, "grad_norm": 0.29954829812049866, "learning_rate": 1.754923312287795e-05, "loss": 0.7209, "step": 5339 }, { "epoch": 0.8159834969629828, "grad_norm": 0.3177793323993683, "learning_rate": 1.7520999469834964e-05, "loss": 0.6263, "step": 5340 }, { "epoch": 0.8161363028612905, "grad_norm": 0.2854011356830597, "learning_rate": 1.749278636574262e-05, "loss": 0.7782, "step": 5341 }, { "epoch": 0.8162891087595981, "grad_norm": 0.24384701251983643, "learning_rate": 1.7464593817629926e-05, "loss": 0.7041, "step": 5342 }, { "epoch": 0.8164419146579058, "grad_norm": 0.4053572118282318, "learning_rate": 1.7436421832520866e-05, "loss": 0.7957, "step": 5343 }, { "epoch": 0.8165947205562135, "grad_norm": 0.2650754749774933, "learning_rate": 1.740827041743428e-05, "loss": 0.6292, "step": 5344 }, { "epoch": 0.8167475264545212, "grad_norm": 0.5751661658287048, "learning_rate": 1.7380139579383814e-05, "loss": 0.7315, "step": 5345 }, { "epoch": 0.8169003323528288, "grad_norm": 0.2932993471622467, "learning_rate": 1.7352029325378015e-05, "loss": 0.9154, "step": 5346 }, { "epoch": 0.8170531382511365, "grad_norm": 0.27605947852134705, "learning_rate": 1.7323939662420373e-05, "loss": 0.7626, "step": 5347 }, { "epoch": 0.8172059441494441, "grad_norm": 0.31729474663734436, "learning_rate": 1.7295870597509146e-05, "loss": 0.8639, "step": 5348 }, { "epoch": 0.8173587500477518, "grad_norm": 0.283658891916275, "learning_rate": 1.7267822137637536e-05, "loss": 0.6038, "step": 5349 }, { "epoch": 0.8175115559460595, "grad_norm": 0.3112010657787323, "learning_rate": 1.7239794289793533e-05, "loss": 0.7148, "step": 5350 }, { "epoch": 0.8176643618443672, "grad_norm": 0.3229861259460449, "learning_rate": 1.7211787060960105e-05, "loss": 0.7873, "step": 5351 }, { "epoch": 0.8178171677426749, "grad_norm": 0.4701400399208069, "learning_rate": 1.7183800458114964e-05, "loss": 0.856, "step": 5352 }, { "epoch": 0.8179699736409826, "grad_norm": 0.3407234251499176, "learning_rate": 1.7155834488230782e-05, "loss": 0.6922, "step": 5353 }, { "epoch": 0.8181227795392902, "grad_norm": 0.3151310086250305, "learning_rate": 1.7127889158275024e-05, "loss": 0.6667, "step": 5354 }, { "epoch": 0.8182755854375979, "grad_norm": 0.35597583651542664, "learning_rate": 1.7099964475210017e-05, "loss": 0.6749, "step": 5355 }, { "epoch": 0.8184283913359056, "grad_norm": 0.30627796053886414, "learning_rate": 1.7072060445992967e-05, "loss": 0.7082, "step": 5356 }, { "epoch": 0.8185811972342132, "grad_norm": 0.30426254868507385, "learning_rate": 1.7044177077575962e-05, "loss": 0.7114, "step": 5357 }, { "epoch": 0.8187340031325209, "grad_norm": 0.27987557649612427, "learning_rate": 1.7016314376905894e-05, "loss": 0.6147, "step": 5358 }, { "epoch": 0.8188868090308286, "grad_norm": 0.31371667981147766, "learning_rate": 1.6988472350924488e-05, "loss": 0.5975, "step": 5359 }, { "epoch": 0.8190396149291362, "grad_norm": 0.39855438470840454, "learning_rate": 1.6960651006568372e-05, "loss": 0.8116, "step": 5360 }, { "epoch": 0.8191924208274439, "grad_norm": 0.4293268322944641, "learning_rate": 1.6932850350769037e-05, "loss": 0.7199, "step": 5361 }, { "epoch": 0.8193452267257516, "grad_norm": 0.3112042546272278, "learning_rate": 1.690507039045275e-05, "loss": 0.8287, "step": 5362 }, { "epoch": 0.8194980326240593, "grad_norm": 0.2758471965789795, "learning_rate": 1.687731113254063e-05, "loss": 0.6841, "step": 5363 }, { "epoch": 0.819650838522367, "grad_norm": 0.2721893787384033, "learning_rate": 1.684957258394869e-05, "loss": 0.5713, "step": 5364 }, { "epoch": 0.8198036444206747, "grad_norm": 0.3106933832168579, "learning_rate": 1.6821854751587774e-05, "loss": 0.6588, "step": 5365 }, { "epoch": 0.8199564503189823, "grad_norm": 0.2738363444805145, "learning_rate": 1.6794157642363517e-05, "loss": 0.739, "step": 5366 }, { "epoch": 0.82010925621729, "grad_norm": 0.26095277070999146, "learning_rate": 1.6766481263176448e-05, "loss": 0.8577, "step": 5367 }, { "epoch": 0.8202620621155977, "grad_norm": 0.4808953106403351, "learning_rate": 1.6738825620921894e-05, "loss": 0.6906, "step": 5368 }, { "epoch": 0.8204148680139053, "grad_norm": 0.25246375799179077, "learning_rate": 1.671119072248999e-05, "loss": 0.5648, "step": 5369 }, { "epoch": 0.820567673912213, "grad_norm": 0.27172037959098816, "learning_rate": 1.668357657476578e-05, "loss": 0.9843, "step": 5370 }, { "epoch": 0.8207204798105207, "grad_norm": 0.5794962644577026, "learning_rate": 1.6655983184629108e-05, "loss": 0.4935, "step": 5371 }, { "epoch": 0.8208732857088283, "grad_norm": 0.2584603726863861, "learning_rate": 1.662841055895461e-05, "loss": 0.6888, "step": 5372 }, { "epoch": 0.821026091607136, "grad_norm": 0.36797964572906494, "learning_rate": 1.6600858704611764e-05, "loss": 0.77, "step": 5373 }, { "epoch": 0.8211788975054437, "grad_norm": 0.2741428315639496, "learning_rate": 1.6573327628464897e-05, "loss": 0.6751, "step": 5374 }, { "epoch": 0.8213317034037514, "grad_norm": 0.4992486536502838, "learning_rate": 1.6545817337373172e-05, "loss": 0.712, "step": 5375 }, { "epoch": 0.8214845093020591, "grad_norm": 0.28491753339767456, "learning_rate": 1.6518327838190528e-05, "loss": 0.7427, "step": 5376 }, { "epoch": 0.8216373152003668, "grad_norm": 0.47695693373680115, "learning_rate": 1.64908591377657e-05, "loss": 0.9162, "step": 5377 }, { "epoch": 0.8217901210986744, "grad_norm": 0.29675573110580444, "learning_rate": 1.646341124294234e-05, "loss": 0.7819, "step": 5378 }, { "epoch": 0.8219429269969821, "grad_norm": 0.2657209634780884, "learning_rate": 1.643598416055885e-05, "loss": 0.7338, "step": 5379 }, { "epoch": 0.8220957328952898, "grad_norm": 0.28576889634132385, "learning_rate": 1.640857789744846e-05, "loss": 0.7603, "step": 5380 }, { "epoch": 0.8222485387935974, "grad_norm": 0.2834707498550415, "learning_rate": 1.6381192460439175e-05, "loss": 0.732, "step": 5381 }, { "epoch": 0.8224013446919051, "grad_norm": 0.23507724702358246, "learning_rate": 1.6353827856353864e-05, "loss": 0.6541, "step": 5382 }, { "epoch": 0.8225541505902128, "grad_norm": 0.30040469765663147, "learning_rate": 1.632648409201023e-05, "loss": 0.7025, "step": 5383 }, { "epoch": 0.8227069564885204, "grad_norm": 0.25531867146492004, "learning_rate": 1.62991611742207e-05, "loss": 0.5456, "step": 5384 }, { "epoch": 0.8228597623868281, "grad_norm": 0.310249388217926, "learning_rate": 1.6271859109792543e-05, "loss": 0.712, "step": 5385 }, { "epoch": 0.8230125682851358, "grad_norm": 0.4035734236240387, "learning_rate": 1.6244577905527868e-05, "loss": 0.6386, "step": 5386 }, { "epoch": 0.8231653741834435, "grad_norm": 0.2613977789878845, "learning_rate": 1.6217317568223523e-05, "loss": 0.5869, "step": 5387 }, { "epoch": 0.8233181800817512, "grad_norm": 0.27511075139045715, "learning_rate": 1.6190078104671245e-05, "loss": 0.7242, "step": 5388 }, { "epoch": 0.8234709859800589, "grad_norm": 0.3719506561756134, "learning_rate": 1.616285952165746e-05, "loss": 0.637, "step": 5389 }, { "epoch": 0.8236237918783665, "grad_norm": 0.2993394434452057, "learning_rate": 1.61356618259635e-05, "loss": 0.6588, "step": 5390 }, { "epoch": 0.8237765977766742, "grad_norm": 0.3319057822227478, "learning_rate": 1.6108485024365383e-05, "loss": 0.7413, "step": 5391 }, { "epoch": 0.8239294036749819, "grad_norm": 0.2575140595436096, "learning_rate": 1.6081329123634027e-05, "loss": 0.6268, "step": 5392 }, { "epoch": 0.8240822095732895, "grad_norm": 0.35643646121025085, "learning_rate": 1.605419413053514e-05, "loss": 0.7586, "step": 5393 }, { "epoch": 0.8242350154715972, "grad_norm": 0.28320401906967163, "learning_rate": 1.6027080051829058e-05, "loss": 0.5139, "step": 5394 }, { "epoch": 0.8243878213699048, "grad_norm": 0.30584996938705444, "learning_rate": 1.59999868942711e-05, "loss": 0.8343, "step": 5395 }, { "epoch": 0.8245406272682125, "grad_norm": 0.3051997125148773, "learning_rate": 1.5972914664611306e-05, "loss": 0.6096, "step": 5396 }, { "epoch": 0.8246934331665202, "grad_norm": 0.27233120799064636, "learning_rate": 1.5945863369594503e-05, "loss": 0.63, "step": 5397 }, { "epoch": 0.8248462390648279, "grad_norm": 0.2944967448711395, "learning_rate": 1.5918833015960243e-05, "loss": 0.7065, "step": 5398 }, { "epoch": 0.8249990449631356, "grad_norm": 0.2862212061882019, "learning_rate": 1.5891823610442925e-05, "loss": 0.8733, "step": 5399 }, { "epoch": 0.8251518508614433, "grad_norm": 0.3013235032558441, "learning_rate": 1.5864835159771763e-05, "loss": 0.5567, "step": 5400 }, { "epoch": 0.825304656759751, "grad_norm": 0.4039291441440582, "learning_rate": 1.5837867670670638e-05, "loss": 0.8828, "step": 5401 }, { "epoch": 0.8254574626580586, "grad_norm": 0.25634995102882385, "learning_rate": 1.581092114985834e-05, "loss": 0.7524, "step": 5402 }, { "epoch": 0.8256102685563663, "grad_norm": 0.301152765750885, "learning_rate": 1.5783995604048295e-05, "loss": 0.8011, "step": 5403 }, { "epoch": 0.8257630744546739, "grad_norm": 0.25691330432891846, "learning_rate": 1.5757091039948856e-05, "loss": 0.6929, "step": 5404 }, { "epoch": 0.8259158803529816, "grad_norm": 0.2751913368701935, "learning_rate": 1.573020746426299e-05, "loss": 0.7996, "step": 5405 }, { "epoch": 0.8260686862512893, "grad_norm": 0.3121042251586914, "learning_rate": 1.5703344883688586e-05, "loss": 0.7233, "step": 5406 }, { "epoch": 0.826221492149597, "grad_norm": 0.30473700165748596, "learning_rate": 1.56765033049182e-05, "loss": 0.8604, "step": 5407 }, { "epoch": 0.8263742980479046, "grad_norm": 0.28437185287475586, "learning_rate": 1.5649682734639147e-05, "loss": 0.737, "step": 5408 }, { "epoch": 0.8265271039462123, "grad_norm": 0.27961575984954834, "learning_rate": 1.56228831795336e-05, "loss": 0.602, "step": 5409 }, { "epoch": 0.82667990984452, "grad_norm": 0.2619366943836212, "learning_rate": 1.5596104646278443e-05, "loss": 0.5513, "step": 5410 }, { "epoch": 0.8268327157428277, "grad_norm": 0.3035143315792084, "learning_rate": 1.55693471415453e-05, "loss": 0.5348, "step": 5411 }, { "epoch": 0.8269855216411354, "grad_norm": 0.28375378251075745, "learning_rate": 1.5542610672000568e-05, "loss": 0.6483, "step": 5412 }, { "epoch": 0.827138327539443, "grad_norm": 0.28063929080963135, "learning_rate": 1.5515895244305435e-05, "loss": 0.6351, "step": 5413 }, { "epoch": 0.8272911334377507, "grad_norm": 0.25390905141830444, "learning_rate": 1.5489200865115838e-05, "loss": 0.5528, "step": 5414 }, { "epoch": 0.8274439393360584, "grad_norm": 0.3002142906188965, "learning_rate": 1.546252754108245e-05, "loss": 0.7665, "step": 5415 }, { "epoch": 0.827596745234366, "grad_norm": 0.25479069352149963, "learning_rate": 1.5435875278850664e-05, "loss": 0.6322, "step": 5416 }, { "epoch": 0.8277495511326737, "grad_norm": 0.32886984944343567, "learning_rate": 1.5409244085060704e-05, "loss": 0.6251, "step": 5417 }, { "epoch": 0.8279023570309814, "grad_norm": 0.3810743987560272, "learning_rate": 1.5382633966347527e-05, "loss": 0.6871, "step": 5418 }, { "epoch": 0.828055162929289, "grad_norm": 0.27458542585372925, "learning_rate": 1.5356044929340806e-05, "loss": 0.7093, "step": 5419 }, { "epoch": 0.8282079688275967, "grad_norm": 0.28523746132850647, "learning_rate": 1.5329476980664935e-05, "loss": 0.6043, "step": 5420 }, { "epoch": 0.8283607747259044, "grad_norm": 0.3243076205253601, "learning_rate": 1.530293012693913e-05, "loss": 0.6776, "step": 5421 }, { "epoch": 0.8285135806242121, "grad_norm": 0.2854920029640198, "learning_rate": 1.5276404374777353e-05, "loss": 0.6931, "step": 5422 }, { "epoch": 0.8286663865225198, "grad_norm": 0.27915722131729126, "learning_rate": 1.524989973078822e-05, "loss": 0.8883, "step": 5423 }, { "epoch": 0.8288191924208275, "grad_norm": 0.29778462648391724, "learning_rate": 1.5223416201575137e-05, "loss": 0.7446, "step": 5424 }, { "epoch": 0.8289719983191352, "grad_norm": 0.26901963353157043, "learning_rate": 1.5196953793736301e-05, "loss": 0.72, "step": 5425 }, { "epoch": 0.8291248042174428, "grad_norm": 0.30687153339385986, "learning_rate": 1.5170512513864543e-05, "loss": 0.7793, "step": 5426 }, { "epoch": 0.8292776101157505, "grad_norm": 0.37296929955482483, "learning_rate": 1.5144092368547513e-05, "loss": 0.6609, "step": 5427 }, { "epoch": 0.8294304160140581, "grad_norm": 0.28542792797088623, "learning_rate": 1.511769336436759e-05, "loss": 0.7416, "step": 5428 }, { "epoch": 0.8295832219123658, "grad_norm": 0.2609799802303314, "learning_rate": 1.5091315507901838e-05, "loss": 0.8109, "step": 5429 }, { "epoch": 0.8297360278106735, "grad_norm": 0.35812559723854065, "learning_rate": 1.5064958805722074e-05, "loss": 0.8972, "step": 5430 }, { "epoch": 0.8298888337089811, "grad_norm": 0.3091167211532593, "learning_rate": 1.5038623264394846e-05, "loss": 0.6813, "step": 5431 }, { "epoch": 0.8300416396072888, "grad_norm": 0.3029838800430298, "learning_rate": 1.5012308890481474e-05, "loss": 0.8768, "step": 5432 }, { "epoch": 0.8301944455055965, "grad_norm": 0.37332555651664734, "learning_rate": 1.4986015690537924e-05, "loss": 0.7508, "step": 5433 }, { "epoch": 0.8303472514039042, "grad_norm": 0.292579710483551, "learning_rate": 1.4959743671114924e-05, "loss": 0.5804, "step": 5434 }, { "epoch": 0.8305000573022119, "grad_norm": 0.28892597556114197, "learning_rate": 1.4933492838757933e-05, "loss": 0.7084, "step": 5435 }, { "epoch": 0.8306528632005196, "grad_norm": 0.41257259249687195, "learning_rate": 1.490726320000716e-05, "loss": 0.7625, "step": 5436 }, { "epoch": 0.8308056690988272, "grad_norm": 0.2837768495082855, "learning_rate": 1.4881054761397472e-05, "loss": 0.7145, "step": 5437 }, { "epoch": 0.8309584749971349, "grad_norm": 0.2908405065536499, "learning_rate": 1.4854867529458461e-05, "loss": 0.7271, "step": 5438 }, { "epoch": 0.8311112808954426, "grad_norm": 0.3113293945789337, "learning_rate": 1.4828701510714494e-05, "loss": 0.6935, "step": 5439 }, { "epoch": 0.8312640867937502, "grad_norm": 0.2733793258666992, "learning_rate": 1.480255671168458e-05, "loss": 0.6754, "step": 5440 }, { "epoch": 0.8314168926920579, "grad_norm": 0.39152440428733826, "learning_rate": 1.4776433138882507e-05, "loss": 0.6892, "step": 5441 }, { "epoch": 0.8315696985903656, "grad_norm": 0.3532632887363434, "learning_rate": 1.4750330798816714e-05, "loss": 0.6027, "step": 5442 }, { "epoch": 0.8317225044886732, "grad_norm": 0.3093455135822296, "learning_rate": 1.4724249697990412e-05, "loss": 0.7746, "step": 5443 }, { "epoch": 0.8318753103869809, "grad_norm": 0.32264313101768494, "learning_rate": 1.4698189842901455e-05, "loss": 0.7218, "step": 5444 }, { "epoch": 0.8320281162852886, "grad_norm": 0.2763960063457489, "learning_rate": 1.4672151240042475e-05, "loss": 0.7134, "step": 5445 }, { "epoch": 0.8321809221835963, "grad_norm": 0.30286088585853577, "learning_rate": 1.464613389590076e-05, "loss": 0.6322, "step": 5446 }, { "epoch": 0.832333728081904, "grad_norm": 0.27298229932785034, "learning_rate": 1.4620137816958269e-05, "loss": 0.6095, "step": 5447 }, { "epoch": 0.8324865339802117, "grad_norm": 0.30183646082878113, "learning_rate": 1.4594163009691741e-05, "loss": 0.7774, "step": 5448 }, { "epoch": 0.8326393398785193, "grad_norm": 0.3693987727165222, "learning_rate": 1.4568209480572615e-05, "loss": 0.5401, "step": 5449 }, { "epoch": 0.832792145776827, "grad_norm": 0.3332158327102661, "learning_rate": 1.454227723606696e-05, "loss": 0.5118, "step": 5450 }, { "epoch": 0.8329449516751347, "grad_norm": 0.2714625895023346, "learning_rate": 1.4516366282635552e-05, "loss": 0.6193, "step": 5451 }, { "epoch": 0.8330977575734423, "grad_norm": 0.2512652277946472, "learning_rate": 1.4490476626733907e-05, "loss": 0.7225, "step": 5452 }, { "epoch": 0.83325056347175, "grad_norm": 0.32431286573410034, "learning_rate": 1.446460827481223e-05, "loss": 0.762, "step": 5453 }, { "epoch": 0.8334033693700577, "grad_norm": 0.26185527443885803, "learning_rate": 1.4438761233315445e-05, "loss": 0.6317, "step": 5454 }, { "epoch": 0.8335561752683653, "grad_norm": 0.2889951169490814, "learning_rate": 1.4412935508683024e-05, "loss": 0.5021, "step": 5455 }, { "epoch": 0.833708981166673, "grad_norm": 0.2649388909339905, "learning_rate": 1.4387131107349295e-05, "loss": 0.544, "step": 5456 }, { "epoch": 0.8338617870649807, "grad_norm": 0.26375481486320496, "learning_rate": 1.4361348035743205e-05, "loss": 0.7273, "step": 5457 }, { "epoch": 0.8340145929632884, "grad_norm": 0.3317602574825287, "learning_rate": 1.4335586300288385e-05, "loss": 0.7108, "step": 5458 }, { "epoch": 0.8341673988615961, "grad_norm": 0.37609806656837463, "learning_rate": 1.430984590740313e-05, "loss": 0.6881, "step": 5459 }, { "epoch": 0.8343202047599038, "grad_norm": 0.245027095079422, "learning_rate": 1.4284126863500457e-05, "loss": 0.7448, "step": 5460 }, { "epoch": 0.8344730106582114, "grad_norm": 0.34146881103515625, "learning_rate": 1.4258429174988086e-05, "loss": 0.9717, "step": 5461 }, { "epoch": 0.8346258165565191, "grad_norm": 0.331993043422699, "learning_rate": 1.4232752848268317e-05, "loss": 0.6564, "step": 5462 }, { "epoch": 0.8347786224548267, "grad_norm": 0.23571527004241943, "learning_rate": 1.4207097889738253e-05, "loss": 0.6448, "step": 5463 }, { "epoch": 0.8349314283531344, "grad_norm": 0.2884713411331177, "learning_rate": 1.4181464305789583e-05, "loss": 0.6051, "step": 5464 }, { "epoch": 0.8350842342514421, "grad_norm": 0.30997729301452637, "learning_rate": 1.4155852102808686e-05, "loss": 0.7864, "step": 5465 }, { "epoch": 0.8352370401497498, "grad_norm": 0.42678937315940857, "learning_rate": 1.4130261287176627e-05, "loss": 0.7529, "step": 5466 }, { "epoch": 0.8353898460480574, "grad_norm": 0.23659348487854004, "learning_rate": 1.4104691865269193e-05, "loss": 0.5946, "step": 5467 }, { "epoch": 0.8355426519463651, "grad_norm": 0.27591216564178467, "learning_rate": 1.4079143843456743e-05, "loss": 0.6855, "step": 5468 }, { "epoch": 0.8356954578446728, "grad_norm": 0.3251248002052307, "learning_rate": 1.4053617228104343e-05, "loss": 0.7527, "step": 5469 }, { "epoch": 0.8358482637429805, "grad_norm": 0.28075751662254333, "learning_rate": 1.402811202557176e-05, "loss": 0.5165, "step": 5470 }, { "epoch": 0.8360010696412882, "grad_norm": 0.2756359279155731, "learning_rate": 1.4002628242213422e-05, "loss": 0.5895, "step": 5471 }, { "epoch": 0.8361538755395959, "grad_norm": 0.30491769313812256, "learning_rate": 1.3977165884378362e-05, "loss": 0.721, "step": 5472 }, { "epoch": 0.8363066814379035, "grad_norm": 0.4730520248413086, "learning_rate": 1.3951724958410317e-05, "loss": 0.6908, "step": 5473 }, { "epoch": 0.8364594873362112, "grad_norm": 0.3533181846141815, "learning_rate": 1.3926305470647682e-05, "loss": 0.664, "step": 5474 }, { "epoch": 0.8366122932345188, "grad_norm": 0.3893337547779083, "learning_rate": 1.3900907427423537e-05, "loss": 0.8438, "step": 5475 }, { "epoch": 0.8367650991328265, "grad_norm": 0.2789744436740875, "learning_rate": 1.3875530835065576e-05, "loss": 0.6695, "step": 5476 }, { "epoch": 0.8369179050311342, "grad_norm": 0.2796769440174103, "learning_rate": 1.3850175699896128e-05, "loss": 0.8565, "step": 5477 }, { "epoch": 0.8370707109294419, "grad_norm": 0.3079114258289337, "learning_rate": 1.3824842028232265e-05, "loss": 0.7301, "step": 5478 }, { "epoch": 0.8372235168277495, "grad_norm": 0.2383667379617691, "learning_rate": 1.3799529826385616e-05, "loss": 0.5826, "step": 5479 }, { "epoch": 0.8373763227260572, "grad_norm": 0.27189722657203674, "learning_rate": 1.3774239100662545e-05, "loss": 0.7496, "step": 5480 }, { "epoch": 0.8375291286243649, "grad_norm": 0.28391632437705994, "learning_rate": 1.374896985736398e-05, "loss": 0.8122, "step": 5481 }, { "epoch": 0.8376819345226726, "grad_norm": 0.4183500409126282, "learning_rate": 1.3723722102785575e-05, "loss": 0.8417, "step": 5482 }, { "epoch": 0.8378347404209803, "grad_norm": 0.2729761302471161, "learning_rate": 1.3698495843217574e-05, "loss": 0.7305, "step": 5483 }, { "epoch": 0.837987546319288, "grad_norm": 0.31188705563545227, "learning_rate": 1.3673291084944916e-05, "loss": 0.7001, "step": 5484 }, { "epoch": 0.8381403522175956, "grad_norm": 0.27931174635887146, "learning_rate": 1.3648107834247137e-05, "loss": 0.788, "step": 5485 }, { "epoch": 0.8382931581159033, "grad_norm": 0.2955465018749237, "learning_rate": 1.3622946097398415e-05, "loss": 0.6219, "step": 5486 }, { "epoch": 0.8384459640142109, "grad_norm": 0.2784363329410553, "learning_rate": 1.3597805880667591e-05, "loss": 0.7067, "step": 5487 }, { "epoch": 0.8385987699125186, "grad_norm": 0.2599412202835083, "learning_rate": 1.3572687190318167e-05, "loss": 0.6021, "step": 5488 }, { "epoch": 0.8387515758108263, "grad_norm": 0.2823382616043091, "learning_rate": 1.3547590032608271e-05, "loss": 0.7634, "step": 5489 }, { "epoch": 0.838904381709134, "grad_norm": 1.0645288228988647, "learning_rate": 1.3522514413790577e-05, "loss": 1.0043, "step": 5490 }, { "epoch": 0.8390571876074416, "grad_norm": 0.29358312487602234, "learning_rate": 1.34974603401125e-05, "loss": 0.8486, "step": 5491 }, { "epoch": 0.8392099935057493, "grad_norm": 0.32983365654945374, "learning_rate": 1.3472427817816047e-05, "loss": 0.8058, "step": 5492 }, { "epoch": 0.839362799404057, "grad_norm": 0.3272798955440521, "learning_rate": 1.3447416853137907e-05, "loss": 0.6448, "step": 5493 }, { "epoch": 0.8395156053023647, "grad_norm": 0.29118168354034424, "learning_rate": 1.3422427452309305e-05, "loss": 0.6715, "step": 5494 }, { "epoch": 0.8396684112006724, "grad_norm": 0.31547603011131287, "learning_rate": 1.339745962155613e-05, "loss": 0.7579, "step": 5495 }, { "epoch": 0.83982121709898, "grad_norm": 0.28353336453437805, "learning_rate": 1.337251336709896e-05, "loss": 0.6519, "step": 5496 }, { "epoch": 0.8399740229972877, "grad_norm": 0.2982615530490875, "learning_rate": 1.334758869515288e-05, "loss": 0.7229, "step": 5497 }, { "epoch": 0.8401268288955954, "grad_norm": 0.307255357503891, "learning_rate": 1.332268561192771e-05, "loss": 0.893, "step": 5498 }, { "epoch": 0.840279634793903, "grad_norm": 0.37311118841171265, "learning_rate": 1.3297804123627822e-05, "loss": 0.7039, "step": 5499 }, { "epoch": 0.8404324406922107, "grad_norm": 0.3093119263648987, "learning_rate": 1.3272944236452256e-05, "loss": 0.6056, "step": 5500 }, { "epoch": 0.8405852465905184, "grad_norm": 0.26752883195877075, "learning_rate": 1.3248105956594592e-05, "loss": 0.5392, "step": 5501 }, { "epoch": 0.840738052488826, "grad_norm": 0.3182859420776367, "learning_rate": 1.3223289290243147e-05, "loss": 0.5405, "step": 5502 }, { "epoch": 0.8408908583871337, "grad_norm": 0.2866218686103821, "learning_rate": 1.319849424358075e-05, "loss": 0.6926, "step": 5503 }, { "epoch": 0.8410436642854414, "grad_norm": 0.2911316454410553, "learning_rate": 1.3173720822784852e-05, "loss": 0.7432, "step": 5504 }, { "epoch": 0.8411964701837491, "grad_norm": 0.370292067527771, "learning_rate": 1.3148969034027569e-05, "loss": 0.7433, "step": 5505 }, { "epoch": 0.8413492760820568, "grad_norm": 0.3643721640110016, "learning_rate": 1.3124238883475626e-05, "loss": 0.6179, "step": 5506 }, { "epoch": 0.8415020819803645, "grad_norm": 0.4829062819480896, "learning_rate": 1.3099530377290314e-05, "loss": 0.7343, "step": 5507 }, { "epoch": 0.8416548878786722, "grad_norm": 0.3859752416610718, "learning_rate": 1.3074843521627522e-05, "loss": 0.5284, "step": 5508 }, { "epoch": 0.8418076937769798, "grad_norm": 0.26677191257476807, "learning_rate": 1.3050178322637784e-05, "loss": 0.5418, "step": 5509 }, { "epoch": 0.8419604996752875, "grad_norm": 0.33165523409843445, "learning_rate": 1.3025534786466275e-05, "loss": 0.7229, "step": 5510 }, { "epoch": 0.8421133055735951, "grad_norm": 0.3077392876148224, "learning_rate": 1.3000912919252683e-05, "loss": 0.7913, "step": 5511 }, { "epoch": 0.8422661114719028, "grad_norm": 0.31485605239868164, "learning_rate": 1.2976312727131323e-05, "loss": 0.8463, "step": 5512 }, { "epoch": 0.8424189173702105, "grad_norm": 0.27010369300842285, "learning_rate": 1.2951734216231148e-05, "loss": 0.5581, "step": 5513 }, { "epoch": 0.8425717232685181, "grad_norm": 0.2709108293056488, "learning_rate": 1.2927177392675715e-05, "loss": 0.6104, "step": 5514 }, { "epoch": 0.8427245291668258, "grad_norm": 0.2696745991706848, "learning_rate": 1.290264226258312e-05, "loss": 0.5627, "step": 5515 }, { "epoch": 0.8428773350651335, "grad_norm": 0.3353211283683777, "learning_rate": 1.2878128832066073e-05, "loss": 0.6656, "step": 5516 }, { "epoch": 0.8430301409634412, "grad_norm": 0.2736285924911499, "learning_rate": 1.285363710723192e-05, "loss": 0.6254, "step": 5517 }, { "epoch": 0.8431829468617489, "grad_norm": 0.319490909576416, "learning_rate": 1.2829167094182537e-05, "loss": 0.5857, "step": 5518 }, { "epoch": 0.8433357527600566, "grad_norm": 0.30264532566070557, "learning_rate": 1.2804718799014459e-05, "loss": 0.7828, "step": 5519 }, { "epoch": 0.8434885586583643, "grad_norm": 0.5836649537086487, "learning_rate": 1.2780292227818735e-05, "loss": 0.6535, "step": 5520 }, { "epoch": 0.8436413645566719, "grad_norm": 0.27647146582603455, "learning_rate": 1.2755887386681076e-05, "loss": 0.6675, "step": 5521 }, { "epoch": 0.8437941704549795, "grad_norm": 0.28818479180336, "learning_rate": 1.2731504281681705e-05, "loss": 0.6968, "step": 5522 }, { "epoch": 0.8439469763532872, "grad_norm": 0.33592307567596436, "learning_rate": 1.2707142918895498e-05, "loss": 0.6931, "step": 5523 }, { "epoch": 0.8440997822515949, "grad_norm": 0.2817316949367523, "learning_rate": 1.268280330439191e-05, "loss": 0.6263, "step": 5524 }, { "epoch": 0.8442525881499026, "grad_norm": 0.4386361539363861, "learning_rate": 1.2658485444234869e-05, "loss": 0.4877, "step": 5525 }, { "epoch": 0.8444053940482102, "grad_norm": 0.32742446660995483, "learning_rate": 1.2634189344483028e-05, "loss": 0.8693, "step": 5526 }, { "epoch": 0.8445581999465179, "grad_norm": 0.30600279569625854, "learning_rate": 1.2609915011189533e-05, "loss": 0.5613, "step": 5527 }, { "epoch": 0.8447110058448256, "grad_norm": 0.3841206133365631, "learning_rate": 1.2585662450402158e-05, "loss": 0.684, "step": 5528 }, { "epoch": 0.8448638117431333, "grad_norm": 0.29146626591682434, "learning_rate": 1.2561431668163204e-05, "loss": 0.5726, "step": 5529 }, { "epoch": 0.845016617641441, "grad_norm": 0.4497874677181244, "learning_rate": 1.2537222670509563e-05, "loss": 0.8621, "step": 5530 }, { "epoch": 0.8451694235397487, "grad_norm": 0.31416016817092896, "learning_rate": 1.25130354634727e-05, "loss": 0.602, "step": 5531 }, { "epoch": 0.8453222294380563, "grad_norm": 0.41826075315475464, "learning_rate": 1.2488870053078682e-05, "loss": 0.5775, "step": 5532 }, { "epoch": 0.845475035336364, "grad_norm": 0.2680056393146515, "learning_rate": 1.2464726445348106e-05, "loss": 0.6435, "step": 5533 }, { "epoch": 0.8456278412346716, "grad_norm": 0.3037991225719452, "learning_rate": 1.2440604646296117e-05, "loss": 0.5587, "step": 5534 }, { "epoch": 0.8457806471329793, "grad_norm": 0.3998739421367645, "learning_rate": 1.2416504661932516e-05, "loss": 0.5474, "step": 5535 }, { "epoch": 0.845933453031287, "grad_norm": 0.35582205653190613, "learning_rate": 1.2392426498261556e-05, "loss": 0.6253, "step": 5536 }, { "epoch": 0.8460862589295947, "grad_norm": 0.3369934856891632, "learning_rate": 1.236837016128215e-05, "loss": 0.7949, "step": 5537 }, { "epoch": 0.8462390648279023, "grad_norm": 0.25886452198028564, "learning_rate": 1.2344335656987704e-05, "loss": 0.8035, "step": 5538 }, { "epoch": 0.84639187072621, "grad_norm": 0.4257791340351105, "learning_rate": 1.232032299136624e-05, "loss": 0.7621, "step": 5539 }, { "epoch": 0.8465446766245177, "grad_norm": 0.2683560848236084, "learning_rate": 1.2296332170400281e-05, "loss": 0.8101, "step": 5540 }, { "epoch": 0.8466974825228254, "grad_norm": 0.30207762122154236, "learning_rate": 1.2272363200066983e-05, "loss": 0.5819, "step": 5541 }, { "epoch": 0.8468502884211331, "grad_norm": 0.4459848403930664, "learning_rate": 1.2248416086337977e-05, "loss": 0.7585, "step": 5542 }, { "epoch": 0.8470030943194408, "grad_norm": 0.3064769506454468, "learning_rate": 1.222449083517948e-05, "loss": 0.6841, "step": 5543 }, { "epoch": 0.8471559002177484, "grad_norm": 0.4430690407752991, "learning_rate": 1.2200587452552281e-05, "loss": 0.5349, "step": 5544 }, { "epoch": 0.8473087061160561, "grad_norm": 0.27452754974365234, "learning_rate": 1.2176705944411726e-05, "loss": 0.8026, "step": 5545 }, { "epoch": 0.8474615120143637, "grad_norm": 0.25186803936958313, "learning_rate": 1.2152846316707678e-05, "loss": 0.7354, "step": 5546 }, { "epoch": 0.8476143179126714, "grad_norm": 0.2482818365097046, "learning_rate": 1.2129008575384537e-05, "loss": 0.7683, "step": 5547 }, { "epoch": 0.8477671238109791, "grad_norm": 0.25365763902664185, "learning_rate": 1.2105192726381298e-05, "loss": 0.6313, "step": 5548 }, { "epoch": 0.8479199297092868, "grad_norm": 0.2693358361721039, "learning_rate": 1.2081398775631502e-05, "loss": 0.6479, "step": 5549 }, { "epoch": 0.8480727356075944, "grad_norm": 0.35001295804977417, "learning_rate": 1.2057626729063198e-05, "loss": 0.8097, "step": 5550 }, { "epoch": 0.8482255415059021, "grad_norm": 0.35767117142677307, "learning_rate": 1.2033876592598959e-05, "loss": 0.7211, "step": 5551 }, { "epoch": 0.8483783474042098, "grad_norm": 0.3254198431968689, "learning_rate": 1.201014837215595e-05, "loss": 0.8238, "step": 5552 }, { "epoch": 0.8485311533025175, "grad_norm": 0.2950455844402313, "learning_rate": 1.1986442073645899e-05, "loss": 0.6653, "step": 5553 }, { "epoch": 0.8486839592008252, "grad_norm": 0.3335111439228058, "learning_rate": 1.196275770297497e-05, "loss": 0.7169, "step": 5554 }, { "epoch": 0.8488367650991329, "grad_norm": 0.3324906826019287, "learning_rate": 1.1939095266043976e-05, "loss": 0.6226, "step": 5555 }, { "epoch": 0.8489895709974405, "grad_norm": 0.25888141989707947, "learning_rate": 1.1915454768748191e-05, "loss": 0.8281, "step": 5556 }, { "epoch": 0.8491423768957482, "grad_norm": 0.26205694675445557, "learning_rate": 1.1891836216977426e-05, "loss": 0.5808, "step": 5557 }, { "epoch": 0.8492951827940558, "grad_norm": 0.37880903482437134, "learning_rate": 1.1868239616616073e-05, "loss": 0.8139, "step": 5558 }, { "epoch": 0.8494479886923635, "grad_norm": 0.2531372010707855, "learning_rate": 1.1844664973543029e-05, "loss": 0.7087, "step": 5559 }, { "epoch": 0.8496007945906712, "grad_norm": 0.2695479989051819, "learning_rate": 1.182111229363172e-05, "loss": 0.711, "step": 5560 }, { "epoch": 0.8497536004889789, "grad_norm": 0.3131905198097229, "learning_rate": 1.1797581582750062e-05, "loss": 0.4314, "step": 5561 }, { "epoch": 0.8499064063872865, "grad_norm": 0.3199729919433594, "learning_rate": 1.1774072846760565e-05, "loss": 0.6371, "step": 5562 }, { "epoch": 0.8500592122855942, "grad_norm": 0.3494158685207367, "learning_rate": 1.1750586091520244e-05, "loss": 0.8639, "step": 5563 }, { "epoch": 0.8502120181839019, "grad_norm": 0.30111271142959595, "learning_rate": 1.1727121322880607e-05, "loss": 0.6583, "step": 5564 }, { "epoch": 0.8503648240822096, "grad_norm": 0.6418349146842957, "learning_rate": 1.1703678546687701e-05, "loss": 0.6721, "step": 5565 }, { "epoch": 0.8505176299805173, "grad_norm": 0.2809236943721771, "learning_rate": 1.1680257768782098e-05, "loss": 0.7419, "step": 5566 }, { "epoch": 0.850670435878825, "grad_norm": 0.2570629417896271, "learning_rate": 1.1656858994998909e-05, "loss": 0.5666, "step": 5567 }, { "epoch": 0.8508232417771326, "grad_norm": 0.28726086020469666, "learning_rate": 1.1633482231167736e-05, "loss": 0.64, "step": 5568 }, { "epoch": 0.8509760476754402, "grad_norm": 0.4716130495071411, "learning_rate": 1.1610127483112665e-05, "loss": 0.728, "step": 5569 }, { "epoch": 0.8511288535737479, "grad_norm": 0.25644704699516296, "learning_rate": 1.1586794756652374e-05, "loss": 0.9191, "step": 5570 }, { "epoch": 0.8512816594720556, "grad_norm": 0.27911999821662903, "learning_rate": 1.1563484057600028e-05, "loss": 0.5204, "step": 5571 }, { "epoch": 0.8514344653703633, "grad_norm": 0.3576193153858185, "learning_rate": 1.1540195391763265e-05, "loss": 0.712, "step": 5572 }, { "epoch": 0.851587271268671, "grad_norm": 0.27247634530067444, "learning_rate": 1.1516928764944257e-05, "loss": 0.7406, "step": 5573 }, { "epoch": 0.8517400771669786, "grad_norm": 0.2926981747150421, "learning_rate": 1.1493684182939712e-05, "loss": 0.6532, "step": 5574 }, { "epoch": 0.8518928830652863, "grad_norm": 0.2958250939846039, "learning_rate": 1.1470461651540787e-05, "loss": 0.7098, "step": 5575 }, { "epoch": 0.852045688963594, "grad_norm": 0.3859553337097168, "learning_rate": 1.144726117653322e-05, "loss": 0.5666, "step": 5576 }, { "epoch": 0.8521984948619017, "grad_norm": 0.24316290020942688, "learning_rate": 1.1424082763697186e-05, "loss": 0.69, "step": 5577 }, { "epoch": 0.8523513007602094, "grad_norm": 0.29982128739356995, "learning_rate": 1.1400926418807423e-05, "loss": 0.8659, "step": 5578 }, { "epoch": 0.8525041066585171, "grad_norm": 0.307070255279541, "learning_rate": 1.1377792147633092e-05, "loss": 0.8201, "step": 5579 }, { "epoch": 0.8526569125568247, "grad_norm": 0.24706871807575226, "learning_rate": 1.1354679955937963e-05, "loss": 0.7248, "step": 5580 }, { "epoch": 0.8528097184551323, "grad_norm": 0.45683878660202026, "learning_rate": 1.1331589849480207e-05, "loss": 0.9072, "step": 5581 }, { "epoch": 0.85296252435344, "grad_norm": 0.3122950494289398, "learning_rate": 1.1308521834012509e-05, "loss": 0.7126, "step": 5582 }, { "epoch": 0.8531153302517477, "grad_norm": 0.24352186918258667, "learning_rate": 1.1285475915282106e-05, "loss": 0.5917, "step": 5583 }, { "epoch": 0.8532681361500554, "grad_norm": 0.26959019899368286, "learning_rate": 1.1262452099030684e-05, "loss": 0.7065, "step": 5584 }, { "epoch": 0.853420942048363, "grad_norm": 0.3406347632408142, "learning_rate": 1.1239450390994487e-05, "loss": 0.6882, "step": 5585 }, { "epoch": 0.8535737479466707, "grad_norm": 0.2712723910808563, "learning_rate": 1.1216470796904099e-05, "loss": 0.7421, "step": 5586 }, { "epoch": 0.8537265538449784, "grad_norm": 0.26714420318603516, "learning_rate": 1.119351332248474e-05, "loss": 0.7186, "step": 5587 }, { "epoch": 0.8538793597432861, "grad_norm": 0.255027711391449, "learning_rate": 1.1170577973456097e-05, "loss": 0.5784, "step": 5588 }, { "epoch": 0.8540321656415938, "grad_norm": 0.6876900792121887, "learning_rate": 1.1147664755532272e-05, "loss": 0.6206, "step": 5589 }, { "epoch": 0.8541849715399015, "grad_norm": 0.31581729650497437, "learning_rate": 1.1124773674421951e-05, "loss": 0.5449, "step": 5590 }, { "epoch": 0.8543377774382092, "grad_norm": 0.291388601064682, "learning_rate": 1.1101904735828206e-05, "loss": 0.6831, "step": 5591 }, { "epoch": 0.8544905833365168, "grad_norm": 0.2843266725540161, "learning_rate": 1.1079057945448678e-05, "loss": 0.6111, "step": 5592 }, { "epoch": 0.8546433892348244, "grad_norm": 0.4129765033721924, "learning_rate": 1.1056233308975428e-05, "loss": 0.6729, "step": 5593 }, { "epoch": 0.8547961951331321, "grad_norm": 0.29213792085647583, "learning_rate": 1.1033430832095049e-05, "loss": 0.796, "step": 5594 }, { "epoch": 0.8549490010314398, "grad_norm": 0.277227520942688, "learning_rate": 1.1010650520488564e-05, "loss": 0.4999, "step": 5595 }, { "epoch": 0.8551018069297475, "grad_norm": 0.2723330855369568, "learning_rate": 1.09878923798315e-05, "loss": 0.7722, "step": 5596 }, { "epoch": 0.8552546128280551, "grad_norm": 0.25330761075019836, "learning_rate": 1.0965156415793843e-05, "loss": 0.6138, "step": 5597 }, { "epoch": 0.8554074187263628, "grad_norm": 0.28613653779029846, "learning_rate": 1.0942442634040118e-05, "loss": 0.8158, "step": 5598 }, { "epoch": 0.8555602246246705, "grad_norm": 0.2570474445819855, "learning_rate": 1.0919751040229231e-05, "loss": 0.6388, "step": 5599 }, { "epoch": 0.8557130305229782, "grad_norm": 0.260789155960083, "learning_rate": 1.0897081640014594e-05, "loss": 0.8127, "step": 5600 }, { "epoch": 0.8558658364212859, "grad_norm": 0.30257099866867065, "learning_rate": 1.0874434439044122e-05, "loss": 0.697, "step": 5601 }, { "epoch": 0.8560186423195936, "grad_norm": 0.28331151604652405, "learning_rate": 1.085180944296018e-05, "loss": 0.646, "step": 5602 }, { "epoch": 0.8561714482179013, "grad_norm": 0.31558719277381897, "learning_rate": 1.0829206657399581e-05, "loss": 0.7261, "step": 5603 }, { "epoch": 0.8563242541162089, "grad_norm": 0.30309322476387024, "learning_rate": 1.080662608799361e-05, "loss": 0.6796, "step": 5604 }, { "epoch": 0.8564770600145165, "grad_norm": 0.2481728047132492, "learning_rate": 1.0784067740368032e-05, "loss": 0.5802, "step": 5605 }, { "epoch": 0.8566298659128242, "grad_norm": 0.2914709448814392, "learning_rate": 1.0761531620143106e-05, "loss": 0.7447, "step": 5606 }, { "epoch": 0.8567826718111319, "grad_norm": 0.32431426644325256, "learning_rate": 1.0739017732933476e-05, "loss": 0.5631, "step": 5607 }, { "epoch": 0.8569354777094396, "grad_norm": 0.24270617961883545, "learning_rate": 1.0716526084348277e-05, "loss": 0.5381, "step": 5608 }, { "epoch": 0.8570882836077472, "grad_norm": 0.27154895663261414, "learning_rate": 1.069405667999115e-05, "loss": 0.6087, "step": 5609 }, { "epoch": 0.8572410895060549, "grad_norm": 0.3157510459423065, "learning_rate": 1.0671609525460158e-05, "loss": 0.7442, "step": 5610 }, { "epoch": 0.8573938954043626, "grad_norm": 0.2895173728466034, "learning_rate": 1.0649184626347807e-05, "loss": 0.7309, "step": 5611 }, { "epoch": 0.8575467013026703, "grad_norm": 0.2819909155368805, "learning_rate": 1.0626781988241064e-05, "loss": 0.6276, "step": 5612 }, { "epoch": 0.857699507200978, "grad_norm": 0.3299407660961151, "learning_rate": 1.0604401616721371e-05, "loss": 0.8517, "step": 5613 }, { "epoch": 0.8578523130992857, "grad_norm": 0.3332931697368622, "learning_rate": 1.0582043517364604e-05, "loss": 0.7648, "step": 5614 }, { "epoch": 0.8580051189975934, "grad_norm": 0.3062601089477539, "learning_rate": 1.0559707695741083e-05, "loss": 0.7078, "step": 5615 }, { "epoch": 0.858157924895901, "grad_norm": 0.23787756264209747, "learning_rate": 1.0537394157415637e-05, "loss": 0.6386, "step": 5616 }, { "epoch": 0.8583107307942086, "grad_norm": 0.2560744285583496, "learning_rate": 1.0515102907947461e-05, "loss": 0.7632, "step": 5617 }, { "epoch": 0.8584635366925163, "grad_norm": 0.3579595685005188, "learning_rate": 1.0492833952890225e-05, "loss": 0.7741, "step": 5618 }, { "epoch": 0.858616342590824, "grad_norm": 0.2760065793991089, "learning_rate": 1.0470587297792056e-05, "loss": 0.691, "step": 5619 }, { "epoch": 0.8587691484891317, "grad_norm": 0.3856504559516907, "learning_rate": 1.0448362948195567e-05, "loss": 0.679, "step": 5620 }, { "epoch": 0.8589219543874393, "grad_norm": 0.3312358558177948, "learning_rate": 1.0426160909637694e-05, "loss": 0.8009, "step": 5621 }, { "epoch": 0.859074760285747, "grad_norm": 0.3192104995250702, "learning_rate": 1.0403981187649936e-05, "loss": 0.6937, "step": 5622 }, { "epoch": 0.8592275661840547, "grad_norm": 0.2923565208911896, "learning_rate": 1.038182378775816e-05, "loss": 0.8538, "step": 5623 }, { "epoch": 0.8593803720823624, "grad_norm": 0.2979848384857178, "learning_rate": 1.0359688715482741e-05, "loss": 0.6309, "step": 5624 }, { "epoch": 0.8595331779806701, "grad_norm": 0.41364747285842896, "learning_rate": 1.033757597633841e-05, "loss": 0.6971, "step": 5625 }, { "epoch": 0.8596859838789778, "grad_norm": 0.2538832724094391, "learning_rate": 1.031548557583436e-05, "loss": 0.5638, "step": 5626 }, { "epoch": 0.8598387897772855, "grad_norm": 0.32028117775917053, "learning_rate": 1.0293417519474268e-05, "loss": 0.8821, "step": 5627 }, { "epoch": 0.859991595675593, "grad_norm": 0.417066365480423, "learning_rate": 1.0271371812756158e-05, "loss": 0.7425, "step": 5628 }, { "epoch": 0.8601444015739007, "grad_norm": 0.36033546924591064, "learning_rate": 1.024934846117257e-05, "loss": 0.692, "step": 5629 }, { "epoch": 0.8602972074722084, "grad_norm": 0.2695680260658264, "learning_rate": 1.0227347470210413e-05, "loss": 0.6444, "step": 5630 }, { "epoch": 0.8604500133705161, "grad_norm": 0.28025510907173157, "learning_rate": 1.0205368845351082e-05, "loss": 0.7348, "step": 5631 }, { "epoch": 0.8606028192688238, "grad_norm": 0.27903392910957336, "learning_rate": 1.0183412592070319e-05, "loss": 0.5588, "step": 5632 }, { "epoch": 0.8607556251671314, "grad_norm": 0.29430249333381653, "learning_rate": 1.016147871583839e-05, "loss": 0.7455, "step": 5633 }, { "epoch": 0.8609084310654391, "grad_norm": 0.27932068705558777, "learning_rate": 1.0139567222119906e-05, "loss": 0.5564, "step": 5634 }, { "epoch": 0.8610612369637468, "grad_norm": 0.3375709354877472, "learning_rate": 1.0117678116373929e-05, "loss": 0.6618, "step": 5635 }, { "epoch": 0.8612140428620545, "grad_norm": 0.2798751890659332, "learning_rate": 1.0095811404053946e-05, "loss": 0.7448, "step": 5636 }, { "epoch": 0.8613668487603622, "grad_norm": 0.27346816658973694, "learning_rate": 1.0073967090607894e-05, "loss": 0.7263, "step": 5637 }, { "epoch": 0.8615196546586699, "grad_norm": 0.29813331365585327, "learning_rate": 1.005214518147809e-05, "loss": 0.6409, "step": 5638 }, { "epoch": 0.8616724605569775, "grad_norm": 0.29515641927719116, "learning_rate": 1.0030345682101239e-05, "loss": 0.7254, "step": 5639 }, { "epoch": 0.8618252664552851, "grad_norm": 0.38635021448135376, "learning_rate": 1.0008568597908542e-05, "loss": 0.6273, "step": 5640 }, { "epoch": 0.8619780723535928, "grad_norm": 0.26836127042770386, "learning_rate": 9.986813934325589e-06, "loss": 0.6707, "step": 5641 }, { "epoch": 0.8621308782519005, "grad_norm": 0.2930348515510559, "learning_rate": 9.965081696772349e-06, "loss": 0.6082, "step": 5642 }, { "epoch": 0.8622836841502082, "grad_norm": 0.37345796823501587, "learning_rate": 9.94337189066321e-06, "loss": 0.6485, "step": 5643 }, { "epoch": 0.8624364900485159, "grad_norm": 0.2520149052143097, "learning_rate": 9.921684521407004e-06, "loss": 0.5862, "step": 5644 }, { "epoch": 0.8625892959468235, "grad_norm": 0.36799752712249756, "learning_rate": 9.900019594406984e-06, "loss": 0.6129, "step": 5645 }, { "epoch": 0.8627421018451312, "grad_norm": 0.30241623520851135, "learning_rate": 9.878377115060755e-06, "loss": 0.8082, "step": 5646 }, { "epoch": 0.8628949077434389, "grad_norm": 0.309946745634079, "learning_rate": 9.85675708876035e-06, "loss": 0.5611, "step": 5647 }, { "epoch": 0.8630477136417466, "grad_norm": 0.27384522557258606, "learning_rate": 9.835159520892235e-06, "loss": 0.5449, "step": 5648 }, { "epoch": 0.8632005195400543, "grad_norm": 0.32065561413764954, "learning_rate": 9.813584416837273e-06, "loss": 0.7328, "step": 5649 }, { "epoch": 0.863353325438362, "grad_norm": 0.31820061802864075, "learning_rate": 9.79203178197069e-06, "loss": 0.7254, "step": 5650 }, { "epoch": 0.8635061313366696, "grad_norm": 0.37443435192108154, "learning_rate": 9.770501621662176e-06, "loss": 0.6916, "step": 5651 }, { "epoch": 0.8636589372349772, "grad_norm": 0.5556673407554626, "learning_rate": 9.748993941275775e-06, "loss": 0.716, "step": 5652 }, { "epoch": 0.8638117431332849, "grad_norm": 0.30924415588378906, "learning_rate": 9.727508746169934e-06, "loss": 0.7234, "step": 5653 }, { "epoch": 0.8639645490315926, "grad_norm": 0.2775317430496216, "learning_rate": 9.706046041697513e-06, "loss": 0.4973, "step": 5654 }, { "epoch": 0.8641173549299003, "grad_norm": 0.26694798469543457, "learning_rate": 9.684605833205796e-06, "loss": 0.6978, "step": 5655 }, { "epoch": 0.864270160828208, "grad_norm": 0.2846663296222687, "learning_rate": 9.663188126036393e-06, "loss": 0.8492, "step": 5656 }, { "epoch": 0.8644229667265156, "grad_norm": 0.442914217710495, "learning_rate": 9.64179292552535e-06, "loss": 0.8636, "step": 5657 }, { "epoch": 0.8645757726248233, "grad_norm": 0.4035290777683258, "learning_rate": 9.620420237003114e-06, "loss": 0.6299, "step": 5658 }, { "epoch": 0.864728578523131, "grad_norm": 0.26495277881622314, "learning_rate": 9.599070065794525e-06, "loss": 0.7732, "step": 5659 }, { "epoch": 0.8648813844214387, "grad_norm": 0.30105409026145935, "learning_rate": 9.577742417218782e-06, "loss": 0.6551, "step": 5660 }, { "epoch": 0.8650341903197464, "grad_norm": 0.2992299199104309, "learning_rate": 9.55643729658946e-06, "loss": 0.7077, "step": 5661 }, { "epoch": 0.8651869962180541, "grad_norm": 0.3086511492729187, "learning_rate": 9.535154709214589e-06, "loss": 0.6582, "step": 5662 }, { "epoch": 0.8653398021163617, "grad_norm": 0.30483368039131165, "learning_rate": 9.51389466039656e-06, "loss": 0.7493, "step": 5663 }, { "epoch": 0.8654926080146693, "grad_norm": 0.2668604850769043, "learning_rate": 9.492657155432105e-06, "loss": 0.7509, "step": 5664 }, { "epoch": 0.865645413912977, "grad_norm": 0.2963505685329437, "learning_rate": 9.471442199612367e-06, "loss": 0.7943, "step": 5665 }, { "epoch": 0.8657982198112847, "grad_norm": 0.31135594844818115, "learning_rate": 9.45024979822291e-06, "loss": 0.7238, "step": 5666 }, { "epoch": 0.8659510257095924, "grad_norm": 0.3372398018836975, "learning_rate": 9.429079956543596e-06, "loss": 0.7797, "step": 5667 }, { "epoch": 0.8661038316079, "grad_norm": 0.3138117492198944, "learning_rate": 9.407932679848751e-06, "loss": 0.6864, "step": 5668 }, { "epoch": 0.8662566375062077, "grad_norm": 0.27608153223991394, "learning_rate": 9.386807973407007e-06, "loss": 0.709, "step": 5669 }, { "epoch": 0.8664094434045154, "grad_norm": 0.27074432373046875, "learning_rate": 9.365705842481454e-06, "loss": 0.6483, "step": 5670 }, { "epoch": 0.8665622493028231, "grad_norm": 0.28649184107780457, "learning_rate": 9.34462629232946e-06, "loss": 0.7414, "step": 5671 }, { "epoch": 0.8667150552011308, "grad_norm": 0.33052703738212585, "learning_rate": 9.323569328202853e-06, "loss": 0.7705, "step": 5672 }, { "epoch": 0.8668678610994385, "grad_norm": 0.32664650678634644, "learning_rate": 9.302534955347796e-06, "loss": 0.7885, "step": 5673 }, { "epoch": 0.8670206669977462, "grad_norm": 0.25715407729148865, "learning_rate": 9.281523179004803e-06, "loss": 0.6727, "step": 5674 }, { "epoch": 0.8671734728960538, "grad_norm": 0.27501925826072693, "learning_rate": 9.260534004408795e-06, "loss": 0.5609, "step": 5675 }, { "epoch": 0.8673262787943614, "grad_norm": 0.2649621367454529, "learning_rate": 9.239567436789053e-06, "loss": 0.757, "step": 5676 }, { "epoch": 0.8674790846926691, "grad_norm": 0.28358912467956543, "learning_rate": 9.218623481369249e-06, "loss": 0.6585, "step": 5677 }, { "epoch": 0.8676318905909768, "grad_norm": 0.330975741147995, "learning_rate": 9.197702143367327e-06, "loss": 0.7913, "step": 5678 }, { "epoch": 0.8677846964892845, "grad_norm": 0.2609249949455261, "learning_rate": 9.176803427995706e-06, "loss": 0.6313, "step": 5679 }, { "epoch": 0.8679375023875922, "grad_norm": 0.27459460496902466, "learning_rate": 9.155927340461112e-06, "loss": 0.6604, "step": 5680 }, { "epoch": 0.8680903082858998, "grad_norm": 0.2390647977590561, "learning_rate": 9.135073885964695e-06, "loss": 0.6527, "step": 5681 }, { "epoch": 0.8682431141842075, "grad_norm": 0.28790050745010376, "learning_rate": 9.114243069701844e-06, "loss": 0.8257, "step": 5682 }, { "epoch": 0.8683959200825152, "grad_norm": 0.3109422028064728, "learning_rate": 9.093434896862408e-06, "loss": 0.4674, "step": 5683 }, { "epoch": 0.8685487259808229, "grad_norm": 0.30797600746154785, "learning_rate": 9.072649372630592e-06, "loss": 0.6488, "step": 5684 }, { "epoch": 0.8687015318791306, "grad_norm": 0.45591118931770325, "learning_rate": 9.051886502184903e-06, "loss": 0.6504, "step": 5685 }, { "epoch": 0.8688543377774383, "grad_norm": 0.2956595718860626, "learning_rate": 9.031146290698279e-06, "loss": 0.7148, "step": 5686 }, { "epoch": 0.8690071436757458, "grad_norm": 0.30931755900382996, "learning_rate": 9.010428743337906e-06, "loss": 0.7228, "step": 5687 }, { "epoch": 0.8691599495740535, "grad_norm": 0.27179425954818726, "learning_rate": 8.98973386526546e-06, "loss": 0.7834, "step": 5688 }, { "epoch": 0.8693127554723612, "grad_norm": 0.5840659737586975, "learning_rate": 8.969061661636824e-06, "loss": 0.6617, "step": 5689 }, { "epoch": 0.8694655613706689, "grad_norm": 0.3197341561317444, "learning_rate": 8.94841213760237e-06, "loss": 0.671, "step": 5690 }, { "epoch": 0.8696183672689766, "grad_norm": 0.29900041222572327, "learning_rate": 8.927785298306712e-06, "loss": 0.593, "step": 5691 }, { "epoch": 0.8697711731672843, "grad_norm": 0.31964540481567383, "learning_rate": 8.907181148888854e-06, "loss": 0.6972, "step": 5692 }, { "epoch": 0.8699239790655919, "grad_norm": 0.2736988067626953, "learning_rate": 8.886599694482155e-06, "loss": 0.735, "step": 5693 }, { "epoch": 0.8700767849638996, "grad_norm": 0.26970410346984863, "learning_rate": 8.866040940214338e-06, "loss": 0.6944, "step": 5694 }, { "epoch": 0.8702295908622073, "grad_norm": 0.30608078837394714, "learning_rate": 8.845504891207412e-06, "loss": 0.8553, "step": 5695 }, { "epoch": 0.870382396760515, "grad_norm": 0.3669775128364563, "learning_rate": 8.824991552577755e-06, "loss": 0.7968, "step": 5696 }, { "epoch": 0.8705352026588227, "grad_norm": 0.3871442675590515, "learning_rate": 8.80450092943611e-06, "loss": 0.8196, "step": 5697 }, { "epoch": 0.8706880085571304, "grad_norm": 0.2657804489135742, "learning_rate": 8.784033026887551e-06, "loss": 0.8516, "step": 5698 }, { "epoch": 0.8708408144554379, "grad_norm": 0.42050185799598694, "learning_rate": 8.763587850031484e-06, "loss": 0.7629, "step": 5699 }, { "epoch": 0.8709936203537456, "grad_norm": 0.3306422233581543, "learning_rate": 8.743165403961617e-06, "loss": 0.7971, "step": 5700 }, { "epoch": 0.8711464262520533, "grad_norm": 0.294408917427063, "learning_rate": 8.722765693766066e-06, "loss": 0.7246, "step": 5701 }, { "epoch": 0.871299232150361, "grad_norm": 0.3002341091632843, "learning_rate": 8.702388724527255e-06, "loss": 0.6067, "step": 5702 }, { "epoch": 0.8714520380486687, "grad_norm": 0.34488150477409363, "learning_rate": 8.682034501321912e-06, "loss": 0.7983, "step": 5703 }, { "epoch": 0.8716048439469763, "grad_norm": 0.3561033010482788, "learning_rate": 8.661703029221114e-06, "loss": 0.7648, "step": 5704 }, { "epoch": 0.871757649845284, "grad_norm": 0.30296874046325684, "learning_rate": 8.641394313290308e-06, "loss": 0.7938, "step": 5705 }, { "epoch": 0.8719104557435917, "grad_norm": 0.30546966195106506, "learning_rate": 8.621108358589202e-06, "loss": 0.7538, "step": 5706 }, { "epoch": 0.8720632616418994, "grad_norm": 0.3180548846721649, "learning_rate": 8.600845170171911e-06, "loss": 0.9197, "step": 5707 }, { "epoch": 0.8722160675402071, "grad_norm": 0.25248074531555176, "learning_rate": 8.580604753086807e-06, "loss": 0.5223, "step": 5708 }, { "epoch": 0.8723688734385148, "grad_norm": 0.272348016500473, "learning_rate": 8.560387112376645e-06, "loss": 0.657, "step": 5709 }, { "epoch": 0.8725216793368225, "grad_norm": 0.3801282048225403, "learning_rate": 8.54019225307845e-06, "loss": 0.731, "step": 5710 }, { "epoch": 0.87267448523513, "grad_norm": 0.613310694694519, "learning_rate": 8.52002018022362e-06, "loss": 0.6538, "step": 5711 }, { "epoch": 0.8728272911334377, "grad_norm": 0.2751515805721283, "learning_rate": 8.49987089883788e-06, "loss": 0.4931, "step": 5712 }, { "epoch": 0.8729800970317454, "grad_norm": 0.32077914476394653, "learning_rate": 8.479744413941215e-06, "loss": 0.7055, "step": 5713 }, { "epoch": 0.8731329029300531, "grad_norm": 0.3849603831768036, "learning_rate": 8.459640730547979e-06, "loss": 0.6672, "step": 5714 }, { "epoch": 0.8732857088283608, "grad_norm": 0.39358845353126526, "learning_rate": 8.439559853666846e-06, "loss": 0.6651, "step": 5715 }, { "epoch": 0.8734385147266684, "grad_norm": 0.47464704513549805, "learning_rate": 8.41950178830081e-06, "loss": 0.6592, "step": 5716 }, { "epoch": 0.8735913206249761, "grad_norm": 0.26846784353256226, "learning_rate": 8.399466539447154e-06, "loss": 0.6931, "step": 5717 }, { "epoch": 0.8737441265232838, "grad_norm": 0.2942967414855957, "learning_rate": 8.379454112097473e-06, "loss": 0.6378, "step": 5718 }, { "epoch": 0.8738969324215915, "grad_norm": 0.2899715304374695, "learning_rate": 8.359464511237713e-06, "loss": 0.8007, "step": 5719 }, { "epoch": 0.8740497383198992, "grad_norm": 0.26803848147392273, "learning_rate": 8.339497741848146e-06, "loss": 0.6989, "step": 5720 }, { "epoch": 0.8742025442182069, "grad_norm": 0.34943777322769165, "learning_rate": 8.31955380890329e-06, "loss": 0.7332, "step": 5721 }, { "epoch": 0.8743553501165146, "grad_norm": 0.3253035843372345, "learning_rate": 8.299632717371997e-06, "loss": 0.4714, "step": 5722 }, { "epoch": 0.8745081560148221, "grad_norm": 0.24931125342845917, "learning_rate": 8.279734472217471e-06, "loss": 0.7059, "step": 5723 }, { "epoch": 0.8746609619131298, "grad_norm": 0.2940715253353119, "learning_rate": 8.259859078397158e-06, "loss": 0.7114, "step": 5724 }, { "epoch": 0.8748137678114375, "grad_norm": 0.2790750563144684, "learning_rate": 8.240006540862887e-06, "loss": 0.9123, "step": 5725 }, { "epoch": 0.8749665737097452, "grad_norm": 0.5216521620750427, "learning_rate": 8.220176864560724e-06, "loss": 0.6532, "step": 5726 }, { "epoch": 0.8751193796080529, "grad_norm": 0.3397212326526642, "learning_rate": 8.200370054431072e-06, "loss": 0.6039, "step": 5727 }, { "epoch": 0.8752721855063605, "grad_norm": 0.3261764943599701, "learning_rate": 8.180586115408628e-06, "loss": 0.6529, "step": 5728 }, { "epoch": 0.8754249914046682, "grad_norm": 0.27617859840393066, "learning_rate": 8.160825052422417e-06, "loss": 0.5977, "step": 5729 }, { "epoch": 0.8755777973029759, "grad_norm": 0.24808910489082336, "learning_rate": 8.14108687039572e-06, "loss": 0.7398, "step": 5730 }, { "epoch": 0.8757306032012836, "grad_norm": 0.32295671105384827, "learning_rate": 8.121371574246128e-06, "loss": 0.7373, "step": 5731 }, { "epoch": 0.8758834090995913, "grad_norm": 0.4352364242076874, "learning_rate": 8.101679168885546e-06, "loss": 0.732, "step": 5732 }, { "epoch": 0.876036214997899, "grad_norm": 0.291965126991272, "learning_rate": 8.082009659220213e-06, "loss": 0.6596, "step": 5733 }, { "epoch": 0.8761890208962065, "grad_norm": 0.27537909150123596, "learning_rate": 8.06236305015059e-06, "loss": 0.7323, "step": 5734 }, { "epoch": 0.8763418267945142, "grad_norm": 0.2971029281616211, "learning_rate": 8.042739346571437e-06, "loss": 0.5912, "step": 5735 }, { "epoch": 0.8764946326928219, "grad_norm": 0.3006308674812317, "learning_rate": 8.023138553371878e-06, "loss": 0.7751, "step": 5736 }, { "epoch": 0.8766474385911296, "grad_norm": 0.3444878160953522, "learning_rate": 8.003560675435285e-06, "loss": 0.813, "step": 5737 }, { "epoch": 0.8768002444894373, "grad_norm": 0.28715288639068604, "learning_rate": 7.984005717639309e-06, "loss": 0.6781, "step": 5738 }, { "epoch": 0.876953050387745, "grad_norm": 0.25705134868621826, "learning_rate": 7.964473684855888e-06, "loss": 0.7919, "step": 5739 }, { "epoch": 0.8771058562860526, "grad_norm": 0.3298662304878235, "learning_rate": 7.944964581951275e-06, "loss": 0.6425, "step": 5740 }, { "epoch": 0.8772586621843603, "grad_norm": 0.27563560009002686, "learning_rate": 7.925478413786026e-06, "loss": 0.9351, "step": 5741 }, { "epoch": 0.877411468082668, "grad_norm": 0.29507142305374146, "learning_rate": 7.906015185214933e-06, "loss": 0.6913, "step": 5742 }, { "epoch": 0.8775642739809757, "grad_norm": 0.3378017246723175, "learning_rate": 7.886574901087074e-06, "loss": 0.6415, "step": 5743 }, { "epoch": 0.8777170798792834, "grad_norm": 0.36379972100257874, "learning_rate": 7.867157566245874e-06, "loss": 0.6531, "step": 5744 }, { "epoch": 0.8778698857775911, "grad_norm": 0.2869352102279663, "learning_rate": 7.847763185528967e-06, "loss": 0.5903, "step": 5745 }, { "epoch": 0.8780226916758986, "grad_norm": 0.26820555329322815, "learning_rate": 7.828391763768317e-06, "loss": 0.7291, "step": 5746 }, { "epoch": 0.8781754975742063, "grad_norm": 0.26429784297943115, "learning_rate": 7.809043305790165e-06, "loss": 0.724, "step": 5747 }, { "epoch": 0.878328303472514, "grad_norm": 0.29767340421676636, "learning_rate": 7.789717816414999e-06, "loss": 0.7074, "step": 5748 }, { "epoch": 0.8784811093708217, "grad_norm": 0.3220751881599426, "learning_rate": 7.77041530045759e-06, "loss": 0.6294, "step": 5749 }, { "epoch": 0.8786339152691294, "grad_norm": 0.2906290292739868, "learning_rate": 7.751135762727003e-06, "loss": 0.661, "step": 5750 }, { "epoch": 0.8787867211674371, "grad_norm": 0.37977883219718933, "learning_rate": 7.731879208026605e-06, "loss": 0.8768, "step": 5751 }, { "epoch": 0.8789395270657447, "grad_norm": 0.2759582996368408, "learning_rate": 7.71264564115397e-06, "loss": 0.652, "step": 5752 }, { "epoch": 0.8790923329640524, "grad_norm": 0.37194663286209106, "learning_rate": 7.693435066900989e-06, "loss": 0.4973, "step": 5753 }, { "epoch": 0.8792451388623601, "grad_norm": 0.2985452115535736, "learning_rate": 7.674247490053809e-06, "loss": 0.5931, "step": 5754 }, { "epoch": 0.8793979447606678, "grad_norm": 0.28206849098205566, "learning_rate": 7.655082915392887e-06, "loss": 0.6726, "step": 5755 }, { "epoch": 0.8795507506589755, "grad_norm": 0.35902372002601624, "learning_rate": 7.635941347692876e-06, "loss": 0.8268, "step": 5756 }, { "epoch": 0.8797035565572832, "grad_norm": 0.35157230496406555, "learning_rate": 7.616822791722744e-06, "loss": 0.7652, "step": 5757 }, { "epoch": 0.8798563624555907, "grad_norm": 0.28466176986694336, "learning_rate": 7.597727252245723e-06, "loss": 0.6142, "step": 5758 }, { "epoch": 0.8800091683538984, "grad_norm": 0.2658984661102295, "learning_rate": 7.5786547340193304e-06, "loss": 0.5487, "step": 5759 }, { "epoch": 0.8801619742522061, "grad_norm": 0.2549937069416046, "learning_rate": 7.559605241795309e-06, "loss": 0.477, "step": 5760 }, { "epoch": 0.8803147801505138, "grad_norm": 0.30298152565956116, "learning_rate": 7.5405787803196516e-06, "loss": 0.5568, "step": 5761 }, { "epoch": 0.8804675860488215, "grad_norm": 0.26043254137039185, "learning_rate": 7.5215753543326776e-06, "loss": 0.7528, "step": 5762 }, { "epoch": 0.8806203919471292, "grad_norm": 0.309759259223938, "learning_rate": 7.50259496856891e-06, "loss": 0.6305, "step": 5763 }, { "epoch": 0.8807731978454368, "grad_norm": 0.7723713517189026, "learning_rate": 7.483637627757167e-06, "loss": 0.7753, "step": 5764 }, { "epoch": 0.8809260037437445, "grad_norm": 0.30090397596359253, "learning_rate": 7.464703336620493e-06, "loss": 0.6874, "step": 5765 }, { "epoch": 0.8810788096420522, "grad_norm": 0.26742488145828247, "learning_rate": 7.445792099876236e-06, "loss": 0.7572, "step": 5766 }, { "epoch": 0.8812316155403599, "grad_norm": 0.3087722659111023, "learning_rate": 7.426903922235939e-06, "loss": 0.654, "step": 5767 }, { "epoch": 0.8813844214386676, "grad_norm": 0.5429712533950806, "learning_rate": 7.40803880840546e-06, "loss": 0.6168, "step": 5768 }, { "epoch": 0.8815372273369753, "grad_norm": 0.2850368022918701, "learning_rate": 7.3891967630848716e-06, "loss": 0.9643, "step": 5769 }, { "epoch": 0.8816900332352828, "grad_norm": 0.2823317050933838, "learning_rate": 7.3703777909684965e-06, "loss": 0.6667, "step": 5770 }, { "epoch": 0.8818428391335905, "grad_norm": 0.2532433271408081, "learning_rate": 7.351581896744941e-06, "loss": 0.5592, "step": 5771 }, { "epoch": 0.8819956450318982, "grad_norm": 0.2902817130088806, "learning_rate": 7.332809085097047e-06, "loss": 0.5788, "step": 5772 }, { "epoch": 0.8821484509302059, "grad_norm": 0.30547183752059937, "learning_rate": 7.31405936070193e-06, "loss": 0.665, "step": 5773 }, { "epoch": 0.8823012568285136, "grad_norm": 0.28098851442337036, "learning_rate": 7.2953327282308525e-06, "loss": 0.6169, "step": 5774 }, { "epoch": 0.8824540627268213, "grad_norm": 0.26863008737564087, "learning_rate": 7.276629192349449e-06, "loss": 0.6753, "step": 5775 }, { "epoch": 0.8826068686251289, "grad_norm": 0.3257790207862854, "learning_rate": 7.257948757717559e-06, "loss": 0.5925, "step": 5776 }, { "epoch": 0.8827596745234366, "grad_norm": 0.273481547832489, "learning_rate": 7.239291428989214e-06, "loss": 0.9063, "step": 5777 }, { "epoch": 0.8829124804217443, "grad_norm": 0.27130621671676636, "learning_rate": 7.220657210812775e-06, "loss": 0.7669, "step": 5778 }, { "epoch": 0.883065286320052, "grad_norm": 0.4380197823047638, "learning_rate": 7.202046107830762e-06, "loss": 0.6786, "step": 5779 }, { "epoch": 0.8832180922183597, "grad_norm": 0.2692827582359314, "learning_rate": 7.183458124679998e-06, "loss": 0.6413, "step": 5780 }, { "epoch": 0.8833708981166674, "grad_norm": 0.2981157898902893, "learning_rate": 7.1648932659915124e-06, "loss": 0.5292, "step": 5781 }, { "epoch": 0.8835237040149749, "grad_norm": 0.2957375645637512, "learning_rate": 7.146351536390605e-06, "loss": 0.7469, "step": 5782 }, { "epoch": 0.8836765099132826, "grad_norm": 0.3381440341472626, "learning_rate": 7.127832940496771e-06, "loss": 0.906, "step": 5783 }, { "epoch": 0.8838293158115903, "grad_norm": 0.30220258235931396, "learning_rate": 7.1093374829237615e-06, "loss": 0.9091, "step": 5784 }, { "epoch": 0.883982121709898, "grad_norm": 0.24618121981620789, "learning_rate": 7.090865168279559e-06, "loss": 0.6997, "step": 5785 }, { "epoch": 0.8841349276082057, "grad_norm": 0.22514882683753967, "learning_rate": 7.072416001166426e-06, "loss": 0.529, "step": 5786 }, { "epoch": 0.8842877335065134, "grad_norm": 0.27179720997810364, "learning_rate": 7.053989986180776e-06, "loss": 0.7022, "step": 5787 }, { "epoch": 0.884440539404821, "grad_norm": 0.32559579610824585, "learning_rate": 7.035587127913301e-06, "loss": 0.8342, "step": 5788 }, { "epoch": 0.8845933453031287, "grad_norm": 0.25409793853759766, "learning_rate": 7.017207430948936e-06, "loss": 0.658, "step": 5789 }, { "epoch": 0.8847461512014364, "grad_norm": 0.2802891731262207, "learning_rate": 6.998850899866827e-06, "loss": 0.6106, "step": 5790 }, { "epoch": 0.8848989570997441, "grad_norm": 0.2987188994884491, "learning_rate": 6.9805175392403385e-06, "loss": 0.7882, "step": 5791 }, { "epoch": 0.8850517629980518, "grad_norm": 0.26601770520210266, "learning_rate": 6.962207353637063e-06, "loss": 0.5598, "step": 5792 }, { "epoch": 0.8852045688963593, "grad_norm": 0.2836983799934387, "learning_rate": 6.943920347618849e-06, "loss": 0.6301, "step": 5793 }, { "epoch": 0.885357374794667, "grad_norm": 0.3127320110797882, "learning_rate": 6.925656525741753e-06, "loss": 0.6502, "step": 5794 }, { "epoch": 0.8855101806929747, "grad_norm": 0.29738175868988037, "learning_rate": 6.907415892556046e-06, "loss": 0.763, "step": 5795 }, { "epoch": 0.8856629865912824, "grad_norm": 0.27413830161094666, "learning_rate": 6.8891984526062155e-06, "loss": 0.8139, "step": 5796 }, { "epoch": 0.8858157924895901, "grad_norm": 0.29911452531814575, "learning_rate": 6.871004210430998e-06, "loss": 0.7271, "step": 5797 }, { "epoch": 0.8859685983878978, "grad_norm": 0.2909379303455353, "learning_rate": 6.852833170563344e-06, "loss": 0.6116, "step": 5798 }, { "epoch": 0.8861214042862054, "grad_norm": 0.2798643708229065, "learning_rate": 6.834685337530411e-06, "loss": 0.6803, "step": 5799 }, { "epoch": 0.8862742101845131, "grad_norm": 0.33216592669487, "learning_rate": 6.816560715853548e-06, "loss": 0.8831, "step": 5800 }, { "epoch": 0.8864270160828208, "grad_norm": 0.2739565074443817, "learning_rate": 6.7984593100484e-06, "loss": 0.7547, "step": 5801 }, { "epoch": 0.8865798219811285, "grad_norm": 0.27872902154922485, "learning_rate": 6.780381124624746e-06, "loss": 0.71, "step": 5802 }, { "epoch": 0.8867326278794362, "grad_norm": 0.30003616213798523, "learning_rate": 6.7623261640866185e-06, "loss": 0.7366, "step": 5803 }, { "epoch": 0.8868854337777439, "grad_norm": 0.36598697304725647, "learning_rate": 6.744294432932296e-06, "loss": 0.7659, "step": 5804 }, { "epoch": 0.8870382396760514, "grad_norm": 0.2758978307247162, "learning_rate": 6.726285935654197e-06, "loss": 0.5918, "step": 5805 }, { "epoch": 0.8871910455743591, "grad_norm": 0.2967734634876251, "learning_rate": 6.708300676738977e-06, "loss": 0.7183, "step": 5806 }, { "epoch": 0.8873438514726668, "grad_norm": 0.3790004551410675, "learning_rate": 6.690338660667527e-06, "loss": 0.5838, "step": 5807 }, { "epoch": 0.8874966573709745, "grad_norm": 0.27603670954704285, "learning_rate": 6.672399891914982e-06, "loss": 0.741, "step": 5808 }, { "epoch": 0.8876494632692822, "grad_norm": 0.2567933201789856, "learning_rate": 6.654484374950543e-06, "loss": 0.6312, "step": 5809 }, { "epoch": 0.8878022691675899, "grad_norm": 0.3038698732852936, "learning_rate": 6.6365921142377606e-06, "loss": 0.8309, "step": 5810 }, { "epoch": 0.8879550750658975, "grad_norm": 0.29641303420066833, "learning_rate": 6.618723114234337e-06, "loss": 0.7406, "step": 5811 }, { "epoch": 0.8881078809642052, "grad_norm": 0.28365176916122437, "learning_rate": 6.600877379392212e-06, "loss": 0.5779, "step": 5812 }, { "epoch": 0.8882606868625129, "grad_norm": 0.32412979006767273, "learning_rate": 6.58305491415746e-06, "loss": 0.7036, "step": 5813 }, { "epoch": 0.8884134927608206, "grad_norm": 0.2406160682439804, "learning_rate": 6.565255722970398e-06, "loss": 0.6547, "step": 5814 }, { "epoch": 0.8885662986591283, "grad_norm": 0.3018895983695984, "learning_rate": 6.547479810265578e-06, "loss": 0.7239, "step": 5815 }, { "epoch": 0.888719104557436, "grad_norm": 0.30829906463623047, "learning_rate": 6.529727180471679e-06, "loss": 0.6411, "step": 5816 }, { "epoch": 0.8888719104557435, "grad_norm": 0.4289684295654297, "learning_rate": 6.511997838011663e-06, "loss": 0.8104, "step": 5817 }, { "epoch": 0.8890247163540512, "grad_norm": 0.29046669602394104, "learning_rate": 6.494291787302609e-06, "loss": 0.7391, "step": 5818 }, { "epoch": 0.8891775222523589, "grad_norm": 0.25001123547554016, "learning_rate": 6.476609032755854e-06, "loss": 0.6197, "step": 5819 }, { "epoch": 0.8893303281506666, "grad_norm": 0.4297448694705963, "learning_rate": 6.458949578776885e-06, "loss": 0.8025, "step": 5820 }, { "epoch": 0.8894831340489743, "grad_norm": 0.4375990927219391, "learning_rate": 6.44131342976545e-06, "loss": 0.7262, "step": 5821 }, { "epoch": 0.889635939947282, "grad_norm": 0.3232177793979645, "learning_rate": 6.4237005901154114e-06, "loss": 0.716, "step": 5822 }, { "epoch": 0.8897887458455896, "grad_norm": 0.26701030135154724, "learning_rate": 6.406111064214848e-06, "loss": 0.7209, "step": 5823 }, { "epoch": 0.8899415517438973, "grad_norm": 0.24602992832660675, "learning_rate": 6.388544856446066e-06, "loss": 0.6626, "step": 5824 }, { "epoch": 0.890094357642205, "grad_norm": 0.3176684081554413, "learning_rate": 6.371001971185553e-06, "loss": 0.7256, "step": 5825 }, { "epoch": 0.8902471635405127, "grad_norm": 0.2829485535621643, "learning_rate": 6.3534824128039575e-06, "loss": 0.7708, "step": 5826 }, { "epoch": 0.8903999694388204, "grad_norm": 0.34358328580856323, "learning_rate": 6.335986185666109e-06, "loss": 1.0291, "step": 5827 }, { "epoch": 0.8905527753371281, "grad_norm": 0.2644045054912567, "learning_rate": 6.318513294131067e-06, "loss": 0.7311, "step": 5828 }, { "epoch": 0.8907055812354356, "grad_norm": 0.3157048523426056, "learning_rate": 6.30106374255206e-06, "loss": 0.6677, "step": 5829 }, { "epoch": 0.8908583871337433, "grad_norm": 0.31687742471694946, "learning_rate": 6.283637535276498e-06, "loss": 0.6844, "step": 5830 }, { "epoch": 0.891011193032051, "grad_norm": 0.2959498465061188, "learning_rate": 6.266234676645943e-06, "loss": 0.692, "step": 5831 }, { "epoch": 0.8911639989303587, "grad_norm": 0.4458772838115692, "learning_rate": 6.248855170996204e-06, "loss": 0.8595, "step": 5832 }, { "epoch": 0.8913168048286664, "grad_norm": 0.27008432149887085, "learning_rate": 6.231499022657239e-06, "loss": 0.6832, "step": 5833 }, { "epoch": 0.8914696107269741, "grad_norm": 1.1612355709075928, "learning_rate": 6.214166235953178e-06, "loss": 0.906, "step": 5834 }, { "epoch": 0.8916224166252817, "grad_norm": 0.2569592297077179, "learning_rate": 6.196856815202323e-06, "loss": 0.7152, "step": 5835 }, { "epoch": 0.8917752225235894, "grad_norm": 0.3061888813972473, "learning_rate": 6.17957076471718e-06, "loss": 0.704, "step": 5836 }, { "epoch": 0.8919280284218971, "grad_norm": 0.2537577748298645, "learning_rate": 6.1623080888044475e-06, "loss": 0.6497, "step": 5837 }, { "epoch": 0.8920808343202048, "grad_norm": 0.3566432595252991, "learning_rate": 6.145068791764952e-06, "loss": 0.7981, "step": 5838 }, { "epoch": 0.8922336402185125, "grad_norm": 0.299544095993042, "learning_rate": 6.127852877893736e-06, "loss": 0.8284, "step": 5839 }, { "epoch": 0.8923864461168202, "grad_norm": 0.3752456605434418, "learning_rate": 6.11066035147998e-06, "loss": 0.5426, "step": 5840 }, { "epoch": 0.8925392520151277, "grad_norm": 0.2913227677345276, "learning_rate": 6.093491216807068e-06, "loss": 0.7273, "step": 5841 }, { "epoch": 0.8926920579134354, "grad_norm": 0.26798391342163086, "learning_rate": 6.076345478152534e-06, "loss": 0.7619, "step": 5842 }, { "epoch": 0.8928448638117431, "grad_norm": 0.278884619474411, "learning_rate": 6.059223139788128e-06, "loss": 0.7944, "step": 5843 }, { "epoch": 0.8929976697100508, "grad_norm": 0.3024114966392517, "learning_rate": 6.042124205979704e-06, "loss": 0.8972, "step": 5844 }, { "epoch": 0.8931504756083585, "grad_norm": 0.318230003118515, "learning_rate": 6.025048680987322e-06, "loss": 0.6585, "step": 5845 }, { "epoch": 0.8933032815066662, "grad_norm": 0.27753233909606934, "learning_rate": 6.007996569065222e-06, "loss": 0.7879, "step": 5846 }, { "epoch": 0.8934560874049738, "grad_norm": 0.30329933762550354, "learning_rate": 5.990967874461784e-06, "loss": 0.6417, "step": 5847 }, { "epoch": 0.8936088933032815, "grad_norm": 0.3251444101333618, "learning_rate": 5.973962601419569e-06, "loss": 0.8238, "step": 5848 }, { "epoch": 0.8937616992015892, "grad_norm": 0.3880283236503601, "learning_rate": 5.956980754175289e-06, "loss": 0.6176, "step": 5849 }, { "epoch": 0.8939145050998969, "grad_norm": 0.2770722508430481, "learning_rate": 5.940022336959828e-06, "loss": 0.6808, "step": 5850 }, { "epoch": 0.8940673109982046, "grad_norm": 0.2863628566265106, "learning_rate": 5.923087353998246e-06, "loss": 0.8039, "step": 5851 }, { "epoch": 0.8942201168965122, "grad_norm": 0.2710552513599396, "learning_rate": 5.9061758095097505e-06, "loss": 0.6318, "step": 5852 }, { "epoch": 0.8943729227948198, "grad_norm": 0.2887222170829773, "learning_rate": 5.889287707707702e-06, "loss": 0.6414, "step": 5853 }, { "epoch": 0.8945257286931275, "grad_norm": 0.27116432785987854, "learning_rate": 5.872423052799636e-06, "loss": 0.5471, "step": 5854 }, { "epoch": 0.8946785345914352, "grad_norm": 0.2812448740005493, "learning_rate": 5.855581848987224e-06, "loss": 0.3823, "step": 5855 }, { "epoch": 0.8948313404897429, "grad_norm": 0.4182233512401581, "learning_rate": 5.838764100466343e-06, "loss": 0.6723, "step": 5856 }, { "epoch": 0.8949841463880506, "grad_norm": 0.3221946358680725, "learning_rate": 5.821969811426953e-06, "loss": 0.7409, "step": 5857 }, { "epoch": 0.8951369522863583, "grad_norm": 0.2302420288324356, "learning_rate": 5.80519898605324e-06, "loss": 0.701, "step": 5858 }, { "epoch": 0.8952897581846659, "grad_norm": 0.33879178762435913, "learning_rate": 5.788451628523505e-06, "loss": 0.5567, "step": 5859 }, { "epoch": 0.8954425640829736, "grad_norm": 0.28119370341300964, "learning_rate": 5.771727743010213e-06, "loss": 0.8658, "step": 5860 }, { "epoch": 0.8955953699812813, "grad_norm": 0.2939225733280182, "learning_rate": 5.755027333679974e-06, "loss": 0.6298, "step": 5861 }, { "epoch": 0.895748175879589, "grad_norm": 0.27552029490470886, "learning_rate": 5.738350404693571e-06, "loss": 0.74, "step": 5862 }, { "epoch": 0.8959009817778967, "grad_norm": 0.4174579381942749, "learning_rate": 5.7216969602058915e-06, "loss": 0.6768, "step": 5863 }, { "epoch": 0.8960537876762042, "grad_norm": 0.37528759241104126, "learning_rate": 5.705067004366027e-06, "loss": 0.8695, "step": 5864 }, { "epoch": 0.8962065935745119, "grad_norm": 0.27183693647384644, "learning_rate": 5.6884605413172085e-06, "loss": 0.8489, "step": 5865 }, { "epoch": 0.8963593994728196, "grad_norm": 0.27479737997055054, "learning_rate": 5.671877575196749e-06, "loss": 0.5745, "step": 5866 }, { "epoch": 0.8965122053711273, "grad_norm": 0.6669013500213623, "learning_rate": 5.655318110136165e-06, "loss": 0.5943, "step": 5867 }, { "epoch": 0.896665011269435, "grad_norm": 0.2586415410041809, "learning_rate": 5.638782150261135e-06, "loss": 0.5973, "step": 5868 }, { "epoch": 0.8968178171677427, "grad_norm": 0.2983604073524475, "learning_rate": 5.6222696996914625e-06, "loss": 0.6632, "step": 5869 }, { "epoch": 0.8969706230660504, "grad_norm": 0.4398852586746216, "learning_rate": 5.605780762541036e-06, "loss": 0.9462, "step": 5870 }, { "epoch": 0.897123428964358, "grad_norm": 0.26358821988105774, "learning_rate": 5.589315342917967e-06, "loss": 0.715, "step": 5871 }, { "epoch": 0.8972762348626657, "grad_norm": 0.3196551501750946, "learning_rate": 5.572873444924487e-06, "loss": 0.7284, "step": 5872 }, { "epoch": 0.8974290407609734, "grad_norm": 0.3146829307079315, "learning_rate": 5.55645507265693e-06, "loss": 0.73, "step": 5873 }, { "epoch": 0.8975818466592811, "grad_norm": 0.31993281841278076, "learning_rate": 5.5400602302058236e-06, "loss": 0.6271, "step": 5874 }, { "epoch": 0.8977346525575888, "grad_norm": 0.5897267460823059, "learning_rate": 5.523688921655779e-06, "loss": 0.8675, "step": 5875 }, { "epoch": 0.8978874584558963, "grad_norm": 0.31031474471092224, "learning_rate": 5.507341151085599e-06, "loss": 0.7015, "step": 5876 }, { "epoch": 0.898040264354204, "grad_norm": 0.22663110494613647, "learning_rate": 5.49101692256816e-06, "loss": 0.7444, "step": 5877 }, { "epoch": 0.8981930702525117, "grad_norm": 0.33932214975357056, "learning_rate": 5.47471624017053e-06, "loss": 0.6108, "step": 5878 }, { "epoch": 0.8983458761508194, "grad_norm": 0.26433759927749634, "learning_rate": 5.458439107953894e-06, "loss": 0.6342, "step": 5879 }, { "epoch": 0.8984986820491271, "grad_norm": 0.2902994751930237, "learning_rate": 5.44218552997352e-06, "loss": 0.6325, "step": 5880 }, { "epoch": 0.8986514879474348, "grad_norm": 0.2531551718711853, "learning_rate": 5.425955510278891e-06, "loss": 0.6177, "step": 5881 }, { "epoch": 0.8988042938457425, "grad_norm": 0.293772429227829, "learning_rate": 5.409749052913582e-06, "loss": 0.6913, "step": 5882 }, { "epoch": 0.8989570997440501, "grad_norm": 0.33307167887687683, "learning_rate": 5.393566161915276e-06, "loss": 0.7166, "step": 5883 }, { "epoch": 0.8991099056423578, "grad_norm": 0.2943355441093445, "learning_rate": 5.377406841315802e-06, "loss": 0.7835, "step": 5884 }, { "epoch": 0.8992627115406655, "grad_norm": 0.3272358477115631, "learning_rate": 5.36127109514113e-06, "loss": 0.7343, "step": 5885 }, { "epoch": 0.8994155174389732, "grad_norm": 0.29728636145591736, "learning_rate": 5.345158927411354e-06, "loss": 0.8396, "step": 5886 }, { "epoch": 0.8995683233372809, "grad_norm": 0.2658395767211914, "learning_rate": 5.329070342140685e-06, "loss": 0.8047, "step": 5887 }, { "epoch": 0.8997211292355884, "grad_norm": 0.3411000967025757, "learning_rate": 5.313005343337429e-06, "loss": 0.8666, "step": 5888 }, { "epoch": 0.8998739351338961, "grad_norm": 0.3442920744419098, "learning_rate": 5.296963935004062e-06, "loss": 0.5833, "step": 5889 }, { "epoch": 0.9000267410322038, "grad_norm": 0.24363353848457336, "learning_rate": 5.280946121137187e-06, "loss": 0.5621, "step": 5890 }, { "epoch": 0.9001795469305115, "grad_norm": 0.31754767894744873, "learning_rate": 5.2649519057274886e-06, "loss": 0.6101, "step": 5891 }, { "epoch": 0.9003323528288192, "grad_norm": 0.26689276099205017, "learning_rate": 5.2489812927597915e-06, "loss": 0.716, "step": 5892 }, { "epoch": 0.9004851587271269, "grad_norm": 0.25092315673828125, "learning_rate": 5.2330342862130455e-06, "loss": 0.7321, "step": 5893 }, { "epoch": 0.9006379646254346, "grad_norm": 0.318852037191391, "learning_rate": 5.217110890060295e-06, "loss": 0.724, "step": 5894 }, { "epoch": 0.9007907705237422, "grad_norm": 0.28129202127456665, "learning_rate": 5.201211108268755e-06, "loss": 0.6249, "step": 5895 }, { "epoch": 0.9009435764220499, "grad_norm": 0.31783658266067505, "learning_rate": 5.185334944799691e-06, "loss": 0.689, "step": 5896 }, { "epoch": 0.9010963823203576, "grad_norm": 0.27820682525634766, "learning_rate": 5.169482403608528e-06, "loss": 0.6808, "step": 5897 }, { "epoch": 0.9012491882186653, "grad_norm": 0.2727433443069458, "learning_rate": 5.153653488644794e-06, "loss": 0.8703, "step": 5898 }, { "epoch": 0.9014019941169729, "grad_norm": 0.2737170457839966, "learning_rate": 5.137848203852125e-06, "loss": 0.6923, "step": 5899 }, { "epoch": 0.9015548000152805, "grad_norm": 0.24887260794639587, "learning_rate": 5.1220665531682925e-06, "loss": 0.7023, "step": 5900 }, { "epoch": 0.9017076059135882, "grad_norm": 0.300696462392807, "learning_rate": 5.106308540525162e-06, "loss": 0.6112, "step": 5901 }, { "epoch": 0.9018604118118959, "grad_norm": 0.3953166604042053, "learning_rate": 5.090574169848672e-06, "loss": 0.6249, "step": 5902 }, { "epoch": 0.9020132177102036, "grad_norm": 0.25591349601745605, "learning_rate": 5.07486344505893e-06, "loss": 0.5919, "step": 5903 }, { "epoch": 0.9021660236085113, "grad_norm": 0.23894433677196503, "learning_rate": 5.0591763700701625e-06, "loss": 0.816, "step": 5904 }, { "epoch": 0.902318829506819, "grad_norm": 0.2608323097229004, "learning_rate": 5.043512948790641e-06, "loss": 0.6331, "step": 5905 }, { "epoch": 0.9024716354051266, "grad_norm": 0.3080928325653076, "learning_rate": 5.027873185122767e-06, "loss": 0.7063, "step": 5906 }, { "epoch": 0.9026244413034343, "grad_norm": 0.45490792393684387, "learning_rate": 5.012257082963067e-06, "loss": 0.771, "step": 5907 }, { "epoch": 0.902777247201742, "grad_norm": 0.3092617690563202, "learning_rate": 4.996664646202176e-06, "loss": 0.6215, "step": 5908 }, { "epoch": 0.9029300531000497, "grad_norm": 0.2615375816822052, "learning_rate": 4.981095878724817e-06, "loss": 0.5978, "step": 5909 }, { "epoch": 0.9030828589983574, "grad_norm": 0.30354490876197815, "learning_rate": 4.965550784409789e-06, "loss": 0.643, "step": 5910 }, { "epoch": 0.903235664896665, "grad_norm": 0.3309997022151947, "learning_rate": 4.950029367130049e-06, "loss": 0.6145, "step": 5911 }, { "epoch": 0.9033884707949726, "grad_norm": 0.2688743472099304, "learning_rate": 4.934531630752615e-06, "loss": 0.769, "step": 5912 }, { "epoch": 0.9035412766932803, "grad_norm": 0.3107849657535553, "learning_rate": 4.919057579138631e-06, "loss": 0.8814, "step": 5913 }, { "epoch": 0.903694082591588, "grad_norm": 0.3090243637561798, "learning_rate": 4.903607216143303e-06, "loss": 0.7884, "step": 5914 }, { "epoch": 0.9038468884898957, "grad_norm": 0.35385674238204956, "learning_rate": 4.888180545615995e-06, "loss": 0.8163, "step": 5915 }, { "epoch": 0.9039996943882034, "grad_norm": 0.35798507928848267, "learning_rate": 4.872777571400089e-06, "loss": 0.6307, "step": 5916 }, { "epoch": 0.9041525002865111, "grad_norm": 0.3256695568561554, "learning_rate": 4.8573982973331486e-06, "loss": 0.9048, "step": 5917 }, { "epoch": 0.9043053061848187, "grad_norm": 0.4998894929885864, "learning_rate": 4.842042727246776e-06, "loss": 0.5458, "step": 5918 }, { "epoch": 0.9044581120831264, "grad_norm": 0.2989078164100647, "learning_rate": 4.826710864966666e-06, "loss": 0.7098, "step": 5919 }, { "epoch": 0.9046109179814341, "grad_norm": 0.2614652216434479, "learning_rate": 4.811402714312629e-06, "loss": 0.6025, "step": 5920 }, { "epoch": 0.9047637238797418, "grad_norm": 0.32760369777679443, "learning_rate": 4.796118279098593e-06, "loss": 0.7237, "step": 5921 }, { "epoch": 0.9049165297780495, "grad_norm": 0.2693394720554352, "learning_rate": 4.780857563132513e-06, "loss": 0.6758, "step": 5922 }, { "epoch": 0.9050693356763571, "grad_norm": 0.28089210391044617, "learning_rate": 4.7656205702164665e-06, "loss": 0.5614, "step": 5923 }, { "epoch": 0.9052221415746647, "grad_norm": 0.3213806450366974, "learning_rate": 4.750407304146642e-06, "loss": 0.815, "step": 5924 }, { "epoch": 0.9053749474729724, "grad_norm": 0.40935635566711426, "learning_rate": 4.735217768713296e-06, "loss": 0.7063, "step": 5925 }, { "epoch": 0.9055277533712801, "grad_norm": 0.3049255609512329, "learning_rate": 4.720051967700767e-06, "loss": 0.7845, "step": 5926 }, { "epoch": 0.9056805592695878, "grad_norm": 0.2561148703098297, "learning_rate": 4.704909904887478e-06, "loss": 0.6889, "step": 5927 }, { "epoch": 0.9058333651678955, "grad_norm": 0.3279137909412384, "learning_rate": 4.689791584045955e-06, "loss": 0.6985, "step": 5928 }, { "epoch": 0.9059861710662032, "grad_norm": 0.32065799832344055, "learning_rate": 4.6746970089428185e-06, "loss": 0.6057, "step": 5929 }, { "epoch": 0.9061389769645108, "grad_norm": 0.267499178647995, "learning_rate": 4.659626183338728e-06, "loss": 0.6368, "step": 5930 }, { "epoch": 0.9062917828628185, "grad_norm": 0.34940314292907715, "learning_rate": 4.644579110988456e-06, "loss": 0.861, "step": 5931 }, { "epoch": 0.9064445887611262, "grad_norm": 0.35976719856262207, "learning_rate": 4.629555795640872e-06, "loss": 0.6298, "step": 5932 }, { "epoch": 0.9065973946594339, "grad_norm": 0.3161333501338959, "learning_rate": 4.614556241038892e-06, "loss": 0.8894, "step": 5933 }, { "epoch": 0.9067502005577416, "grad_norm": 0.3225654363632202, "learning_rate": 4.599580450919538e-06, "loss": 0.5241, "step": 5934 }, { "epoch": 0.9069030064560492, "grad_norm": 0.3067881762981415, "learning_rate": 4.584628429013904e-06, "loss": 0.6733, "step": 5935 }, { "epoch": 0.9070558123543568, "grad_norm": 0.3025769591331482, "learning_rate": 4.569700179047165e-06, "loss": 0.7682, "step": 5936 }, { "epoch": 0.9072086182526645, "grad_norm": 0.279767781496048, "learning_rate": 4.5547957047385345e-06, "loss": 0.6389, "step": 5937 }, { "epoch": 0.9073614241509722, "grad_norm": 0.31643933057785034, "learning_rate": 4.539915009801376e-06, "loss": 0.5944, "step": 5938 }, { "epoch": 0.9075142300492799, "grad_norm": 0.3155369460582733, "learning_rate": 4.525058097943092e-06, "loss": 0.8315, "step": 5939 }, { "epoch": 0.9076670359475876, "grad_norm": 0.3218875527381897, "learning_rate": 4.51022497286514e-06, "loss": 0.6969, "step": 5940 }, { "epoch": 0.9078198418458953, "grad_norm": 0.27116522192955017, "learning_rate": 4.495415638263057e-06, "loss": 0.6819, "step": 5941 }, { "epoch": 0.9079726477442029, "grad_norm": 0.27992215752601624, "learning_rate": 4.480630097826477e-06, "loss": 0.7677, "step": 5942 }, { "epoch": 0.9081254536425106, "grad_norm": 0.3074599504470825, "learning_rate": 4.465868355239111e-06, "loss": 0.5406, "step": 5943 }, { "epoch": 0.9082782595408183, "grad_norm": 0.3808681070804596, "learning_rate": 4.451130414178706e-06, "loss": 0.8077, "step": 5944 }, { "epoch": 0.908431065439126, "grad_norm": 0.35826560854911804, "learning_rate": 4.4364162783170906e-06, "loss": 0.7406, "step": 5945 }, { "epoch": 0.9085838713374337, "grad_norm": 0.2683902680873871, "learning_rate": 4.421725951320177e-06, "loss": 0.5532, "step": 5946 }, { "epoch": 0.9087366772357413, "grad_norm": 0.2759500741958618, "learning_rate": 4.407059436847938e-06, "loss": 0.6203, "step": 5947 }, { "epoch": 0.9088894831340489, "grad_norm": 0.2844933867454529, "learning_rate": 4.392416738554417e-06, "loss": 0.6726, "step": 5948 }, { "epoch": 0.9090422890323566, "grad_norm": 0.3025042414665222, "learning_rate": 4.377797860087696e-06, "loss": 0.8128, "step": 5949 }, { "epoch": 0.9091950949306643, "grad_norm": 0.31045666337013245, "learning_rate": 4.363202805089972e-06, "loss": 0.7232, "step": 5950 }, { "epoch": 0.909347900828972, "grad_norm": 0.2824687659740448, "learning_rate": 4.348631577197459e-06, "loss": 0.7842, "step": 5951 }, { "epoch": 0.9095007067272797, "grad_norm": 0.3096737861633301, "learning_rate": 4.334084180040488e-06, "loss": 0.582, "step": 5952 }, { "epoch": 0.9096535126255874, "grad_norm": 0.2907276749610901, "learning_rate": 4.319560617243379e-06, "loss": 0.6811, "step": 5953 }, { "epoch": 0.909806318523895, "grad_norm": 0.28352272510528564, "learning_rate": 4.305060892424595e-06, "loss": 0.6629, "step": 5954 }, { "epoch": 0.9099591244222027, "grad_norm": 0.2939473092556, "learning_rate": 4.290585009196591e-06, "loss": 0.755, "step": 5955 }, { "epoch": 0.9101119303205104, "grad_norm": 0.28185558319091797, "learning_rate": 4.276132971165936e-06, "loss": 0.7723, "step": 5956 }, { "epoch": 0.9102647362188181, "grad_norm": 0.2669297754764557, "learning_rate": 4.261704781933218e-06, "loss": 0.7339, "step": 5957 }, { "epoch": 0.9104175421171257, "grad_norm": 0.28795671463012695, "learning_rate": 4.247300445093094e-06, "loss": 0.595, "step": 5958 }, { "epoch": 0.9105703480154334, "grad_norm": 0.265563428401947, "learning_rate": 4.232919964234294e-06, "loss": 0.6395, "step": 5959 }, { "epoch": 0.910723153913741, "grad_norm": 0.2824648916721344, "learning_rate": 4.218563342939586e-06, "loss": 0.7736, "step": 5960 }, { "epoch": 0.9108759598120487, "grad_norm": 0.28865858912467957, "learning_rate": 4.20423058478584e-06, "loss": 0.689, "step": 5961 }, { "epoch": 0.9110287657103564, "grad_norm": 0.25682947039604187, "learning_rate": 4.1899216933438904e-06, "loss": 0.625, "step": 5962 }, { "epoch": 0.9111815716086641, "grad_norm": 0.2924043834209442, "learning_rate": 4.1756366721786845e-06, "loss": 0.5827, "step": 5963 }, { "epoch": 0.9113343775069718, "grad_norm": 0.3125070333480835, "learning_rate": 4.161375524849253e-06, "loss": 0.7185, "step": 5964 }, { "epoch": 0.9114871834052795, "grad_norm": 0.3678252696990967, "learning_rate": 4.147138254908589e-06, "loss": 0.6279, "step": 5965 }, { "epoch": 0.9116399893035871, "grad_norm": 0.2505943477153778, "learning_rate": 4.132924865903842e-06, "loss": 0.5451, "step": 5966 }, { "epoch": 0.9117927952018948, "grad_norm": 0.2890109419822693, "learning_rate": 4.118735361376125e-06, "loss": 0.6725, "step": 5967 }, { "epoch": 0.9119456011002025, "grad_norm": 0.37433120608329773, "learning_rate": 4.104569744860642e-06, "loss": 0.8314, "step": 5968 }, { "epoch": 0.9120984069985102, "grad_norm": 0.39062103629112244, "learning_rate": 4.0904280198866274e-06, "loss": 0.5721, "step": 5969 }, { "epoch": 0.9122512128968178, "grad_norm": 0.2963562607765198, "learning_rate": 4.0763101899774056e-06, "loss": 0.7728, "step": 5970 }, { "epoch": 0.9124040187951254, "grad_norm": 0.35083603858947754, "learning_rate": 4.062216258650264e-06, "loss": 0.5537, "step": 5971 }, { "epoch": 0.9125568246934331, "grad_norm": 0.2698793113231659, "learning_rate": 4.048146229416639e-06, "loss": 0.7664, "step": 5972 }, { "epoch": 0.9127096305917408, "grad_norm": 0.25656360387802124, "learning_rate": 4.034100105781924e-06, "loss": 0.5758, "step": 5973 }, { "epoch": 0.9128624364900485, "grad_norm": 0.2732607126235962, "learning_rate": 4.020077891245622e-06, "loss": 0.8021, "step": 5974 }, { "epoch": 0.9130152423883562, "grad_norm": 0.2684570550918579, "learning_rate": 4.006079589301237e-06, "loss": 0.7484, "step": 5975 }, { "epoch": 0.9131680482866639, "grad_norm": 0.26051005721092224, "learning_rate": 3.992105203436303e-06, "loss": 0.8032, "step": 5976 }, { "epoch": 0.9133208541849716, "grad_norm": 0.2778724730014801, "learning_rate": 3.9781547371324555e-06, "loss": 0.5917, "step": 5977 }, { "epoch": 0.9134736600832792, "grad_norm": 0.30622413754463196, "learning_rate": 3.964228193865327e-06, "loss": 0.6081, "step": 5978 }, { "epoch": 0.9136264659815869, "grad_norm": 0.47084107995033264, "learning_rate": 3.950325577104597e-06, "loss": 0.5655, "step": 5979 }, { "epoch": 0.9137792718798946, "grad_norm": 0.3108175992965698, "learning_rate": 3.936446890313983e-06, "loss": 0.7595, "step": 5980 }, { "epoch": 0.9139320777782023, "grad_norm": 0.32348400354385376, "learning_rate": 3.9225921369512305e-06, "loss": 0.6049, "step": 5981 }, { "epoch": 0.9140848836765099, "grad_norm": 0.2693932354450226, "learning_rate": 3.90876132046818e-06, "loss": 0.559, "step": 5982 }, { "epoch": 0.9142376895748175, "grad_norm": 0.26965370774269104, "learning_rate": 3.894954444310617e-06, "loss": 0.5038, "step": 5983 }, { "epoch": 0.9143904954731252, "grad_norm": 0.370766818523407, "learning_rate": 3.881171511918424e-06, "loss": 0.888, "step": 5984 }, { "epoch": 0.9145433013714329, "grad_norm": 0.3055818974971771, "learning_rate": 3.8674125267255e-06, "loss": 0.6979, "step": 5985 }, { "epoch": 0.9146961072697406, "grad_norm": 0.3472610116004944, "learning_rate": 3.85367749215979e-06, "loss": 0.7484, "step": 5986 }, { "epoch": 0.9148489131680483, "grad_norm": 0.30086827278137207, "learning_rate": 3.83996641164327e-06, "loss": 0.7992, "step": 5987 }, { "epoch": 0.915001719066356, "grad_norm": 0.328401118516922, "learning_rate": 3.826279288591905e-06, "loss": 0.6501, "step": 5988 }, { "epoch": 0.9151545249646637, "grad_norm": 0.26785561442375183, "learning_rate": 3.812616126415769e-06, "loss": 0.7008, "step": 5989 }, { "epoch": 0.9153073308629713, "grad_norm": 0.2655331492424011, "learning_rate": 3.7989769285188823e-06, "loss": 0.7452, "step": 5990 }, { "epoch": 0.915460136761279, "grad_norm": 0.28456825017929077, "learning_rate": 3.7853616982993833e-06, "loss": 0.7024, "step": 5991 }, { "epoch": 0.9156129426595867, "grad_norm": 0.25299546122550964, "learning_rate": 3.771770439149347e-06, "loss": 0.6077, "step": 5992 }, { "epoch": 0.9157657485578944, "grad_norm": 0.33184099197387695, "learning_rate": 3.7582031544549643e-06, "loss": 0.6939, "step": 5993 }, { "epoch": 0.915918554456202, "grad_norm": 0.26723524928092957, "learning_rate": 3.744659847596366e-06, "loss": 0.6467, "step": 5994 }, { "epoch": 0.9160713603545096, "grad_norm": 0.2578485310077667, "learning_rate": 3.7311405219477846e-06, "loss": 0.6241, "step": 5995 }, { "epoch": 0.9162241662528173, "grad_norm": 0.26527437567710876, "learning_rate": 3.7176451808774603e-06, "loss": 0.9047, "step": 5996 }, { "epoch": 0.916376972151125, "grad_norm": 0.3009488582611084, "learning_rate": 3.704173827747592e-06, "loss": 0.7634, "step": 5997 }, { "epoch": 0.9165297780494327, "grad_norm": 0.3305479884147644, "learning_rate": 3.6907264659144846e-06, "loss": 0.7967, "step": 5998 }, { "epoch": 0.9166825839477404, "grad_norm": 0.4715300500392914, "learning_rate": 3.677303098728435e-06, "loss": 0.8379, "step": 5999 }, { "epoch": 0.9168353898460481, "grad_norm": 0.36732909083366394, "learning_rate": 3.66390372953378e-06, "loss": 0.7329, "step": 6000 }, { "epoch": 0.9169881957443557, "grad_norm": 0.3098233640193939, "learning_rate": 3.650528361668837e-06, "loss": 0.7784, "step": 6001 }, { "epoch": 0.9171410016426634, "grad_norm": 0.38401779532432556, "learning_rate": 3.6371769984659633e-06, "loss": 0.6848, "step": 6002 }, { "epoch": 0.9172938075409711, "grad_norm": 0.26952889561653137, "learning_rate": 3.6238496432515647e-06, "loss": 0.7228, "step": 6003 }, { "epoch": 0.9174466134392788, "grad_norm": 0.2722468376159668, "learning_rate": 3.610546299345996e-06, "loss": 0.7185, "step": 6004 }, { "epoch": 0.9175994193375865, "grad_norm": 0.32538071274757385, "learning_rate": 3.5972669700637173e-06, "loss": 0.7636, "step": 6005 }, { "epoch": 0.9177522252358941, "grad_norm": 0.2902810573577881, "learning_rate": 3.584011658713138e-06, "loss": 0.7217, "step": 6006 }, { "epoch": 0.9179050311342017, "grad_norm": 0.3095985949039459, "learning_rate": 3.5707803685967268e-06, "loss": 0.6745, "step": 6007 }, { "epoch": 0.9180578370325094, "grad_norm": 0.3057194948196411, "learning_rate": 3.557573103010925e-06, "loss": 0.8661, "step": 6008 }, { "epoch": 0.9182106429308171, "grad_norm": 0.2756378948688507, "learning_rate": 3.5443898652462336e-06, "loss": 0.7208, "step": 6009 }, { "epoch": 0.9183634488291248, "grad_norm": 0.2705722749233246, "learning_rate": 3.5312306585871147e-06, "loss": 0.7542, "step": 6010 }, { "epoch": 0.9185162547274325, "grad_norm": 0.3743616044521332, "learning_rate": 3.518095486312112e-06, "loss": 0.7122, "step": 6011 }, { "epoch": 0.9186690606257402, "grad_norm": 0.27353158593177795, "learning_rate": 3.5049843516937187e-06, "loss": 0.8823, "step": 6012 }, { "epoch": 0.9188218665240478, "grad_norm": 0.27861884236335754, "learning_rate": 3.491897257998478e-06, "loss": 0.5775, "step": 6013 }, { "epoch": 0.9189746724223555, "grad_norm": 0.3004818558692932, "learning_rate": 3.4788342084869364e-06, "loss": 0.5588, "step": 6014 }, { "epoch": 0.9191274783206632, "grad_norm": 0.25704312324523926, "learning_rate": 3.4657952064136025e-06, "loss": 0.7754, "step": 6015 }, { "epoch": 0.9192802842189709, "grad_norm": 0.3294251263141632, "learning_rate": 3.452780255027066e-06, "loss": 0.5161, "step": 6016 }, { "epoch": 0.9194330901172785, "grad_norm": 0.32235532999038696, "learning_rate": 3.4397893575699e-06, "loss": 0.8189, "step": 6017 }, { "epoch": 0.9195858960155862, "grad_norm": 0.2824956774711609, "learning_rate": 3.4268225172786605e-06, "loss": 0.8571, "step": 6018 }, { "epoch": 0.9197387019138938, "grad_norm": 0.33709898591041565, "learning_rate": 3.4138797373839292e-06, "loss": 0.8621, "step": 6019 }, { "epoch": 0.9198915078122015, "grad_norm": 0.26414015889167786, "learning_rate": 3.400961021110294e-06, "loss": 0.6458, "step": 6020 }, { "epoch": 0.9200443137105092, "grad_norm": 0.28258901834487915, "learning_rate": 3.388066371676346e-06, "loss": 0.6819, "step": 6021 }, { "epoch": 0.9201971196088169, "grad_norm": 0.2308105230331421, "learning_rate": 3.375195792294694e-06, "loss": 0.8169, "step": 6022 }, { "epoch": 0.9203499255071246, "grad_norm": 0.2916742265224457, "learning_rate": 3.3623492861718954e-06, "loss": 0.6812, "step": 6023 }, { "epoch": 0.9205027314054323, "grad_norm": 0.39090749621391296, "learning_rate": 3.349526856508567e-06, "loss": 0.7291, "step": 6024 }, { "epoch": 0.92065553730374, "grad_norm": 0.293911337852478, "learning_rate": 3.3367285064993315e-06, "loss": 0.6763, "step": 6025 }, { "epoch": 0.9208083432020476, "grad_norm": 0.3265632688999176, "learning_rate": 3.3239542393327717e-06, "loss": 0.708, "step": 6026 }, { "epoch": 0.9209611491003553, "grad_norm": 0.2923089563846588, "learning_rate": 3.311204058191486e-06, "loss": 0.8443, "step": 6027 }, { "epoch": 0.921113954998663, "grad_norm": 0.5083439350128174, "learning_rate": 3.2984779662520895e-06, "loss": 0.7357, "step": 6028 }, { "epoch": 0.9212667608969706, "grad_norm": 0.41934359073638916, "learning_rate": 3.2857759666851563e-06, "loss": 0.6068, "step": 6029 }, { "epoch": 0.9214195667952783, "grad_norm": 0.3560185730457306, "learning_rate": 3.2730980626553e-06, "loss": 0.8213, "step": 6030 }, { "epoch": 0.9215723726935859, "grad_norm": 0.26341885328292847, "learning_rate": 3.260444257321127e-06, "loss": 0.7186, "step": 6031 }, { "epoch": 0.9217251785918936, "grad_norm": 0.26726219058036804, "learning_rate": 3.2478145538352044e-06, "loss": 0.7076, "step": 6032 }, { "epoch": 0.9218779844902013, "grad_norm": 0.30211687088012695, "learning_rate": 3.2352089553441266e-06, "loss": 0.6466, "step": 6033 }, { "epoch": 0.922030790388509, "grad_norm": 0.28752097487449646, "learning_rate": 3.222627464988459e-06, "loss": 0.7951, "step": 6034 }, { "epoch": 0.9221835962868167, "grad_norm": 0.27376800775527954, "learning_rate": 3.210070085902794e-06, "loss": 0.6021, "step": 6035 }, { "epoch": 0.9223364021851244, "grad_norm": 0.27677032351493835, "learning_rate": 3.1975368212156965e-06, "loss": 0.5663, "step": 6036 }, { "epoch": 0.922489208083432, "grad_norm": 0.294606477022171, "learning_rate": 3.1850276740497007e-06, "loss": 0.7105, "step": 6037 }, { "epoch": 0.9226420139817397, "grad_norm": 0.288740873336792, "learning_rate": 3.1725426475213817e-06, "loss": 0.6642, "step": 6038 }, { "epoch": 0.9227948198800474, "grad_norm": 0.2658214569091797, "learning_rate": 3.1600817447412613e-06, "loss": 0.6853, "step": 6039 }, { "epoch": 0.9229476257783551, "grad_norm": 0.3108449876308441, "learning_rate": 3.1476449688138896e-06, "loss": 0.6996, "step": 6040 }, { "epoch": 0.9231004316766627, "grad_norm": 0.320084810256958, "learning_rate": 3.1352323228377556e-06, "loss": 0.4263, "step": 6041 }, { "epoch": 0.9232532375749704, "grad_norm": 0.2856956124305725, "learning_rate": 3.1228438099053956e-06, "loss": 0.6659, "step": 6042 }, { "epoch": 0.923406043473278, "grad_norm": 0.3506624400615692, "learning_rate": 3.110479433103286e-06, "loss": 0.6637, "step": 6043 }, { "epoch": 0.9235588493715857, "grad_norm": 0.2929691970348358, "learning_rate": 3.0981391955119065e-06, "loss": 0.5573, "step": 6044 }, { "epoch": 0.9237116552698934, "grad_norm": 0.28566694259643555, "learning_rate": 3.0858231002057313e-06, "loss": 0.7845, "step": 6045 }, { "epoch": 0.9238644611682011, "grad_norm": 0.2971111536026001, "learning_rate": 3.073531150253217e-06, "loss": 0.6768, "step": 6046 }, { "epoch": 0.9240172670665088, "grad_norm": 0.2790727913379669, "learning_rate": 3.0612633487167807e-06, "loss": 0.7252, "step": 6047 }, { "epoch": 0.9241700729648165, "grad_norm": 0.2813704013824463, "learning_rate": 3.0490196986528664e-06, "loss": 0.594, "step": 6048 }, { "epoch": 0.9243228788631241, "grad_norm": 0.31998634338378906, "learning_rate": 3.0368002031118446e-06, "loss": 0.5821, "step": 6049 }, { "epoch": 0.9244756847614318, "grad_norm": 0.27246496081352234, "learning_rate": 3.0246048651381367e-06, "loss": 0.7559, "step": 6050 }, { "epoch": 0.9246284906597395, "grad_norm": 0.2767251431941986, "learning_rate": 3.0124336877700775e-06, "loss": 0.5597, "step": 6051 }, { "epoch": 0.9247812965580472, "grad_norm": 0.3764009475708008, "learning_rate": 3.0002866740400427e-06, "loss": 0.6798, "step": 6052 }, { "epoch": 0.9249341024563548, "grad_norm": 0.30215245485305786, "learning_rate": 2.988163826974344e-06, "loss": 0.6884, "step": 6053 }, { "epoch": 0.9250869083546625, "grad_norm": 0.4687015414237976, "learning_rate": 2.9760651495932766e-06, "loss": 0.6384, "step": 6054 }, { "epoch": 0.9252397142529701, "grad_norm": 0.33076927065849304, "learning_rate": 2.96399064491113e-06, "loss": 0.7587, "step": 6055 }, { "epoch": 0.9253925201512778, "grad_norm": 0.26885855197906494, "learning_rate": 2.9519403159361746e-06, "loss": 0.7553, "step": 6056 }, { "epoch": 0.9255453260495855, "grad_norm": 0.2613165080547333, "learning_rate": 2.939914165670665e-06, "loss": 0.7376, "step": 6057 }, { "epoch": 0.9256981319478932, "grad_norm": 0.37895190715789795, "learning_rate": 2.9279121971107716e-06, "loss": 0.4247, "step": 6058 }, { "epoch": 0.9258509378462009, "grad_norm": 0.3495396673679352, "learning_rate": 2.9159344132467014e-06, "loss": 0.7048, "step": 6059 }, { "epoch": 0.9260037437445086, "grad_norm": 0.3031073808670044, "learning_rate": 2.903980817062646e-06, "loss": 0.7942, "step": 6060 }, { "epoch": 0.9261565496428162, "grad_norm": 0.3974023759365082, "learning_rate": 2.8920514115367113e-06, "loss": 0.6494, "step": 6061 }, { "epoch": 0.9263093555411239, "grad_norm": 0.35635390877723694, "learning_rate": 2.8801461996410207e-06, "loss": 0.9328, "step": 6062 }, { "epoch": 0.9264621614394316, "grad_norm": 0.31114310026168823, "learning_rate": 2.8682651843416563e-06, "loss": 0.8144, "step": 6063 }, { "epoch": 0.9266149673377392, "grad_norm": 0.2945016622543335, "learning_rate": 2.8564083685986843e-06, "loss": 0.5831, "step": 6064 }, { "epoch": 0.9267677732360469, "grad_norm": 0.29071712493896484, "learning_rate": 2.844575755366108e-06, "loss": 0.7598, "step": 6065 }, { "epoch": 0.9269205791343545, "grad_norm": 0.3301604986190796, "learning_rate": 2.832767347591936e-06, "loss": 0.7241, "step": 6066 }, { "epoch": 0.9270733850326622, "grad_norm": 0.34881338477134705, "learning_rate": 2.8209831482181483e-06, "loss": 0.9521, "step": 6067 }, { "epoch": 0.9272261909309699, "grad_norm": 0.27328556776046753, "learning_rate": 2.8092231601806517e-06, "loss": 0.6417, "step": 6068 }, { "epoch": 0.9273789968292776, "grad_norm": 0.3333223760128021, "learning_rate": 2.797487386409359e-06, "loss": 0.8499, "step": 6069 }, { "epoch": 0.9275318027275853, "grad_norm": 0.3463587462902069, "learning_rate": 2.785775829828152e-06, "loss": 0.7211, "step": 6070 }, { "epoch": 0.927684608625893, "grad_norm": 0.35262081027030945, "learning_rate": 2.7740884933548538e-06, "loss": 1.0996, "step": 6071 }, { "epoch": 0.9278374145242007, "grad_norm": 0.311922162771225, "learning_rate": 2.762425379901268e-06, "loss": 0.865, "step": 6072 }, { "epoch": 0.9279902204225083, "grad_norm": 0.26930543780326843, "learning_rate": 2.7507864923731584e-06, "loss": 0.5989, "step": 6073 }, { "epoch": 0.928143026320816, "grad_norm": 0.3048146069049835, "learning_rate": 2.739171833670262e-06, "loss": 0.637, "step": 6074 }, { "epoch": 0.9282958322191237, "grad_norm": 0.28027278184890747, "learning_rate": 2.727581406686286e-06, "loss": 0.7139, "step": 6075 }, { "epoch": 0.9284486381174313, "grad_norm": 0.28340592980384827, "learning_rate": 2.7160152143088535e-06, "loss": 0.7486, "step": 6076 }, { "epoch": 0.928601444015739, "grad_norm": 0.4058891236782074, "learning_rate": 2.7044732594196152e-06, "loss": 0.8938, "step": 6077 }, { "epoch": 0.9287542499140466, "grad_norm": 0.28652113676071167, "learning_rate": 2.692955544894149e-06, "loss": 0.5256, "step": 6078 }, { "epoch": 0.9289070558123543, "grad_norm": 1.982176661491394, "learning_rate": 2.6814620736019813e-06, "loss": 0.7787, "step": 6079 }, { "epoch": 0.929059861710662, "grad_norm": 0.26007115840911865, "learning_rate": 2.6699928484066217e-06, "loss": 0.6662, "step": 6080 }, { "epoch": 0.9292126676089697, "grad_norm": 0.2802926003932953, "learning_rate": 2.65854787216554e-06, "loss": 0.8371, "step": 6081 }, { "epoch": 0.9293654735072774, "grad_norm": 0.26776376366615295, "learning_rate": 2.647127147730133e-06, "loss": 0.6102, "step": 6082 }, { "epoch": 0.9295182794055851, "grad_norm": 0.3064311146736145, "learning_rate": 2.6357306779458133e-06, "loss": 0.6569, "step": 6083 }, { "epoch": 0.9296710853038928, "grad_norm": 0.3474329113960266, "learning_rate": 2.624358465651877e-06, "loss": 0.7091, "step": 6084 }, { "epoch": 0.9298238912022004, "grad_norm": 0.28256288170814514, "learning_rate": 2.613010513681646e-06, "loss": 0.6321, "step": 6085 }, { "epoch": 0.9299766971005081, "grad_norm": 0.2918296754360199, "learning_rate": 2.6016868248623482e-06, "loss": 0.6811, "step": 6086 }, { "epoch": 0.9301295029988158, "grad_norm": 0.29972580075263977, "learning_rate": 2.590387402015193e-06, "loss": 0.7267, "step": 6087 }, { "epoch": 0.9302823088971234, "grad_norm": 0.3669845461845398, "learning_rate": 2.5791122479553507e-06, "loss": 0.6745, "step": 6088 }, { "epoch": 0.9304351147954311, "grad_norm": 0.31043675541877747, "learning_rate": 2.567861365491908e-06, "loss": 0.7701, "step": 6089 }, { "epoch": 0.9305879206937387, "grad_norm": 0.30671799182891846, "learning_rate": 2.5566347574279337e-06, "loss": 0.5834, "step": 6090 }, { "epoch": 0.9307407265920464, "grad_norm": 0.42947250604629517, "learning_rate": 2.5454324265604456e-06, "loss": 0.7175, "step": 6091 }, { "epoch": 0.9308935324903541, "grad_norm": 0.3376956582069397, "learning_rate": 2.5342543756804226e-06, "loss": 0.4594, "step": 6092 }, { "epoch": 0.9310463383886618, "grad_norm": 0.2570996880531311, "learning_rate": 2.5231006075727592e-06, "loss": 0.8667, "step": 6093 }, { "epoch": 0.9311991442869695, "grad_norm": 0.28008586168289185, "learning_rate": 2.5119711250163325e-06, "loss": 0.8293, "step": 6094 }, { "epoch": 0.9313519501852772, "grad_norm": 1.2276524305343628, "learning_rate": 2.5008659307839577e-06, "loss": 0.9234, "step": 6095 }, { "epoch": 0.9315047560835849, "grad_norm": 0.3057681620121002, "learning_rate": 2.489785027642422e-06, "loss": 0.5919, "step": 6096 }, { "epoch": 0.9316575619818925, "grad_norm": 0.2838587760925293, "learning_rate": 2.478728418352416e-06, "loss": 0.6622, "step": 6097 }, { "epoch": 0.9318103678802002, "grad_norm": 0.2826572358608246, "learning_rate": 2.4676961056686045e-06, "loss": 0.7228, "step": 6098 }, { "epoch": 0.9319631737785079, "grad_norm": 0.30671226978302, "learning_rate": 2.4566880923395985e-06, "loss": 0.6242, "step": 6099 }, { "epoch": 0.9321159796768155, "grad_norm": 0.31722599267959595, "learning_rate": 2.4457043811079495e-06, "loss": 0.6481, "step": 6100 }, { "epoch": 0.9322687855751232, "grad_norm": 0.32529863715171814, "learning_rate": 2.434744974710168e-06, "loss": 0.7273, "step": 6101 }, { "epoch": 0.9324215914734308, "grad_norm": 0.3722645044326782, "learning_rate": 2.4238098758766816e-06, "loss": 0.6426, "step": 6102 }, { "epoch": 0.9325743973717385, "grad_norm": 0.3125240206718445, "learning_rate": 2.412899087331888e-06, "loss": 0.7518, "step": 6103 }, { "epoch": 0.9327272032700462, "grad_norm": 0.26976755261421204, "learning_rate": 2.4020126117941134e-06, "loss": 0.6159, "step": 6104 }, { "epoch": 0.9328800091683539, "grad_norm": 0.30735623836517334, "learning_rate": 2.3911504519756435e-06, "loss": 0.6861, "step": 6105 }, { "epoch": 0.9330328150666616, "grad_norm": 0.2629193663597107, "learning_rate": 2.380312610582691e-06, "loss": 0.7826, "step": 6106 }, { "epoch": 0.9331856209649693, "grad_norm": 0.30757153034210205, "learning_rate": 2.3694990903153857e-06, "loss": 0.7173, "step": 6107 }, { "epoch": 0.933338426863277, "grad_norm": 0.44342416524887085, "learning_rate": 2.358709893867861e-06, "loss": 0.6669, "step": 6108 }, { "epoch": 0.9334912327615846, "grad_norm": 0.28203803300857544, "learning_rate": 2.3479450239281443e-06, "loss": 0.5641, "step": 6109 }, { "epoch": 0.9336440386598923, "grad_norm": 0.284018337726593, "learning_rate": 2.3372044831782125e-06, "loss": 0.6156, "step": 6110 }, { "epoch": 0.9337968445582, "grad_norm": 0.34571659564971924, "learning_rate": 2.3264882742939697e-06, "loss": 0.6791, "step": 6111 }, { "epoch": 0.9339496504565076, "grad_norm": 0.253670334815979, "learning_rate": 2.3157963999452804e-06, "loss": 0.8092, "step": 6112 }, { "epoch": 0.9341024563548153, "grad_norm": 0.3017883598804474, "learning_rate": 2.3051288627959357e-06, "loss": 0.5998, "step": 6113 }, { "epoch": 0.9342552622531229, "grad_norm": 0.2832029163837433, "learning_rate": 2.294485665503665e-06, "loss": 0.7049, "step": 6114 }, { "epoch": 0.9344080681514306, "grad_norm": 0.30555370450019836, "learning_rate": 2.2838668107201143e-06, "loss": 0.8306, "step": 6115 }, { "epoch": 0.9345608740497383, "grad_norm": 0.28199902176856995, "learning_rate": 2.2732723010909007e-06, "loss": 0.6949, "step": 6116 }, { "epoch": 0.934713679948046, "grad_norm": 0.29065364599227905, "learning_rate": 2.262702139255557e-06, "loss": 0.8368, "step": 6117 }, { "epoch": 0.9348664858463537, "grad_norm": 0.5368300676345825, "learning_rate": 2.252156327847543e-06, "loss": 0.6388, "step": 6118 }, { "epoch": 0.9350192917446614, "grad_norm": 0.3096945583820343, "learning_rate": 2.2416348694942467e-06, "loss": 0.755, "step": 6119 }, { "epoch": 0.935172097642969, "grad_norm": 0.3204090893268585, "learning_rate": 2.2311377668170265e-06, "loss": 0.8538, "step": 6120 }, { "epoch": 0.9353249035412767, "grad_norm": 0.30497118830680847, "learning_rate": 2.2206650224311344e-06, "loss": 0.6324, "step": 6121 }, { "epoch": 0.9354777094395844, "grad_norm": 0.3680538237094879, "learning_rate": 2.2102166389457614e-06, "loss": 0.7133, "step": 6122 }, { "epoch": 0.935630515337892, "grad_norm": 0.24908339977264404, "learning_rate": 2.1997926189640584e-06, "loss": 0.6931, "step": 6123 }, { "epoch": 0.9357833212361997, "grad_norm": 0.28190726041793823, "learning_rate": 2.189392965083059e-06, "loss": 0.7671, "step": 6124 }, { "epoch": 0.9359361271345074, "grad_norm": 0.4661271274089813, "learning_rate": 2.179017679893747e-06, "loss": 0.6609, "step": 6125 }, { "epoch": 0.936088933032815, "grad_norm": 0.29947930574417114, "learning_rate": 2.168666765981053e-06, "loss": 0.638, "step": 6126 }, { "epoch": 0.9362417389311227, "grad_norm": 0.2782745063304901, "learning_rate": 2.1583402259238163e-06, "loss": 0.7907, "step": 6127 }, { "epoch": 0.9363945448294304, "grad_norm": 0.38503745198249817, "learning_rate": 2.1480380622948105e-06, "loss": 0.7776, "step": 6128 }, { "epoch": 0.9365473507277381, "grad_norm": 0.3161745071411133, "learning_rate": 2.1377602776607165e-06, "loss": 0.5808, "step": 6129 }, { "epoch": 0.9367001566260458, "grad_norm": 0.3757038712501526, "learning_rate": 2.1275068745821748e-06, "loss": 0.5165, "step": 6130 }, { "epoch": 0.9368529625243535, "grad_norm": 0.32424384355545044, "learning_rate": 2.1172778556137307e-06, "loss": 0.5642, "step": 6131 }, { "epoch": 0.9370057684226611, "grad_norm": 0.3121247887611389, "learning_rate": 2.107073223303857e-06, "loss": 0.5107, "step": 6132 }, { "epoch": 0.9371585743209688, "grad_norm": 0.48844408988952637, "learning_rate": 2.0968929801949533e-06, "loss": 0.5614, "step": 6133 }, { "epoch": 0.9373113802192765, "grad_norm": 0.30841362476348877, "learning_rate": 2.086737128823335e-06, "loss": 0.8262, "step": 6134 }, { "epoch": 0.9374641861175841, "grad_norm": 0.34776777029037476, "learning_rate": 2.0766056717192674e-06, "loss": 0.8351, "step": 6135 }, { "epoch": 0.9376169920158918, "grad_norm": 0.2539510130882263, "learning_rate": 2.0664986114068974e-06, "loss": 0.542, "step": 6136 }, { "epoch": 0.9377697979141995, "grad_norm": 0.2508013844490051, "learning_rate": 2.0564159504043112e-06, "loss": 0.8223, "step": 6137 }, { "epoch": 0.9379226038125071, "grad_norm": 0.27509501576423645, "learning_rate": 2.046357691223544e-06, "loss": 0.8044, "step": 6138 }, { "epoch": 0.9380754097108148, "grad_norm": 0.29400357604026794, "learning_rate": 2.036323836370502e-06, "loss": 0.6546, "step": 6139 }, { "epoch": 0.9382282156091225, "grad_norm": 0.2571744918823242, "learning_rate": 2.0263143883450406e-06, "loss": 0.7295, "step": 6140 }, { "epoch": 0.9383810215074302, "grad_norm": 0.264504998922348, "learning_rate": 2.016329349640944e-06, "loss": 0.821, "step": 6141 }, { "epoch": 0.9385338274057379, "grad_norm": 0.29346346855163574, "learning_rate": 2.006368722745888e-06, "loss": 0.7786, "step": 6142 }, { "epoch": 0.9386866333040456, "grad_norm": 0.2435157150030136, "learning_rate": 1.996432510141477e-06, "loss": 0.681, "step": 6143 }, { "epoch": 0.9388394392023532, "grad_norm": 0.30916231870651245, "learning_rate": 1.9865207143032525e-06, "loss": 0.7043, "step": 6144 }, { "epoch": 0.9389922451006609, "grad_norm": 0.32135826349258423, "learning_rate": 1.9766333377006398e-06, "loss": 0.6599, "step": 6145 }, { "epoch": 0.9391450509989686, "grad_norm": 0.31256183981895447, "learning_rate": 1.9667703827969897e-06, "loss": 0.8267, "step": 6146 }, { "epoch": 0.9392978568972762, "grad_norm": 0.32118478417396545, "learning_rate": 1.9569318520495817e-06, "loss": 0.6224, "step": 6147 }, { "epoch": 0.9394506627955839, "grad_norm": 0.2773655652999878, "learning_rate": 1.94711774790961e-06, "loss": 0.7065, "step": 6148 }, { "epoch": 0.9396034686938916, "grad_norm": 0.3196839690208435, "learning_rate": 1.9373280728221863e-06, "loss": 0.6748, "step": 6149 }, { "epoch": 0.9397562745921992, "grad_norm": 0.2970350682735443, "learning_rate": 1.9275628292262926e-06, "loss": 0.954, "step": 6150 }, { "epoch": 0.9399090804905069, "grad_norm": 0.33877527713775635, "learning_rate": 1.9178220195548824e-06, "loss": 0.6657, "step": 6151 }, { "epoch": 0.9400618863888146, "grad_norm": 0.31130823493003845, "learning_rate": 1.9081056462347924e-06, "loss": 0.5888, "step": 6152 }, { "epoch": 0.9402146922871223, "grad_norm": 0.3352448344230652, "learning_rate": 1.898413711686764e-06, "loss": 0.5394, "step": 6153 }, { "epoch": 0.94036749818543, "grad_norm": 0.27089211344718933, "learning_rate": 1.8887462183254878e-06, "loss": 0.7617, "step": 6154 }, { "epoch": 0.9405203040837377, "grad_norm": 0.4040941894054413, "learning_rate": 1.879103168559504e-06, "loss": 0.6088, "step": 6155 }, { "epoch": 0.9406731099820453, "grad_norm": 0.2687217593193054, "learning_rate": 1.869484564791335e-06, "loss": 0.7469, "step": 6156 }, { "epoch": 0.940825915880353, "grad_norm": 0.31856074929237366, "learning_rate": 1.8598904094173308e-06, "loss": 0.4672, "step": 6157 }, { "epoch": 0.9409787217786607, "grad_norm": 0.30012932419776917, "learning_rate": 1.8503207048278348e-06, "loss": 0.7236, "step": 6158 }, { "epoch": 0.9411315276769683, "grad_norm": 0.27605703473091125, "learning_rate": 1.8407754534070398e-06, "loss": 0.6358, "step": 6159 }, { "epoch": 0.941284333575276, "grad_norm": 0.2866106629371643, "learning_rate": 1.831254657533077e-06, "loss": 0.5922, "step": 6160 }, { "epoch": 0.9414371394735837, "grad_norm": 0.3067355155944824, "learning_rate": 1.8217583195779485e-06, "loss": 0.7558, "step": 6161 }, { "epoch": 0.9415899453718913, "grad_norm": 0.23551535606384277, "learning_rate": 1.812286441907618e-06, "loss": 0.6481, "step": 6162 }, { "epoch": 0.941742751270199, "grad_norm": 0.508856475353241, "learning_rate": 1.8028390268818973e-06, "loss": 0.6483, "step": 6163 }, { "epoch": 0.9418955571685067, "grad_norm": 0.2898612916469574, "learning_rate": 1.7934160768545372e-06, "loss": 0.5378, "step": 6164 }, { "epoch": 0.9420483630668144, "grad_norm": 0.28692492842674255, "learning_rate": 1.7840175941732041e-06, "loss": 0.5875, "step": 6165 }, { "epoch": 0.9422011689651221, "grad_norm": 0.2957552671432495, "learning_rate": 1.774643581179436e-06, "loss": 0.5997, "step": 6166 }, { "epoch": 0.9423539748634298, "grad_norm": 0.27881738543510437, "learning_rate": 1.7652940402086872e-06, "loss": 0.7217, "step": 6167 }, { "epoch": 0.9425067807617374, "grad_norm": 0.42624831199645996, "learning_rate": 1.7559689735903273e-06, "loss": 0.7888, "step": 6168 }, { "epoch": 0.9426595866600451, "grad_norm": 0.25755804777145386, "learning_rate": 1.7466683836476093e-06, "loss": 0.966, "step": 6169 }, { "epoch": 0.9428123925583528, "grad_norm": 0.26354870200157166, "learning_rate": 1.737392272697702e-06, "loss": 0.76, "step": 6170 }, { "epoch": 0.9429651984566604, "grad_norm": 0.28261834383010864, "learning_rate": 1.728140643051679e-06, "loss": 0.8941, "step": 6171 }, { "epoch": 0.9431180043549681, "grad_norm": 0.25513985753059387, "learning_rate": 1.7189134970144848e-06, "loss": 0.8031, "step": 6172 }, { "epoch": 0.9432708102532757, "grad_norm": 0.3026675283908844, "learning_rate": 1.7097108368849923e-06, "loss": 0.5943, "step": 6173 }, { "epoch": 0.9434236161515834, "grad_norm": 0.37587878108024597, "learning_rate": 1.7005326649559893e-06, "loss": 0.7718, "step": 6174 }, { "epoch": 0.9435764220498911, "grad_norm": 0.3272812068462372, "learning_rate": 1.6913789835141135e-06, "loss": 0.6088, "step": 6175 }, { "epoch": 0.9437292279481988, "grad_norm": 0.28328678011894226, "learning_rate": 1.6822497948399407e-06, "loss": 0.6142, "step": 6176 }, { "epoch": 0.9438820338465065, "grad_norm": 0.4213845133781433, "learning_rate": 1.6731451012079292e-06, "loss": 0.8319, "step": 6177 }, { "epoch": 0.9440348397448142, "grad_norm": 0.2808535695075989, "learning_rate": 1.664064904886431e-06, "loss": 0.6973, "step": 6178 }, { "epoch": 0.9441876456431219, "grad_norm": 0.2822289764881134, "learning_rate": 1.6550092081377034e-06, "loss": 0.6103, "step": 6179 }, { "epoch": 0.9443404515414295, "grad_norm": 0.28763696551322937, "learning_rate": 1.645978013217908e-06, "loss": 0.8216, "step": 6180 }, { "epoch": 0.9444932574397372, "grad_norm": 0.28879836201667786, "learning_rate": 1.6369713223770788e-06, "loss": 0.6061, "step": 6181 }, { "epoch": 0.9446460633380448, "grad_norm": 0.39242953062057495, "learning_rate": 1.627989137859165e-06, "loss": 0.7673, "step": 6182 }, { "epoch": 0.9447988692363525, "grad_norm": 0.29807108640670776, "learning_rate": 1.6190314619019876e-06, "loss": 0.7077, "step": 6183 }, { "epoch": 0.9449516751346602, "grad_norm": 0.2821442484855652, "learning_rate": 1.6100982967373058e-06, "loss": 0.759, "step": 6184 }, { "epoch": 0.9451044810329678, "grad_norm": 0.31379956007003784, "learning_rate": 1.6011896445907171e-06, "loss": 0.6476, "step": 6185 }, { "epoch": 0.9452572869312755, "grad_norm": 0.276607871055603, "learning_rate": 1.592305507681735e-06, "loss": 0.8936, "step": 6186 }, { "epoch": 0.9454100928295832, "grad_norm": 0.468855619430542, "learning_rate": 1.583445888223778e-06, "loss": 0.6336, "step": 6187 }, { "epoch": 0.9455628987278909, "grad_norm": 0.2827998995780945, "learning_rate": 1.574610788424158e-06, "loss": 0.6883, "step": 6188 }, { "epoch": 0.9457157046261986, "grad_norm": 0.25603756308555603, "learning_rate": 1.5658002104840586e-06, "loss": 0.6325, "step": 6189 }, { "epoch": 0.9458685105245063, "grad_norm": 0.27887246012687683, "learning_rate": 1.5570141565985353e-06, "loss": 0.6751, "step": 6190 }, { "epoch": 0.946021316422814, "grad_norm": 1.5862551927566528, "learning_rate": 1.5482526289565924e-06, "loss": 0.6279, "step": 6191 }, { "epoch": 0.9461741223211216, "grad_norm": 0.28105485439300537, "learning_rate": 1.539515629741084e-06, "loss": 0.7632, "step": 6192 }, { "epoch": 0.9463269282194293, "grad_norm": 0.4909925162792206, "learning_rate": 1.5308031611287466e-06, "loss": 0.8598, "step": 6193 }, { "epoch": 0.9464797341177369, "grad_norm": 0.26141437888145447, "learning_rate": 1.5221152252902215e-06, "loss": 0.675, "step": 6194 }, { "epoch": 0.9466325400160446, "grad_norm": 0.37421944737434387, "learning_rate": 1.5134518243900552e-06, "loss": 0.859, "step": 6195 }, { "epoch": 0.9467853459143523, "grad_norm": 0.2938391864299774, "learning_rate": 1.5048129605866433e-06, "loss": 0.7082, "step": 6196 }, { "epoch": 0.94693815181266, "grad_norm": 0.23865161836147308, "learning_rate": 1.4961986360322867e-06, "loss": 0.6739, "step": 6197 }, { "epoch": 0.9470909577109676, "grad_norm": 0.29308661818504333, "learning_rate": 1.487608852873168e-06, "loss": 0.6938, "step": 6198 }, { "epoch": 0.9472437636092753, "grad_norm": 0.3156602084636688, "learning_rate": 1.4790436132493757e-06, "loss": 0.7344, "step": 6199 }, { "epoch": 0.947396569507583, "grad_norm": 0.27409300208091736, "learning_rate": 1.4705029192948584e-06, "loss": 0.8084, "step": 6200 }, { "epoch": 0.9475493754058907, "grad_norm": 0.29559117555618286, "learning_rate": 1.4619867731374581e-06, "loss": 0.6225, "step": 6201 }, { "epoch": 0.9477021813041984, "grad_norm": 0.24822796881198883, "learning_rate": 1.4534951768989002e-06, "loss": 0.6979, "step": 6202 }, { "epoch": 0.947854987202506, "grad_norm": 0.2914453446865082, "learning_rate": 1.4450281326947922e-06, "loss": 0.7604, "step": 6203 }, { "epoch": 0.9480077931008137, "grad_norm": 0.25838714838027954, "learning_rate": 1.4365856426346248e-06, "loss": 0.6905, "step": 6204 }, { "epoch": 0.9481605989991214, "grad_norm": 0.2767367362976074, "learning_rate": 1.4281677088217925e-06, "loss": 0.6708, "step": 6205 }, { "epoch": 0.948313404897429, "grad_norm": 0.2727099657058716, "learning_rate": 1.4197743333535407e-06, "loss": 0.7106, "step": 6206 }, { "epoch": 0.9484662107957367, "grad_norm": 0.3296019434928894, "learning_rate": 1.4114055183209961e-06, "loss": 0.644, "step": 6207 }, { "epoch": 0.9486190166940444, "grad_norm": 0.5572933554649353, "learning_rate": 1.4030612658091913e-06, "loss": 0.7392, "step": 6208 }, { "epoch": 0.948771822592352, "grad_norm": 0.3326074481010437, "learning_rate": 1.3947415778970296e-06, "loss": 0.7055, "step": 6209 }, { "epoch": 0.9489246284906597, "grad_norm": 0.35185036063194275, "learning_rate": 1.3864464566572865e-06, "loss": 0.7567, "step": 6210 }, { "epoch": 0.9490774343889674, "grad_norm": 0.2703647017478943, "learning_rate": 1.37817590415662e-06, "loss": 0.7198, "step": 6211 }, { "epoch": 0.9492302402872751, "grad_norm": 0.2500843405723572, "learning_rate": 1.3699299224555707e-06, "loss": 0.6154, "step": 6212 }, { "epoch": 0.9493830461855828, "grad_norm": 0.24339242279529572, "learning_rate": 1.3617085136085617e-06, "loss": 0.6167, "step": 6213 }, { "epoch": 0.9495358520838905, "grad_norm": 0.24041394889354706, "learning_rate": 1.3535116796638768e-06, "loss": 0.7468, "step": 6214 }, { "epoch": 0.9496886579821981, "grad_norm": 0.2889711558818817, "learning_rate": 1.345339422663705e-06, "loss": 0.6913, "step": 6215 }, { "epoch": 0.9498414638805058, "grad_norm": 0.3345462381839752, "learning_rate": 1.337191744644084e-06, "loss": 0.9103, "step": 6216 }, { "epoch": 0.9499942697788135, "grad_norm": 0.29227215051651, "learning_rate": 1.3290686476349234e-06, "loss": 0.6628, "step": 6217 }, { "epoch": 0.9501470756771211, "grad_norm": 0.28349965810775757, "learning_rate": 1.3209701336600488e-06, "loss": 0.6475, "step": 6218 }, { "epoch": 0.9502998815754288, "grad_norm": 0.2800476551055908, "learning_rate": 1.3128962047371463e-06, "loss": 0.9071, "step": 6219 }, { "epoch": 0.9504526874737365, "grad_norm": 0.27852532267570496, "learning_rate": 1.30484686287774e-06, "loss": 0.6444, "step": 6220 }, { "epoch": 0.9506054933720441, "grad_norm": 0.3461797535419464, "learning_rate": 1.296822110087259e-06, "loss": 0.7275, "step": 6221 }, { "epoch": 0.9507582992703518, "grad_norm": 0.2584891617298126, "learning_rate": 1.2888219483650043e-06, "loss": 0.6953, "step": 6222 }, { "epoch": 0.9509111051686595, "grad_norm": 0.25109121203422546, "learning_rate": 1.2808463797041703e-06, "loss": 0.5831, "step": 6223 }, { "epoch": 0.9510639110669672, "grad_norm": 0.2807595729827881, "learning_rate": 1.2728954060917898e-06, "loss": 0.6815, "step": 6224 }, { "epoch": 0.9512167169652749, "grad_norm": 0.3147951662540436, "learning_rate": 1.264969029508778e-06, "loss": 0.578, "step": 6225 }, { "epoch": 0.9513695228635826, "grad_norm": 0.2849219739437103, "learning_rate": 1.257067251929911e-06, "loss": 0.7381, "step": 6226 }, { "epoch": 0.9515223287618902, "grad_norm": 0.2966640293598175, "learning_rate": 1.2491900753238806e-06, "loss": 0.5892, "step": 6227 }, { "epoch": 0.9516751346601979, "grad_norm": 0.3076305687427521, "learning_rate": 1.2413375016532058e-06, "loss": 0.8709, "step": 6228 }, { "epoch": 0.9518279405585055, "grad_norm": 0.2656288743019104, "learning_rate": 1.2335095328742885e-06, "loss": 0.7306, "step": 6229 }, { "epoch": 0.9519807464568132, "grad_norm": 0.293072909116745, "learning_rate": 1.2257061709373907e-06, "loss": 0.7178, "step": 6230 }, { "epoch": 0.9521335523551209, "grad_norm": 0.3086521327495575, "learning_rate": 1.2179274177866796e-06, "loss": 0.6118, "step": 6231 }, { "epoch": 0.9522863582534286, "grad_norm": 0.31312182545661926, "learning_rate": 1.210173275360138e-06, "loss": 0.7736, "step": 6232 }, { "epoch": 0.9524391641517362, "grad_norm": 0.30142563581466675, "learning_rate": 1.2024437455896653e-06, "loss": 0.6248, "step": 6233 }, { "epoch": 0.9525919700500439, "grad_norm": 0.2886991798877716, "learning_rate": 1.19473883040101e-06, "loss": 0.7147, "step": 6234 }, { "epoch": 0.9527447759483516, "grad_norm": 0.27349576354026794, "learning_rate": 1.1870585317137583e-06, "loss": 0.9949, "step": 6235 }, { "epoch": 0.9528975818466593, "grad_norm": 0.26117628812789917, "learning_rate": 1.1794028514414356e-06, "loss": 0.749, "step": 6236 }, { "epoch": 0.953050387744967, "grad_norm": 0.31577005982398987, "learning_rate": 1.1717717914913496e-06, "loss": 0.7419, "step": 6237 }, { "epoch": 0.9532031936432747, "grad_norm": 0.29771387577056885, "learning_rate": 1.1641653537647456e-06, "loss": 0.6722, "step": 6238 }, { "epoch": 0.9533559995415823, "grad_norm": 0.27761638164520264, "learning_rate": 1.156583540156686e-06, "loss": 0.5968, "step": 6239 }, { "epoch": 0.95350880543989, "grad_norm": 0.2751348316669464, "learning_rate": 1.1490263525561373e-06, "loss": 0.6508, "step": 6240 }, { "epoch": 0.9536616113381976, "grad_norm": 0.245052307844162, "learning_rate": 1.1414937928458824e-06, "loss": 0.6157, "step": 6241 }, { "epoch": 0.9538144172365053, "grad_norm": 0.32120779156684875, "learning_rate": 1.133985862902598e-06, "loss": 0.7439, "step": 6242 }, { "epoch": 0.953967223134813, "grad_norm": 0.3647390305995941, "learning_rate": 1.1265025645968318e-06, "loss": 0.72, "step": 6243 }, { "epoch": 0.9541200290331207, "grad_norm": 0.30320584774017334, "learning_rate": 1.119043899792993e-06, "loss": 0.7782, "step": 6244 }, { "epoch": 0.9542728349314283, "grad_norm": 0.2579381763935089, "learning_rate": 1.1116098703493394e-06, "loss": 0.612, "step": 6245 }, { "epoch": 0.954425640829736, "grad_norm": 0.32816165685653687, "learning_rate": 1.1042004781179893e-06, "loss": 0.7938, "step": 6246 }, { "epoch": 0.9545784467280437, "grad_norm": 0.4004554748535156, "learning_rate": 1.096815724944922e-06, "loss": 0.836, "step": 6247 }, { "epoch": 0.9547312526263514, "grad_norm": 0.2599641978740692, "learning_rate": 1.0894556126700094e-06, "loss": 0.6191, "step": 6248 }, { "epoch": 0.9548840585246591, "grad_norm": 0.2757795453071594, "learning_rate": 1.0821201431269523e-06, "loss": 0.5368, "step": 6249 }, { "epoch": 0.9550368644229668, "grad_norm": 0.2588087022304535, "learning_rate": 1.0748093181433216e-06, "loss": 0.7106, "step": 6250 }, { "epoch": 0.9551896703212744, "grad_norm": 0.26044732332229614, "learning_rate": 1.0675231395405495e-06, "loss": 0.6474, "step": 6251 }, { "epoch": 0.9553424762195821, "grad_norm": 0.30654361844062805, "learning_rate": 1.0602616091339168e-06, "loss": 0.6999, "step": 6252 }, { "epoch": 0.9554952821178897, "grad_norm": 0.2802150845527649, "learning_rate": 1.0530247287325768e-06, "loss": 0.6875, "step": 6253 }, { "epoch": 0.9556480880161974, "grad_norm": 0.2953641712665558, "learning_rate": 1.0458125001395536e-06, "loss": 0.7029, "step": 6254 }, { "epoch": 0.9558008939145051, "grad_norm": 0.268854022026062, "learning_rate": 1.038624925151699e-06, "loss": 0.8357, "step": 6255 }, { "epoch": 0.9559536998128128, "grad_norm": 0.26503345370292664, "learning_rate": 1.0314620055597246e-06, "loss": 0.5589, "step": 6256 }, { "epoch": 0.9561065057111204, "grad_norm": 0.23971620202064514, "learning_rate": 1.0243237431482366e-06, "loss": 0.6528, "step": 6257 }, { "epoch": 0.9562593116094281, "grad_norm": 0.464169442653656, "learning_rate": 1.0172101396956567e-06, "loss": 0.8958, "step": 6258 }, { "epoch": 0.9564121175077358, "grad_norm": 0.3739687204360962, "learning_rate": 1.0101211969742896e-06, "loss": 0.8565, "step": 6259 }, { "epoch": 0.9565649234060435, "grad_norm": 0.3403286635875702, "learning_rate": 1.0030569167502778e-06, "loss": 0.5678, "step": 6260 }, { "epoch": 0.9567177293043512, "grad_norm": 0.2906002700328827, "learning_rate": 9.96017300783636e-07, "loss": 0.7359, "step": 6261 }, { "epoch": 0.9568705352026589, "grad_norm": 0.33831942081451416, "learning_rate": 9.890023508282166e-07, "loss": 0.7405, "step": 6262 }, { "epoch": 0.9570233411009665, "grad_norm": 0.2358943223953247, "learning_rate": 9.820120686317435e-07, "loss": 0.5598, "step": 6263 }, { "epoch": 0.9571761469992742, "grad_norm": 0.3636802136898041, "learning_rate": 9.750464559357686e-07, "loss": 0.8462, "step": 6264 }, { "epoch": 0.9573289528975818, "grad_norm": 0.27122461795806885, "learning_rate": 9.681055144757367e-07, "loss": 0.6452, "step": 6265 }, { "epoch": 0.9574817587958895, "grad_norm": 0.31347087025642395, "learning_rate": 9.611892459809201e-07, "loss": 0.566, "step": 6266 }, { "epoch": 0.9576345646941972, "grad_norm": 0.47410255670547485, "learning_rate": 9.542976521744518e-07, "loss": 0.6469, "step": 6267 }, { "epoch": 0.9577873705925048, "grad_norm": 0.2801438868045807, "learning_rate": 9.474307347733025e-07, "loss": 0.7822, "step": 6268 }, { "epoch": 0.9579401764908125, "grad_norm": 0.23874764144420624, "learning_rate": 9.405884954883148e-07, "loss": 0.5762, "step": 6269 }, { "epoch": 0.9580929823891202, "grad_norm": 0.5731150507926941, "learning_rate": 9.337709360241809e-07, "loss": 0.6621, "step": 6270 }, { "epoch": 0.9582457882874279, "grad_norm": 0.3229494094848633, "learning_rate": 9.269780580794307e-07, "loss": 0.5108, "step": 6271 }, { "epoch": 0.9583985941857356, "grad_norm": 0.27924874424934387, "learning_rate": 9.20209863346444e-07, "loss": 0.5841, "step": 6272 }, { "epoch": 0.9585514000840433, "grad_norm": 0.2501339018344879, "learning_rate": 9.134663535114829e-07, "loss": 0.7157, "step": 6273 }, { "epoch": 0.958704205982351, "grad_norm": 0.38578376173973083, "learning_rate": 9.067475302546147e-07, "loss": 0.782, "step": 6274 }, { "epoch": 0.9588570118806586, "grad_norm": 0.2761724293231964, "learning_rate": 9.000533952497892e-07, "loss": 0.9587, "step": 6275 }, { "epoch": 0.9590098177789663, "grad_norm": 0.3226098120212555, "learning_rate": 8.933839501647945e-07, "loss": 0.7157, "step": 6276 }, { "epoch": 0.9591626236772739, "grad_norm": 0.28342607617378235, "learning_rate": 8.86739196661257e-07, "loss": 0.6777, "step": 6277 }, { "epoch": 0.9593154295755816, "grad_norm": 0.27868959307670593, "learning_rate": 8.801191363946748e-07, "loss": 0.7985, "step": 6278 }, { "epoch": 0.9594682354738893, "grad_norm": 0.29065823554992676, "learning_rate": 8.735237710143618e-07, "loss": 0.6976, "step": 6279 }, { "epoch": 0.959621041372197, "grad_norm": 0.32618793845176697, "learning_rate": 8.669531021635258e-07, "loss": 0.6947, "step": 6280 }, { "epoch": 0.9597738472705046, "grad_norm": 0.4600655436515808, "learning_rate": 8.604071314791684e-07, "loss": 0.724, "step": 6281 }, { "epoch": 0.9599266531688123, "grad_norm": 0.27274805307388306, "learning_rate": 8.53885860592174e-07, "loss": 0.7377, "step": 6282 }, { "epoch": 0.96007945906712, "grad_norm": 0.36312827467918396, "learning_rate": 8.47389291127254e-07, "loss": 0.7004, "step": 6283 }, { "epoch": 0.9602322649654277, "grad_norm": 0.286912739276886, "learning_rate": 8.409174247029916e-07, "loss": 0.6783, "step": 6284 }, { "epoch": 0.9603850708637354, "grad_norm": 0.3135521709918976, "learning_rate": 8.344702629317857e-07, "loss": 0.7232, "step": 6285 }, { "epoch": 0.960537876762043, "grad_norm": 0.2688337564468384, "learning_rate": 8.28047807419885e-07, "loss": 0.612, "step": 6286 }, { "epoch": 0.9606906826603507, "grad_norm": 0.31565752625465393, "learning_rate": 8.216500597674093e-07, "loss": 0.8034, "step": 6287 }, { "epoch": 0.9608434885586583, "grad_norm": 0.257478266954422, "learning_rate": 8.152770215682836e-07, "loss": 0.6048, "step": 6288 }, { "epoch": 0.960996294456966, "grad_norm": 0.2403869926929474, "learning_rate": 8.089286944103158e-07, "loss": 0.6884, "step": 6289 }, { "epoch": 0.9611491003552737, "grad_norm": 0.4128204584121704, "learning_rate": 8.026050798751294e-07, "loss": 0.8223, "step": 6290 }, { "epoch": 0.9613019062535814, "grad_norm": 0.2880534529685974, "learning_rate": 7.963061795381976e-07, "loss": 0.687, "step": 6291 }, { "epoch": 0.961454712151889, "grad_norm": 0.31626081466674805, "learning_rate": 7.900319949688428e-07, "loss": 0.4903, "step": 6292 }, { "epoch": 0.9616075180501967, "grad_norm": 0.23981697857379913, "learning_rate": 7.837825277302258e-07, "loss": 0.7501, "step": 6293 }, { "epoch": 0.9617603239485044, "grad_norm": 0.34683600068092346, "learning_rate": 7.775577793793454e-07, "loss": 0.8155, "step": 6294 }, { "epoch": 0.9619131298468121, "grad_norm": 0.291105717420578, "learning_rate": 7.7135775146705e-07, "loss": 0.707, "step": 6295 }, { "epoch": 0.9620659357451198, "grad_norm": 0.38474923372268677, "learning_rate": 7.651824455380153e-07, "loss": 0.8183, "step": 6296 }, { "epoch": 0.9622187416434275, "grad_norm": 0.2873779833316803, "learning_rate": 7.590318631307769e-07, "loss": 0.5921, "step": 6297 }, { "epoch": 0.9623715475417352, "grad_norm": 0.3031384348869324, "learning_rate": 7.529060057776982e-07, "loss": 0.8043, "step": 6298 }, { "epoch": 0.9625243534400428, "grad_norm": 0.27383852005004883, "learning_rate": 7.468048750049694e-07, "loss": 0.5811, "step": 6299 }, { "epoch": 0.9626771593383504, "grad_norm": 0.2807175815105438, "learning_rate": 7.407284723326635e-07, "loss": 0.6625, "step": 6300 }, { "epoch": 0.9628299652366581, "grad_norm": 0.31852301955223083, "learning_rate": 7.346767992746584e-07, "loss": 0.6023, "step": 6301 }, { "epoch": 0.9629827711349658, "grad_norm": 0.27150818705558777, "learning_rate": 7.286498573386591e-07, "loss": 0.7008, "step": 6302 }, { "epoch": 0.9631355770332735, "grad_norm": 0.3553084433078766, "learning_rate": 7.226476480262423e-07, "loss": 0.842, "step": 6303 }, { "epoch": 0.9632883829315811, "grad_norm": 0.2782277762889862, "learning_rate": 7.166701728328118e-07, "loss": 0.7429, "step": 6304 }, { "epoch": 0.9634411888298888, "grad_norm": 0.3005227744579315, "learning_rate": 7.107174332475986e-07, "loss": 0.6681, "step": 6305 }, { "epoch": 0.9635939947281965, "grad_norm": 0.2721962332725525, "learning_rate": 7.047894307536718e-07, "loss": 0.6964, "step": 6306 }, { "epoch": 0.9637468006265042, "grad_norm": 0.2653356194496155, "learning_rate": 6.9888616682795e-07, "loss": 0.7553, "step": 6307 }, { "epoch": 0.9638996065248119, "grad_norm": 0.39718544483184814, "learning_rate": 6.930076429411902e-07, "loss": 0.5233, "step": 6308 }, { "epoch": 0.9640524124231196, "grad_norm": 0.2432132214307785, "learning_rate": 6.871538605579653e-07, "loss": 0.9123, "step": 6309 }, { "epoch": 0.9642052183214272, "grad_norm": 0.2763080596923828, "learning_rate": 6.813248211366973e-07, "loss": 0.6741, "step": 6310 }, { "epoch": 0.9643580242197349, "grad_norm": 0.3247620165348053, "learning_rate": 6.755205261296471e-07, "loss": 0.8244, "step": 6311 }, { "epoch": 0.9645108301180425, "grad_norm": 0.2837117910385132, "learning_rate": 6.697409769829132e-07, "loss": 0.5683, "step": 6312 }, { "epoch": 0.9646636360163502, "grad_norm": 0.2593044638633728, "learning_rate": 6.639861751363996e-07, "loss": 0.6678, "step": 6313 }, { "epoch": 0.9648164419146579, "grad_norm": 0.2585027813911438, "learning_rate": 6.582561220238814e-07, "loss": 0.7211, "step": 6314 }, { "epoch": 0.9649692478129656, "grad_norm": 0.27481648325920105, "learning_rate": 6.525508190729501e-07, "loss": 0.6615, "step": 6315 }, { "epoch": 0.9651220537112732, "grad_norm": 0.28525200486183167, "learning_rate": 6.468702677050464e-07, "loss": 0.6796, "step": 6316 }, { "epoch": 0.9652748596095809, "grad_norm": 0.39430657029151917, "learning_rate": 6.41214469335405e-07, "loss": 0.7448, "step": 6317 }, { "epoch": 0.9654276655078886, "grad_norm": 0.31355559825897217, "learning_rate": 6.35583425373143e-07, "loss": 0.8168, "step": 6318 }, { "epoch": 0.9655804714061963, "grad_norm": 0.2974868416786194, "learning_rate": 6.299771372211937e-07, "loss": 0.6698, "step": 6319 }, { "epoch": 0.965733277304504, "grad_norm": 0.2556328773498535, "learning_rate": 6.243956062762956e-07, "loss": 0.6631, "step": 6320 }, { "epoch": 0.9658860832028117, "grad_norm": 0.3320145308971405, "learning_rate": 6.188388339290474e-07, "loss": 0.6777, "step": 6321 }, { "epoch": 0.9660388891011193, "grad_norm": 0.3326362073421478, "learning_rate": 6.133068215638749e-07, "loss": 0.7277, "step": 6322 }, { "epoch": 0.966191694999427, "grad_norm": 0.3622719347476959, "learning_rate": 6.077995705590311e-07, "loss": 0.8761, "step": 6323 }, { "epoch": 0.9663445008977346, "grad_norm": 0.32251179218292236, "learning_rate": 6.023170822866075e-07, "loss": 0.7356, "step": 6324 }, { "epoch": 0.9664973067960423, "grad_norm": 0.2953559458255768, "learning_rate": 5.968593581125004e-07, "loss": 0.6952, "step": 6325 }, { "epoch": 0.96665011269435, "grad_norm": 0.29557231068611145, "learning_rate": 5.914263993964886e-07, "loss": 0.5888, "step": 6326 }, { "epoch": 0.9668029185926577, "grad_norm": 0.3225257992744446, "learning_rate": 5.860182074921117e-07, "loss": 0.743, "step": 6327 }, { "epoch": 0.9669557244909653, "grad_norm": 0.283965528011322, "learning_rate": 5.806347837468029e-07, "loss": 0.8457, "step": 6328 }, { "epoch": 0.967108530389273, "grad_norm": 0.3201238811016083, "learning_rate": 5.752761295017895e-07, "loss": 0.6668, "step": 6329 }, { "epoch": 0.9672613362875807, "grad_norm": 0.2973778247833252, "learning_rate": 5.699422460921255e-07, "loss": 0.8359, "step": 6330 }, { "epoch": 0.9674141421858884, "grad_norm": 0.24163559079170227, "learning_rate": 5.646331348467149e-07, "loss": 0.708, "step": 6331 }, { "epoch": 0.9675669480841961, "grad_norm": 0.2467086911201477, "learning_rate": 5.593487970882771e-07, "loss": 0.7813, "step": 6332 }, { "epoch": 0.9677197539825038, "grad_norm": 0.2962093651294708, "learning_rate": 5.540892341333592e-07, "loss": 0.6654, "step": 6333 }, { "epoch": 0.9678725598808114, "grad_norm": 0.28548145294189453, "learning_rate": 5.488544472923241e-07, "loss": 0.5871, "step": 6334 }, { "epoch": 0.9680253657791191, "grad_norm": 0.29998165369033813, "learning_rate": 5.436444378693951e-07, "loss": 0.705, "step": 6335 }, { "epoch": 0.9681781716774267, "grad_norm": 0.29790687561035156, "learning_rate": 5.384592071625894e-07, "loss": 1.0276, "step": 6336 }, { "epoch": 0.9683309775757344, "grad_norm": 0.2657981812953949, "learning_rate": 5.332987564637737e-07, "loss": 0.7289, "step": 6337 }, { "epoch": 0.9684837834740421, "grad_norm": 0.3056153655052185, "learning_rate": 5.281630870586196e-07, "loss": 0.7339, "step": 6338 }, { "epoch": 0.9686365893723498, "grad_norm": 0.2564420700073242, "learning_rate": 5.230522002266481e-07, "loss": 0.8466, "step": 6339 }, { "epoch": 0.9687893952706574, "grad_norm": 0.27843570709228516, "learning_rate": 5.179660972411848e-07, "loss": 0.6875, "step": 6340 }, { "epoch": 0.9689422011689651, "grad_norm": 0.35951921343803406, "learning_rate": 5.129047793693831e-07, "loss": 0.6167, "step": 6341 }, { "epoch": 0.9690950070672728, "grad_norm": 0.2623217701911926, "learning_rate": 5.078682478722451e-07, "loss": 0.5105, "step": 6342 }, { "epoch": 0.9692478129655805, "grad_norm": 0.2455863207578659, "learning_rate": 5.028565040045674e-07, "loss": 0.5531, "step": 6343 }, { "epoch": 0.9694006188638882, "grad_norm": 0.28337204456329346, "learning_rate": 4.978695490149953e-07, "loss": 0.7003, "step": 6344 }, { "epoch": 0.9695534247621959, "grad_norm": 0.29444316029548645, "learning_rate": 4.929073841459686e-07, "loss": 0.7188, "step": 6345 }, { "epoch": 0.9697062306605035, "grad_norm": 0.26820212602615356, "learning_rate": 4.879700106337981e-07, "loss": 0.6843, "step": 6346 }, { "epoch": 0.9698590365588111, "grad_norm": 0.2960645854473114, "learning_rate": 4.830574297085555e-07, "loss": 0.67, "step": 6347 }, { "epoch": 0.9700118424571188, "grad_norm": 0.25855451822280884, "learning_rate": 4.78169642594195e-07, "loss": 0.5746, "step": 6348 }, { "epoch": 0.9701646483554265, "grad_norm": 0.2733268141746521, "learning_rate": 4.733066505084427e-07, "loss": 0.6331, "step": 6349 }, { "epoch": 0.9703174542537342, "grad_norm": 0.38543620705604553, "learning_rate": 4.68468454662907e-07, "loss": 0.8998, "step": 6350 }, { "epoch": 0.9704702601520419, "grad_norm": 0.2834281325340271, "learning_rate": 4.636550562629571e-07, "loss": 0.6405, "step": 6351 }, { "epoch": 0.9706230660503495, "grad_norm": 0.3168293833732605, "learning_rate": 4.588664565078116e-07, "loss": 1.0936, "step": 6352 }, { "epoch": 0.9707758719486572, "grad_norm": 0.32848724722862244, "learning_rate": 4.54102656590516e-07, "loss": 0.742, "step": 6353 }, { "epoch": 0.9709286778469649, "grad_norm": 0.3079994022846222, "learning_rate": 4.493636576979321e-07, "loss": 0.5539, "step": 6354 }, { "epoch": 0.9710814837452726, "grad_norm": 0.3098090887069702, "learning_rate": 4.446494610107488e-07, "loss": 0.5675, "step": 6355 }, { "epoch": 0.9712342896435803, "grad_norm": 0.2650286555290222, "learning_rate": 4.399600677034488e-07, "loss": 0.6844, "step": 6356 }, { "epoch": 0.971387095541888, "grad_norm": 0.269327312707901, "learning_rate": 4.352954789443753e-07, "loss": 0.7365, "step": 6357 }, { "epoch": 0.9715399014401956, "grad_norm": 0.25867950916290283, "learning_rate": 4.3065569589565425e-07, "loss": 0.6923, "step": 6358 }, { "epoch": 0.9716927073385032, "grad_norm": 0.259370893239975, "learning_rate": 4.260407197132721e-07, "loss": 0.5959, "step": 6359 }, { "epoch": 0.9718455132368109, "grad_norm": 0.3415398895740509, "learning_rate": 4.2145055154697575e-07, "loss": 0.8221, "step": 6360 }, { "epoch": 0.9719983191351186, "grad_norm": 0.29200610518455505, "learning_rate": 4.16885192540406e-07, "loss": 0.61, "step": 6361 }, { "epoch": 0.9721511250334263, "grad_norm": 0.28360190987586975, "learning_rate": 4.1234464383095304e-07, "loss": 0.67, "step": 6362 }, { "epoch": 0.972303930931734, "grad_norm": 0.2954583764076233, "learning_rate": 4.078289065498786e-07, "loss": 0.7833, "step": 6363 }, { "epoch": 0.9724567368300416, "grad_norm": 0.4153759479522705, "learning_rate": 4.0333798182222716e-07, "loss": 0.6991, "step": 6364 }, { "epoch": 0.9726095427283493, "grad_norm": 0.3050250709056854, "learning_rate": 3.988718707668815e-07, "loss": 0.7836, "step": 6365 }, { "epoch": 0.972762348626657, "grad_norm": 0.28519800305366516, "learning_rate": 3.944305744965293e-07, "loss": 0.8232, "step": 6366 }, { "epoch": 0.9729151545249647, "grad_norm": 0.2635805606842041, "learning_rate": 3.900140941176855e-07, "loss": 0.7258, "step": 6367 }, { "epoch": 0.9730679604232724, "grad_norm": 0.3375990390777588, "learning_rate": 3.8562243073068107e-07, "loss": 0.8841, "step": 6368 }, { "epoch": 0.97322076632158, "grad_norm": 0.25449639558792114, "learning_rate": 3.812555854296629e-07, "loss": 0.759, "step": 6369 }, { "epoch": 0.9733735722198877, "grad_norm": 0.2887367308139801, "learning_rate": 3.769135593025941e-07, "loss": 0.7091, "step": 6370 }, { "epoch": 0.9735263781181953, "grad_norm": 0.28508010506629944, "learning_rate": 3.725963534312427e-07, "loss": 0.7535, "step": 6371 }, { "epoch": 0.973679184016503, "grad_norm": 0.32648003101348877, "learning_rate": 3.6830396889122597e-07, "loss": 0.8433, "step": 6372 }, { "epoch": 0.9738319899148107, "grad_norm": 0.35597503185272217, "learning_rate": 3.6403640675193307e-07, "loss": 0.5688, "step": 6373 }, { "epoch": 0.9739847958131184, "grad_norm": 0.42400041222572327, "learning_rate": 3.597936680766023e-07, "loss": 0.7087, "step": 6374 }, { "epoch": 0.974137601711426, "grad_norm": 0.33522462844848633, "learning_rate": 3.5557575392226595e-07, "loss": 0.7737, "step": 6375 }, { "epoch": 0.9742904076097337, "grad_norm": 0.32269105315208435, "learning_rate": 3.513826653398056e-07, "loss": 0.5236, "step": 6376 }, { "epoch": 0.9744432135080414, "grad_norm": 0.27982404828071594, "learning_rate": 3.4721440337387445e-07, "loss": 0.5924, "step": 6377 }, { "epoch": 0.9745960194063491, "grad_norm": 0.35985851287841797, "learning_rate": 3.430709690629641e-07, "loss": 0.7741, "step": 6378 }, { "epoch": 0.9747488253046568, "grad_norm": 0.3565606474876404, "learning_rate": 3.3895236343937097e-07, "loss": 0.705, "step": 6379 }, { "epoch": 0.9749016312029645, "grad_norm": 0.2602279484272003, "learning_rate": 3.348585875292298e-07, "loss": 0.6135, "step": 6380 }, { "epoch": 0.9750544371012722, "grad_norm": 0.3423800766468048, "learning_rate": 3.307896423524581e-07, "loss": 1.0398, "step": 6381 }, { "epoch": 0.9752072429995798, "grad_norm": 0.2850226163864136, "learning_rate": 3.267455289227894e-07, "loss": 0.7439, "step": 6382 }, { "epoch": 0.9753600488978874, "grad_norm": 0.3209698498249054, "learning_rate": 3.227262482477955e-07, "loss": 0.6061, "step": 6383 }, { "epoch": 0.9755128547961951, "grad_norm": 0.35197779536247253, "learning_rate": 3.187318013288421e-07, "loss": 0.684, "step": 6384 }, { "epoch": 0.9756656606945028, "grad_norm": 0.3414282500743866, "learning_rate": 3.147621891611108e-07, "loss": 0.7432, "step": 6385 }, { "epoch": 0.9758184665928105, "grad_norm": 0.28099194169044495, "learning_rate": 3.1081741273358835e-07, "loss": 0.6576, "step": 6386 }, { "epoch": 0.9759712724911181, "grad_norm": 0.2589315176010132, "learning_rate": 3.0689747302911074e-07, "loss": 0.8223, "step": 6387 }, { "epoch": 0.9761240783894258, "grad_norm": 0.3601363003253937, "learning_rate": 3.0300237102426355e-07, "loss": 0.8068, "step": 6388 }, { "epoch": 0.9762768842877335, "grad_norm": 0.3021461069583893, "learning_rate": 2.9913210768950374e-07, "loss": 0.6874, "step": 6389 }, { "epoch": 0.9764296901860412, "grad_norm": 0.2406938225030899, "learning_rate": 2.952866839890711e-07, "loss": 0.4772, "step": 6390 }, { "epoch": 0.9765824960843489, "grad_norm": 0.26343291997909546, "learning_rate": 2.9146610088099933e-07, "loss": 0.6531, "step": 6391 }, { "epoch": 0.9767353019826566, "grad_norm": 0.30205318331718445, "learning_rate": 2.8767035931718256e-07, "loss": 0.7255, "step": 6392 }, { "epoch": 0.9768881078809643, "grad_norm": 0.3833494186401367, "learning_rate": 2.838994602432865e-07, "loss": 0.498, "step": 6393 }, { "epoch": 0.9770409137792718, "grad_norm": 0.41331061720848083, "learning_rate": 2.8015340459879304e-07, "loss": 0.6305, "step": 6394 }, { "epoch": 0.9771937196775795, "grad_norm": 0.32242459058761597, "learning_rate": 2.764321933170111e-07, "loss": 0.7668, "step": 6395 }, { "epoch": 0.9773465255758872, "grad_norm": 0.2696183919906616, "learning_rate": 2.727358273250324e-07, "loss": 0.7781, "step": 6396 }, { "epoch": 0.9774993314741949, "grad_norm": 0.3143042325973511, "learning_rate": 2.690643075437982e-07, "loss": 0.5344, "step": 6397 }, { "epoch": 0.9776521373725026, "grad_norm": 0.2705305218696594, "learning_rate": 2.654176348880322e-07, "loss": 0.6465, "step": 6398 }, { "epoch": 0.9778049432708102, "grad_norm": 0.2748562693595886, "learning_rate": 2.617958102662521e-07, "loss": 0.6275, "step": 6399 }, { "epoch": 0.9779577491691179, "grad_norm": 0.27969980239868164, "learning_rate": 2.581988345808251e-07, "loss": 0.8317, "step": 6400 }, { "epoch": 0.9781105550674256, "grad_norm": 0.3142959773540497, "learning_rate": 2.5462670872790085e-07, "loss": 0.856, "step": 6401 }, { "epoch": 0.9782633609657333, "grad_norm": 0.37164634466171265, "learning_rate": 2.510794335974453e-07, "loss": 0.5511, "step": 6402 }, { "epoch": 0.978416166864041, "grad_norm": 0.28552374243736267, "learning_rate": 2.475570100732405e-07, "loss": 0.648, "step": 6403 }, { "epoch": 0.9785689727623487, "grad_norm": 0.3474300503730774, "learning_rate": 2.44059439032851e-07, "loss": 0.711, "step": 6404 }, { "epoch": 0.9787217786606563, "grad_norm": 0.2783917188644409, "learning_rate": 2.405867213476798e-07, "loss": 0.578, "step": 6405 }, { "epoch": 0.9788745845589639, "grad_norm": 0.32661694288253784, "learning_rate": 2.3713885788291258e-07, "loss": 0.5063, "step": 6406 }, { "epoch": 0.9790273904572716, "grad_norm": 0.2747705280780792, "learning_rate": 2.3371584949757331e-07, "loss": 0.6251, "step": 6407 }, { "epoch": 0.9791801963555793, "grad_norm": 0.2549538016319275, "learning_rate": 2.303176970444687e-07, "loss": 0.8576, "step": 6408 }, { "epoch": 0.979333002253887, "grad_norm": 0.35974299907684326, "learning_rate": 2.2694440137022155e-07, "loss": 0.8619, "step": 6409 }, { "epoch": 0.9794858081521947, "grad_norm": 0.26300784945487976, "learning_rate": 2.2359596331524847e-07, "loss": 0.6533, "step": 6410 }, { "epoch": 0.9796386140505023, "grad_norm": 0.24799039959907532, "learning_rate": 2.2027238371380431e-07, "loss": 0.5962, "step": 6411 }, { "epoch": 0.97979141994881, "grad_norm": 0.27073585987091064, "learning_rate": 2.1697366339391568e-07, "loss": 0.7697, "step": 6412 }, { "epoch": 0.9799442258471177, "grad_norm": 0.24513952434062958, "learning_rate": 2.136998031774362e-07, "loss": 0.5483, "step": 6413 }, { "epoch": 0.9800970317454254, "grad_norm": 0.30561357736587524, "learning_rate": 2.1045080388001348e-07, "loss": 0.7539, "step": 6414 }, { "epoch": 0.9802498376437331, "grad_norm": 0.4293336570262909, "learning_rate": 2.072266663111222e-07, "loss": 0.6453, "step": 6415 }, { "epoch": 0.9804026435420408, "grad_norm": 0.3037967383861542, "learning_rate": 2.040273912740198e-07, "loss": 0.6495, "step": 6416 }, { "epoch": 0.9805554494403484, "grad_norm": 0.24957461655139923, "learning_rate": 2.0085297956577987e-07, "loss": 0.5915, "step": 6417 }, { "epoch": 0.980708255338656, "grad_norm": 0.2533586621284485, "learning_rate": 1.977034319772919e-07, "loss": 0.5522, "step": 6418 }, { "epoch": 0.9808610612369637, "grad_norm": 0.36297425627708435, "learning_rate": 1.9457874929321718e-07, "loss": 0.8412, "step": 6419 }, { "epoch": 0.9810138671352714, "grad_norm": 0.2810041308403015, "learning_rate": 1.9147893229206626e-07, "loss": 0.6873, "step": 6420 }, { "epoch": 0.9811666730335791, "grad_norm": 0.2846044898033142, "learning_rate": 1.884039817461103e-07, "loss": 0.7492, "step": 6421 }, { "epoch": 0.9813194789318868, "grad_norm": 0.2690313756465912, "learning_rate": 1.8535389842146978e-07, "loss": 0.7325, "step": 6422 }, { "epoch": 0.9814722848301944, "grad_norm": 0.3626621663570404, "learning_rate": 1.8232868307802574e-07, "loss": 0.6484, "step": 6423 }, { "epoch": 0.9816250907285021, "grad_norm": 0.2992866337299347, "learning_rate": 1.7932833646950865e-07, "loss": 0.6384, "step": 6424 }, { "epoch": 0.9817778966268098, "grad_norm": 0.25797179341316223, "learning_rate": 1.763528593434094e-07, "loss": 0.6844, "step": 6425 }, { "epoch": 0.9819307025251175, "grad_norm": 1.2198379039764404, "learning_rate": 1.7340225244105722e-07, "loss": 0.5816, "step": 6426 }, { "epoch": 0.9820835084234252, "grad_norm": 0.27036669850349426, "learning_rate": 1.7047651649756414e-07, "loss": 0.8355, "step": 6427 }, { "epoch": 0.9822363143217329, "grad_norm": 0.3024647831916809, "learning_rate": 1.6757565224184702e-07, "loss": 0.5679, "step": 6428 }, { "epoch": 0.9823891202200405, "grad_norm": 0.3333212733268738, "learning_rate": 1.6469966039664996e-07, "loss": 0.5932, "step": 6429 }, { "epoch": 0.9825419261183481, "grad_norm": 0.32781001925468445, "learning_rate": 1.6184854167847764e-07, "loss": 0.8297, "step": 6430 }, { "epoch": 0.9826947320166558, "grad_norm": 0.2812676429748535, "learning_rate": 1.5902229679768398e-07, "loss": 0.5423, "step": 6431 }, { "epoch": 0.9828475379149635, "grad_norm": 0.39646944403648376, "learning_rate": 1.562209264583836e-07, "loss": 0.6035, "step": 6432 }, { "epoch": 0.9830003438132712, "grad_norm": 0.32472583651542664, "learning_rate": 1.5344443135854037e-07, "loss": 0.6549, "step": 6433 }, { "epoch": 0.9831531497115789, "grad_norm": 0.2801920771598816, "learning_rate": 1.5069281218987873e-07, "loss": 0.7028, "step": 6434 }, { "epoch": 0.9833059556098865, "grad_norm": 0.2855030596256256, "learning_rate": 1.4796606963793924e-07, "loss": 0.6612, "step": 6435 }, { "epoch": 0.9834587615081942, "grad_norm": 0.260616272687912, "learning_rate": 1.4526420438207845e-07, "loss": 0.6242, "step": 6436 }, { "epoch": 0.9836115674065019, "grad_norm": 0.2544775605201721, "learning_rate": 1.4258721709542456e-07, "loss": 0.6208, "step": 6437 }, { "epoch": 0.9837643733048096, "grad_norm": 0.29562172293663025, "learning_rate": 1.3993510844494406e-07, "loss": 0.6581, "step": 6438 }, { "epoch": 0.9839171792031173, "grad_norm": 0.3022526204586029, "learning_rate": 1.373078790913862e-07, "loss": 0.5913, "step": 6439 }, { "epoch": 0.984069985101425, "grad_norm": 0.28804531693458557, "learning_rate": 1.3470552968929405e-07, "loss": 0.587, "step": 6440 }, { "epoch": 0.9842227909997326, "grad_norm": 0.29857340455055237, "learning_rate": 1.3212806088702678e-07, "loss": 0.7566, "step": 6441 }, { "epoch": 0.9843755968980402, "grad_norm": 0.25568607449531555, "learning_rate": 1.2957547332673736e-07, "loss": 0.5728, "step": 6442 }, { "epoch": 0.9845284027963479, "grad_norm": 0.2965342700481415, "learning_rate": 1.2704776764438374e-07, "loss": 0.7538, "step": 6443 }, { "epoch": 0.9846812086946556, "grad_norm": 0.30493324995040894, "learning_rate": 1.2454494446971777e-07, "loss": 0.9407, "step": 6444 }, { "epoch": 0.9848340145929633, "grad_norm": 0.3716253936290741, "learning_rate": 1.2206700442629616e-07, "loss": 0.7945, "step": 6445 }, { "epoch": 0.984986820491271, "grad_norm": 0.310250848531723, "learning_rate": 1.1961394813149173e-07, "loss": 0.7811, "step": 6446 }, { "epoch": 0.9851396263895786, "grad_norm": 0.40353232622146606, "learning_rate": 1.171857761964379e-07, "loss": 0.7651, "step": 6447 }, { "epoch": 0.9852924322878863, "grad_norm": 0.4181149899959564, "learning_rate": 1.1478248922611734e-07, "loss": 0.6196, "step": 6448 }, { "epoch": 0.985445238186194, "grad_norm": 0.2948823869228363, "learning_rate": 1.1240408781927336e-07, "loss": 0.6426, "step": 6449 }, { "epoch": 0.9855980440845017, "grad_norm": 0.2704068124294281, "learning_rate": 1.100505725684764e-07, "loss": 0.6998, "step": 6450 }, { "epoch": 0.9857508499828094, "grad_norm": 0.30879390239715576, "learning_rate": 1.0772194406007962e-07, "loss": 0.5366, "step": 6451 }, { "epoch": 0.9859036558811171, "grad_norm": 0.3139268159866333, "learning_rate": 1.0541820287423009e-07, "loss": 0.6669, "step": 6452 }, { "epoch": 0.9860564617794246, "grad_norm": 0.26715102791786194, "learning_rate": 1.0313934958490201e-07, "loss": 0.5881, "step": 6453 }, { "epoch": 0.9862092676777323, "grad_norm": 0.2639731764793396, "learning_rate": 1.0088538475985232e-07, "loss": 0.7364, "step": 6454 }, { "epoch": 0.98636207357604, "grad_norm": 0.293069452047348, "learning_rate": 9.865630896062073e-08, "loss": 0.6151, "step": 6455 }, { "epoch": 0.9865148794743477, "grad_norm": 0.27687135338783264, "learning_rate": 9.645212274257409e-08, "loss": 0.5804, "step": 6456 }, { "epoch": 0.9866676853726554, "grad_norm": 0.251794695854187, "learning_rate": 9.427282665487314e-08, "loss": 0.7532, "step": 6457 }, { "epoch": 0.986820491270963, "grad_norm": 0.4326778054237366, "learning_rate": 9.211842124046132e-08, "loss": 0.7069, "step": 6458 }, { "epoch": 0.9869732971692707, "grad_norm": 0.2878977358341217, "learning_rate": 8.99889070360982e-08, "loss": 0.6604, "step": 6459 }, { "epoch": 0.9871261030675784, "grad_norm": 0.40638232231140137, "learning_rate": 8.788428457232601e-08, "loss": 0.7524, "step": 6460 }, { "epoch": 0.9872789089658861, "grad_norm": 0.6233261823654175, "learning_rate": 8.58045543735031e-08, "loss": 0.5427, "step": 6461 }, { "epoch": 0.9874317148641938, "grad_norm": 0.3426532745361328, "learning_rate": 8.374971695775946e-08, "loss": 0.6793, "step": 6462 }, { "epoch": 0.9875845207625015, "grad_norm": 0.3367740213871002, "learning_rate": 8.171977283706333e-08, "loss": 0.6241, "step": 6463 }, { "epoch": 0.9877373266608092, "grad_norm": 0.30172792077064514, "learning_rate": 7.971472251714352e-08, "loss": 0.7202, "step": 6464 }, { "epoch": 0.9878901325591167, "grad_norm": 0.27091774344444275, "learning_rate": 7.773456649754485e-08, "loss": 0.5109, "step": 6465 }, { "epoch": 0.9880429384574244, "grad_norm": 0.30555298924446106, "learning_rate": 7.577930527160604e-08, "loss": 0.6339, "step": 6466 }, { "epoch": 0.9881957443557321, "grad_norm": 0.34500110149383545, "learning_rate": 7.384893932645965e-08, "loss": 0.9091, "step": 6467 }, { "epoch": 0.9883485502540398, "grad_norm": 0.28037339448928833, "learning_rate": 7.194346914305427e-08, "loss": 0.7315, "step": 6468 }, { "epoch": 0.9885013561523475, "grad_norm": 0.3640865385532379, "learning_rate": 7.00628951961102e-08, "loss": 0.6326, "step": 6469 }, { "epoch": 0.9886541620506551, "grad_norm": 0.399759441614151, "learning_rate": 6.820721795416373e-08, "loss": 0.7696, "step": 6470 }, { "epoch": 0.9888069679489628, "grad_norm": 0.3892582356929779, "learning_rate": 6.637643787953395e-08, "loss": 0.6551, "step": 6471 }, { "epoch": 0.9889597738472705, "grad_norm": 0.25890570878982544, "learning_rate": 6.45705554283449e-08, "loss": 0.7363, "step": 6472 }, { "epoch": 0.9891125797455782, "grad_norm": 0.2983168065547943, "learning_rate": 6.278957105052552e-08, "loss": 0.5663, "step": 6473 }, { "epoch": 0.9892653856438859, "grad_norm": 0.27841824293136597, "learning_rate": 6.103348518978758e-08, "loss": 0.6055, "step": 6474 }, { "epoch": 0.9894181915421936, "grad_norm": 0.3425733745098114, "learning_rate": 5.9302298283636645e-08, "loss": 0.8392, "step": 6475 }, { "epoch": 0.9895709974405013, "grad_norm": 0.27444878220558167, "learning_rate": 5.7596010763394384e-08, "loss": 0.889, "step": 6476 }, { "epoch": 0.9897238033388088, "grad_norm": 0.2911641001701355, "learning_rate": 5.591462305416517e-08, "loss": 0.7882, "step": 6477 }, { "epoch": 0.9898766092371165, "grad_norm": 0.2616746127605438, "learning_rate": 5.4258135574858373e-08, "loss": 0.6209, "step": 6478 }, { "epoch": 0.9900294151354242, "grad_norm": 0.3651047348976135, "learning_rate": 5.262654873816608e-08, "loss": 0.6955, "step": 6479 }, { "epoch": 0.9901822210337319, "grad_norm": 0.3302326202392578, "learning_rate": 5.1019862950585364e-08, "loss": 0.765, "step": 6480 }, { "epoch": 0.9903350269320396, "grad_norm": 1.1409937143325806, "learning_rate": 4.9438078612407124e-08, "loss": 0.7911, "step": 6481 }, { "epoch": 0.9904878328303472, "grad_norm": 0.2777949571609497, "learning_rate": 4.7881196117727237e-08, "loss": 0.746, "step": 6482 }, { "epoch": 0.9906406387286549, "grad_norm": 0.2922649681568146, "learning_rate": 4.634921585442431e-08, "loss": 0.5891, "step": 6483 }, { "epoch": 0.9907934446269626, "grad_norm": 0.38926783204078674, "learning_rate": 4.484213820417082e-08, "loss": 0.6729, "step": 6484 }, { "epoch": 0.9909462505252703, "grad_norm": 0.29072585701942444, "learning_rate": 4.335996354245531e-08, "loss": 0.6274, "step": 6485 }, { "epoch": 0.991099056423578, "grad_norm": 0.2570950388908386, "learning_rate": 4.190269223854904e-08, "loss": 0.6373, "step": 6486 }, { "epoch": 0.9912518623218857, "grad_norm": 0.29220324754714966, "learning_rate": 4.047032465550604e-08, "loss": 0.6388, "step": 6487 }, { "epoch": 0.9914046682201934, "grad_norm": 0.3268110752105713, "learning_rate": 3.906286115020752e-08, "loss": 0.811, "step": 6488 }, { "epoch": 0.9915574741185009, "grad_norm": 0.402190625667572, "learning_rate": 3.7680302073295204e-08, "loss": 0.7488, "step": 6489 }, { "epoch": 0.9917102800168086, "grad_norm": 0.3102878928184509, "learning_rate": 3.632264776922689e-08, "loss": 0.5882, "step": 6490 }, { "epoch": 0.9918630859151163, "grad_norm": 0.3195332884788513, "learning_rate": 3.4989898576254234e-08, "loss": 0.6679, "step": 6491 }, { "epoch": 0.992015891813424, "grad_norm": 0.3438659608364105, "learning_rate": 3.3682054826411627e-08, "loss": 0.6346, "step": 6492 }, { "epoch": 0.9921686977117317, "grad_norm": 0.3308936655521393, "learning_rate": 3.239911684554953e-08, "loss": 0.6171, "step": 6493 }, { "epoch": 0.9923215036100393, "grad_norm": 0.25023937225341797, "learning_rate": 3.114108495329004e-08, "loss": 0.6195, "step": 6494 }, { "epoch": 0.992474309508347, "grad_norm": 0.3215552568435669, "learning_rate": 2.9907959463071346e-08, "loss": 0.8299, "step": 6495 }, { "epoch": 0.9926271154066547, "grad_norm": 0.38360846042633057, "learning_rate": 2.8699740682103237e-08, "loss": 0.7618, "step": 6496 }, { "epoch": 0.9927799213049624, "grad_norm": 0.299980491399765, "learning_rate": 2.7516428911422698e-08, "loss": 0.6017, "step": 6497 }, { "epoch": 0.9929327272032701, "grad_norm": 0.2955044209957123, "learning_rate": 2.6358024445816142e-08, "loss": 0.5831, "step": 6498 }, { "epoch": 0.9930855331015778, "grad_norm": 0.4438436031341553, "learning_rate": 2.5224527573919353e-08, "loss": 0.7005, "step": 6499 }, { "epoch": 0.9932383389998855, "grad_norm": 0.34785547852516174, "learning_rate": 2.4115938578117558e-08, "loss": 0.779, "step": 6500 }, { "epoch": 0.993391144898193, "grad_norm": 0.2642604410648346, "learning_rate": 2.3032257734600937e-08, "loss": 0.715, "step": 6501 }, { "epoch": 0.9935439507965007, "grad_norm": 0.2867738604545593, "learning_rate": 2.1973485313364627e-08, "loss": 0.6634, "step": 6502 }, { "epoch": 0.9936967566948084, "grad_norm": 0.27846238017082214, "learning_rate": 2.0939621578197623e-08, "loss": 0.7115, "step": 6503 }, { "epoch": 0.9938495625931161, "grad_norm": 0.5877290368080139, "learning_rate": 1.993066678668276e-08, "loss": 0.7385, "step": 6504 }, { "epoch": 0.9940023684914238, "grad_norm": 0.25964123010635376, "learning_rate": 1.894662119017454e-08, "loss": 0.5934, "step": 6505 }, { "epoch": 0.9941551743897314, "grad_norm": 0.33881714940071106, "learning_rate": 1.7987485033854613e-08, "loss": 0.6664, "step": 6506 }, { "epoch": 0.9943079802880391, "grad_norm": 0.25839629769325256, "learning_rate": 1.7053258556676277e-08, "loss": 0.6436, "step": 6507 }, { "epoch": 0.9944607861863468, "grad_norm": 0.39556682109832764, "learning_rate": 1.6143941991397792e-08, "loss": 0.7926, "step": 6508 }, { "epoch": 0.9946135920846545, "grad_norm": 0.3205921947956085, "learning_rate": 1.525953556457127e-08, "loss": 0.8031, "step": 6509 }, { "epoch": 0.9947663979829622, "grad_norm": 0.2670949399471283, "learning_rate": 1.440003949653157e-08, "loss": 0.696, "step": 6510 }, { "epoch": 0.9949192038812699, "grad_norm": 0.2751535177230835, "learning_rate": 1.3565454001429611e-08, "loss": 0.6453, "step": 6511 }, { "epoch": 0.9950720097795774, "grad_norm": 0.3137910068035126, "learning_rate": 1.2755779287176862e-08, "loss": 0.6896, "step": 6512 }, { "epoch": 0.9952248156778851, "grad_norm": 0.3138625919818878, "learning_rate": 1.1971015555500841e-08, "loss": 0.6058, "step": 6513 }, { "epoch": 0.9953776215761928, "grad_norm": 0.25915777683258057, "learning_rate": 1.1211163001922931e-08, "loss": 0.6876, "step": 6514 }, { "epoch": 0.9955304274745005, "grad_norm": 0.29875871539115906, "learning_rate": 1.0476221815758358e-08, "loss": 0.6711, "step": 6515 }, { "epoch": 0.9956832333728082, "grad_norm": 0.24008925259113312, "learning_rate": 9.766192180105105e-09, "loss": 0.6005, "step": 6516 }, { "epoch": 0.9958360392711159, "grad_norm": 0.28519225120544434, "learning_rate": 9.081074271855005e-09, "loss": 0.6034, "step": 6517 }, { "epoch": 0.9959888451694235, "grad_norm": 0.5138064622879028, "learning_rate": 8.420868261715953e-09, "loss": 0.7087, "step": 6518 }, { "epoch": 0.9961416510677312, "grad_norm": 0.47081300616264343, "learning_rate": 7.785574314156385e-09, "loss": 0.5475, "step": 6519 }, { "epoch": 0.9962944569660389, "grad_norm": 0.28254806995391846, "learning_rate": 7.175192587471902e-09, "loss": 0.6454, "step": 6520 }, { "epoch": 0.9964472628643466, "grad_norm": 0.3041219115257263, "learning_rate": 6.589723233718648e-09, "loss": 0.7802, "step": 6521 }, { "epoch": 0.9966000687626543, "grad_norm": 0.32065922021865845, "learning_rate": 6.029166398768826e-09, "loss": 0.8051, "step": 6522 }, { "epoch": 0.996752874660962, "grad_norm": 0.3174595534801483, "learning_rate": 5.493522222277392e-09, "loss": 0.7947, "step": 6523 }, { "epoch": 0.9969056805592695, "grad_norm": 0.2868291139602661, "learning_rate": 4.9827908376931524e-09, "loss": 0.8688, "step": 6524 }, { "epoch": 0.9970584864575772, "grad_norm": 0.3602704405784607, "learning_rate": 4.49697237226987e-09, "loss": 0.6016, "step": 6525 }, { "epoch": 0.9972112923558849, "grad_norm": 0.2539635896682739, "learning_rate": 4.036066947032957e-09, "loss": 0.7062, "step": 6526 }, { "epoch": 0.9973640982541926, "grad_norm": 0.3819586932659149, "learning_rate": 3.6000746768238834e-09, "loss": 0.8573, "step": 6527 }, { "epoch": 0.9975169041525003, "grad_norm": 0.292914479970932, "learning_rate": 3.1889956702557675e-09, "loss": 0.716, "step": 6528 }, { "epoch": 0.997669710050808, "grad_norm": 0.33907845616340637, "learning_rate": 2.8028300297577857e-09, "loss": 0.6172, "step": 6529 }, { "epoch": 0.9978225159491156, "grad_norm": 0.30105409026145935, "learning_rate": 2.4415778515418654e-09, "loss": 0.858, "step": 6530 }, { "epoch": 0.9979753218474233, "grad_norm": 0.2698521018028259, "learning_rate": 2.105239225591582e-09, "loss": 0.5617, "step": 6531 }, { "epoch": 0.998128127745731, "grad_norm": 0.34229230880737305, "learning_rate": 1.7938142357176724e-09, "loss": 0.7707, "step": 6532 }, { "epoch": 0.9982809336440387, "grad_norm": 0.33994314074516296, "learning_rate": 1.5073029595025213e-09, "loss": 0.559, "step": 6533 }, { "epoch": 0.9984337395423464, "grad_norm": 0.3268454372882843, "learning_rate": 1.245705468333469e-09, "loss": 0.8513, "step": 6534 }, { "epoch": 0.9985865454406541, "grad_norm": 0.2963615953922272, "learning_rate": 1.0090218273806073e-09, "loss": 0.7742, "step": 6535 }, { "epoch": 0.9987393513389616, "grad_norm": 0.2639820873737335, "learning_rate": 7.972520956189833e-10, "loss": 0.5563, "step": 6536 }, { "epoch": 0.9988921572372693, "grad_norm": 0.4614093005657196, "learning_rate": 6.10396325806395e-10, "loss": 0.8817, "step": 6537 }, { "epoch": 0.999044963135577, "grad_norm": 0.29073458909988403, "learning_rate": 4.4845456448339154e-10, "loss": 0.6519, "step": 6538 }, { "epoch": 0.9991977690338847, "grad_norm": 0.28678426146507263, "learning_rate": 3.1142685201768217e-10, "loss": 0.6484, "step": 6539 }, { "epoch": 0.9993505749321924, "grad_norm": 0.3102395534515381, "learning_rate": 1.993132225375227e-10, "loss": 0.7115, "step": 6540 }, { "epoch": 0.9995033808305, "grad_norm": 0.30270108580589294, "learning_rate": 1.1211370396502218e-10, "loss": 0.7306, "step": 6541 }, { "epoch": 0.9996561867288077, "grad_norm": 0.29784873127937317, "learning_rate": 4.9828318049449654e-11, "loss": 0.7267, "step": 6542 }, { "epoch": 0.9998089926271154, "grad_norm": 0.2554897665977478, "learning_rate": 1.245708028951853e-11, "loss": 0.6481, "step": 6543 }, { "epoch": 0.9999617985254231, "grad_norm": 0.5652004480361938, "learning_rate": 0.0, "loss": 0.7754, "step": 6544 } ], "logging_steps": 1, "max_steps": 6544, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.300239644555477e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }