{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998107852412488, "eval_steps": 500, "global_step": 2642, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003784295175023652, "grad_norm": 467.624415262933, "learning_rate": 1.25e-05, "loss": 12.3926, "step": 1 }, { "epoch": 0.0007568590350047304, "grad_norm": 467.52658681084654, "learning_rate": 2.5e-05, "loss": 12.37, "step": 2 }, { "epoch": 0.0011352885525070956, "grad_norm": 256.35123000485095, "learning_rate": 3.75e-05, "loss": 9.7567, "step": 3 }, { "epoch": 0.0015137180700094607, "grad_norm": 190.4545652019137, "learning_rate": 5e-05, "loss": 6.9933, "step": 4 }, { "epoch": 0.001892147587511826, "grad_norm": 102.9105915042276, "learning_rate": 6.25e-05, "loss": 5.629, "step": 5 }, { "epoch": 0.002270577105014191, "grad_norm": 370.91877058865595, "learning_rate": 7.5e-05, "loss": 3.1659, "step": 6 }, { "epoch": 0.0026490066225165563, "grad_norm": 31.095700119437232, "learning_rate": 8.75e-05, "loss": 2.2748, "step": 7 }, { "epoch": 0.0030274361400189215, "grad_norm": 7.4798529548267245, "learning_rate": 0.0001, "loss": 1.864, "step": 8 }, { "epoch": 0.0034058656575212867, "grad_norm": 3.6958402293132684, "learning_rate": 0.00011250000000000001, "loss": 1.7106, "step": 9 }, { "epoch": 0.003784295175023652, "grad_norm": 3.819107708199471, "learning_rate": 0.000125, "loss": 1.5616, "step": 10 }, { "epoch": 0.004162724692526017, "grad_norm": 1.4162714444293616, "learning_rate": 0.0001375, "loss": 1.4532, "step": 11 }, { "epoch": 0.004541154210028382, "grad_norm": 2.043935280896907, "learning_rate": 0.00015, "loss": 1.4698, "step": 12 }, { "epoch": 0.004919583727530747, "grad_norm": 3.2649129428911583, "learning_rate": 0.00016250000000000002, "loss": 1.3984, "step": 13 }, { "epoch": 0.005298013245033113, "grad_norm": 2.181411990733394, "learning_rate": 0.000175, "loss": 1.3857, "step": 14 }, { "epoch": 0.005676442762535478, "grad_norm": 1.3098595262150987, "learning_rate": 0.0001875, "loss": 1.3505, "step": 15 }, { "epoch": 0.006054872280037843, "grad_norm": 0.4851743194114561, "learning_rate": 0.0002, "loss": 1.3359, "step": 16 }, { "epoch": 0.006433301797540208, "grad_norm": 0.5944463762012281, "learning_rate": 0.0002125, "loss": 1.273, "step": 17 }, { "epoch": 0.006811731315042573, "grad_norm": 0.3176152830450339, "learning_rate": 0.00022500000000000002, "loss": 1.304, "step": 18 }, { "epoch": 0.0071901608325449385, "grad_norm": 0.7621905308561917, "learning_rate": 0.0002375, "loss": 1.2883, "step": 19 }, { "epoch": 0.007568590350047304, "grad_norm": 0.31540861738475495, "learning_rate": 0.00025, "loss": 1.2755, "step": 20 }, { "epoch": 0.007947019867549669, "grad_norm": 0.2886982117280388, "learning_rate": 0.00026250000000000004, "loss": 1.27, "step": 21 }, { "epoch": 0.008325449385052034, "grad_norm": 0.32415130787032026, "learning_rate": 0.000275, "loss": 1.2649, "step": 22 }, { "epoch": 0.0087038789025544, "grad_norm": 0.2773194906880951, "learning_rate": 0.0002875, "loss": 1.2631, "step": 23 }, { "epoch": 0.009082308420056764, "grad_norm": 0.2441995244503663, "learning_rate": 0.0003, "loss": 1.1827, "step": 24 }, { "epoch": 0.00946073793755913, "grad_norm": 0.26352879916254385, "learning_rate": 0.0003125, "loss": 1.201, "step": 25 }, { "epoch": 0.009839167455061495, "grad_norm": 0.27975274608958534, "learning_rate": 0.00032500000000000004, "loss": 1.2268, "step": 26 }, { "epoch": 0.01021759697256386, "grad_norm": 0.21977846168486095, "learning_rate": 0.0003375, "loss": 1.2495, "step": 27 }, { "epoch": 0.010596026490066225, "grad_norm": 0.26270760716593305, "learning_rate": 0.00035, "loss": 1.1781, "step": 28 }, { "epoch": 0.01097445600756859, "grad_norm": 0.30155790058640836, "learning_rate": 0.0003625, "loss": 1.2019, "step": 29 }, { "epoch": 0.011352885525070956, "grad_norm": 0.23166067027031817, "learning_rate": 0.000375, "loss": 1.1752, "step": 30 }, { "epoch": 0.01173131504257332, "grad_norm": 0.24616655150879801, "learning_rate": 0.00038750000000000004, "loss": 1.1857, "step": 31 }, { "epoch": 0.012109744560075686, "grad_norm": 0.20399093367552967, "learning_rate": 0.0004, "loss": 1.1661, "step": 32 }, { "epoch": 0.012488174077578051, "grad_norm": 0.332022711700161, "learning_rate": 0.0004125, "loss": 1.1844, "step": 33 }, { "epoch": 0.012866603595080416, "grad_norm": 0.21617748515159865, "learning_rate": 0.000425, "loss": 1.1747, "step": 34 }, { "epoch": 0.013245033112582781, "grad_norm": 0.2745028450589144, "learning_rate": 0.0004375, "loss": 1.1715, "step": 35 }, { "epoch": 0.013623462630085147, "grad_norm": 0.2314957752825833, "learning_rate": 0.00045000000000000004, "loss": 1.1611, "step": 36 }, { "epoch": 0.014001892147587512, "grad_norm": 0.21549952733271246, "learning_rate": 0.0004625, "loss": 1.1457, "step": 37 }, { "epoch": 0.014380321665089877, "grad_norm": 0.2494158999405991, "learning_rate": 0.000475, "loss": 1.1376, "step": 38 }, { "epoch": 0.014758751182592242, "grad_norm": 0.2700175997508054, "learning_rate": 0.0004875, "loss": 1.13, "step": 39 }, { "epoch": 0.015137180700094607, "grad_norm": 0.20171163423783478, "learning_rate": 0.0005, "loss": 1.168, "step": 40 }, { "epoch": 0.015515610217596973, "grad_norm": 0.2609361077459963, "learning_rate": 0.0005124999999999999, "loss": 1.1563, "step": 41 }, { "epoch": 0.015894039735099338, "grad_norm": 0.29220848620113515, "learning_rate": 0.0005250000000000001, "loss": 1.0902, "step": 42 }, { "epoch": 0.016272469252601705, "grad_norm": 0.30922283563136316, "learning_rate": 0.0005375, "loss": 1.1335, "step": 43 }, { "epoch": 0.016650898770104068, "grad_norm": 0.29233660845696224, "learning_rate": 0.00055, "loss": 1.1374, "step": 44 }, { "epoch": 0.017029328287606435, "grad_norm": 0.27055154715416246, "learning_rate": 0.0005625000000000001, "loss": 1.1224, "step": 45 }, { "epoch": 0.0174077578051088, "grad_norm": 0.20494500335471114, "learning_rate": 0.000575, "loss": 1.1282, "step": 46 }, { "epoch": 0.017786187322611165, "grad_norm": 0.2547419951943546, "learning_rate": 0.0005875, "loss": 1.117, "step": 47 }, { "epoch": 0.01816461684011353, "grad_norm": 0.2764656107308095, "learning_rate": 0.0006, "loss": 1.1304, "step": 48 }, { "epoch": 0.018543046357615896, "grad_norm": 0.1890634678587297, "learning_rate": 0.0006125000000000001, "loss": 1.1188, "step": 49 }, { "epoch": 0.01892147587511826, "grad_norm": 0.2330460861504734, "learning_rate": 0.000625, "loss": 1.1211, "step": 50 }, { "epoch": 0.019299905392620626, "grad_norm": 0.8702704126670258, "learning_rate": 0.0006374999999999999, "loss": 1.1022, "step": 51 }, { "epoch": 0.01967833491012299, "grad_norm": 0.3327976021991774, "learning_rate": 0.0006500000000000001, "loss": 1.1353, "step": 52 }, { "epoch": 0.020056764427625356, "grad_norm": 0.41304576849820795, "learning_rate": 0.0006625, "loss": 1.1407, "step": 53 }, { "epoch": 0.02043519394512772, "grad_norm": 0.3336970001993246, "learning_rate": 0.000675, "loss": 1.1249, "step": 54 }, { "epoch": 0.020813623462630087, "grad_norm": 0.2876553178116314, "learning_rate": 0.0006875, "loss": 1.0947, "step": 55 }, { "epoch": 0.02119205298013245, "grad_norm": 0.3703509475772107, "learning_rate": 0.0007, "loss": 1.1293, "step": 56 }, { "epoch": 0.021570482497634817, "grad_norm": 0.2396497668887747, "learning_rate": 0.0007125, "loss": 1.1165, "step": 57 }, { "epoch": 0.02194891201513718, "grad_norm": 0.32109886267459736, "learning_rate": 0.000725, "loss": 1.1179, "step": 58 }, { "epoch": 0.022327341532639548, "grad_norm": 0.21780515875957437, "learning_rate": 0.0007375000000000001, "loss": 1.1041, "step": 59 }, { "epoch": 0.02270577105014191, "grad_norm": 0.2619819883013857, "learning_rate": 0.00075, "loss": 1.1031, "step": 60 }, { "epoch": 0.023084200567644278, "grad_norm": 0.1983751029937948, "learning_rate": 0.0007624999999999999, "loss": 1.1151, "step": 61 }, { "epoch": 0.02346263008514664, "grad_norm": 0.21493349801697448, "learning_rate": 0.0007750000000000001, "loss": 1.0954, "step": 62 }, { "epoch": 0.02384105960264901, "grad_norm": 0.2873729222825295, "learning_rate": 0.0007875, "loss": 1.1032, "step": 63 }, { "epoch": 0.024219489120151372, "grad_norm": 0.38163650651035097, "learning_rate": 0.0008, "loss": 1.0909, "step": 64 }, { "epoch": 0.02459791863765374, "grad_norm": 0.32163315177576574, "learning_rate": 0.0008125000000000001, "loss": 1.1063, "step": 65 }, { "epoch": 0.024976348155156102, "grad_norm": 0.2571742799033319, "learning_rate": 0.000825, "loss": 1.1184, "step": 66 }, { "epoch": 0.02535477767265847, "grad_norm": 0.30950657522338093, "learning_rate": 0.0008375, "loss": 1.0856, "step": 67 }, { "epoch": 0.025733207190160833, "grad_norm": 0.30866022239091606, "learning_rate": 0.00085, "loss": 1.1029, "step": 68 }, { "epoch": 0.0261116367076632, "grad_norm": 0.20897268993473606, "learning_rate": 0.0008625000000000001, "loss": 1.0821, "step": 69 }, { "epoch": 0.026490066225165563, "grad_norm": 0.2417375861647427, "learning_rate": 0.000875, "loss": 1.0688, "step": 70 }, { "epoch": 0.02686849574266793, "grad_norm": 0.21498127783720725, "learning_rate": 0.0008874999999999999, "loss": 1.0715, "step": 71 }, { "epoch": 0.027246925260170293, "grad_norm": 0.20039122694791395, "learning_rate": 0.0009000000000000001, "loss": 1.0917, "step": 72 }, { "epoch": 0.02762535477767266, "grad_norm": 0.28400539021346644, "learning_rate": 0.0009125, "loss": 1.0801, "step": 73 }, { "epoch": 0.028003784295175024, "grad_norm": 0.19072855458316754, "learning_rate": 0.000925, "loss": 1.0922, "step": 74 }, { "epoch": 0.02838221381267739, "grad_norm": 0.20530063728423134, "learning_rate": 0.0009375, "loss": 1.0633, "step": 75 }, { "epoch": 0.028760643330179754, "grad_norm": 0.5648316958501153, "learning_rate": 0.00095, "loss": 1.0694, "step": 76 }, { "epoch": 0.02913907284768212, "grad_norm": 0.19195163643942548, "learning_rate": 0.0009625, "loss": 1.0907, "step": 77 }, { "epoch": 0.029517502365184484, "grad_norm": 0.16944333359044986, "learning_rate": 0.000975, "loss": 1.0727, "step": 78 }, { "epoch": 0.02989593188268685, "grad_norm": 0.24954415276645117, "learning_rate": 0.0009875, "loss": 1.0788, "step": 79 }, { "epoch": 0.030274361400189215, "grad_norm": 0.22603287123050303, "learning_rate": 0.001, "loss": 1.0907, "step": 80 }, { "epoch": 0.03065279091769158, "grad_norm": 0.18130993222177455, "learning_rate": 0.0009999996240921034, "loss": 1.0621, "step": 81 }, { "epoch": 0.031031220435193945, "grad_norm": 0.21787386233743491, "learning_rate": 0.0009999984963689791, "loss": 1.0808, "step": 82 }, { "epoch": 0.03140964995269631, "grad_norm": 0.19377373119347832, "learning_rate": 0.0009999966168323226, "loss": 1.0815, "step": 83 }, { "epoch": 0.031788079470198675, "grad_norm": 0.19293687881564972, "learning_rate": 0.0009999939854849601, "loss": 1.0454, "step": 84 }, { "epoch": 0.03216650898770104, "grad_norm": 0.21156964242111445, "learning_rate": 0.0009999906023308485, "loss": 1.0793, "step": 85 }, { "epoch": 0.03254493850520341, "grad_norm": 0.2215515808460154, "learning_rate": 0.0009999864673750742, "loss": 1.0705, "step": 86 }, { "epoch": 0.03292336802270577, "grad_norm": 0.2725230320549992, "learning_rate": 0.000999981580623855, "loss": 1.0688, "step": 87 }, { "epoch": 0.033301797540208136, "grad_norm": 0.2731792047428543, "learning_rate": 0.0009999759420845386, "loss": 1.064, "step": 88 }, { "epoch": 0.0336802270577105, "grad_norm": 0.21125266584147634, "learning_rate": 0.0009999695517656036, "loss": 1.0796, "step": 89 }, { "epoch": 0.03405865657521287, "grad_norm": 0.20851371029617538, "learning_rate": 0.0009999624096766583, "loss": 1.0657, "step": 90 }, { "epoch": 0.03443708609271523, "grad_norm": 0.2450897839794509, "learning_rate": 0.0009999545158284422, "loss": 1.0371, "step": 91 }, { "epoch": 0.0348155156102176, "grad_norm": 0.27046763813797375, "learning_rate": 0.0009999458702328243, "loss": 1.0611, "step": 92 }, { "epoch": 0.035193945127719964, "grad_norm": 0.21827828261229612, "learning_rate": 0.0009999364729028047, "loss": 1.0402, "step": 93 }, { "epoch": 0.03557237464522233, "grad_norm": 0.17415960188808768, "learning_rate": 0.0009999263238525135, "loss": 1.0664, "step": 94 }, { "epoch": 0.03595080416272469, "grad_norm": 0.24306090591233592, "learning_rate": 0.000999915423097211, "loss": 1.0576, "step": 95 }, { "epoch": 0.03632923368022706, "grad_norm": 0.2077158997387356, "learning_rate": 0.0009999037706532878, "loss": 1.0782, "step": 96 }, { "epoch": 0.036707663197729425, "grad_norm": 0.2242769185817708, "learning_rate": 0.0009998913665382652, "loss": 1.0465, "step": 97 }, { "epoch": 0.03708609271523179, "grad_norm": 0.2302733498169503, "learning_rate": 0.0009998782107707943, "loss": 1.0574, "step": 98 }, { "epoch": 0.03746452223273415, "grad_norm": 0.2815834691096356, "learning_rate": 0.0009998643033706566, "loss": 1.0367, "step": 99 }, { "epoch": 0.03784295175023652, "grad_norm": 0.2220063764181572, "learning_rate": 0.0009998496443587633, "loss": 1.0411, "step": 100 }, { "epoch": 0.038221381267738885, "grad_norm": 0.21874963385687568, "learning_rate": 0.0009998342337571564, "loss": 1.0675, "step": 101 }, { "epoch": 0.03859981078524125, "grad_norm": 0.2408653594593496, "learning_rate": 0.0009998180715890081, "loss": 1.0347, "step": 102 }, { "epoch": 0.03897824030274361, "grad_norm": 0.17168211951502393, "learning_rate": 0.0009998011578786201, "loss": 1.0345, "step": 103 }, { "epoch": 0.03935666982024598, "grad_norm": 0.2528185508201969, "learning_rate": 0.0009997834926514242, "loss": 1.0332, "step": 104 }, { "epoch": 0.039735099337748346, "grad_norm": 0.21782811936647384, "learning_rate": 0.0009997650759339827, "loss": 1.031, "step": 105 }, { "epoch": 0.04011352885525071, "grad_norm": 0.2478654979278263, "learning_rate": 0.0009997459077539872, "loss": 1.0467, "step": 106 }, { "epoch": 0.04049195837275307, "grad_norm": 0.20245403649714583, "learning_rate": 0.00099972598814026, "loss": 1.015, "step": 107 }, { "epoch": 0.04087038789025544, "grad_norm": 0.21924215538493094, "learning_rate": 0.0009997053171227526, "loss": 1.055, "step": 108 }, { "epoch": 0.04124881740775781, "grad_norm": 0.1851888816986655, "learning_rate": 0.0009996838947325466, "loss": 1.0385, "step": 109 }, { "epoch": 0.041627246925260174, "grad_norm": 0.18409294885290425, "learning_rate": 0.0009996617210018537, "loss": 1.0508, "step": 110 }, { "epoch": 0.042005676442762534, "grad_norm": 0.20468395135438736, "learning_rate": 0.0009996387959640145, "loss": 1.0407, "step": 111 }, { "epoch": 0.0423841059602649, "grad_norm": 0.19430440138844504, "learning_rate": 0.0009996151196535, "loss": 1.029, "step": 112 }, { "epoch": 0.04276253547776727, "grad_norm": 0.1678297831960669, "learning_rate": 0.0009995906921059108, "loss": 1.0386, "step": 113 }, { "epoch": 0.043140964995269634, "grad_norm": 0.18892342953937205, "learning_rate": 0.0009995655133579768, "loss": 1.0386, "step": 114 }, { "epoch": 0.043519394512771994, "grad_norm": 0.1865873876262221, "learning_rate": 0.0009995395834475577, "loss": 1.0541, "step": 115 }, { "epoch": 0.04389782403027436, "grad_norm": 0.1813598842329414, "learning_rate": 0.0009995129024136424, "loss": 1.0293, "step": 116 }, { "epoch": 0.04427625354777673, "grad_norm": 0.1702714371831727, "learning_rate": 0.0009994854702963494, "loss": 1.0343, "step": 117 }, { "epoch": 0.044654683065279095, "grad_norm": 0.2165499303178184, "learning_rate": 0.0009994572871369265, "loss": 1.0214, "step": 118 }, { "epoch": 0.045033112582781455, "grad_norm": 0.2112299869300825, "learning_rate": 0.0009994283529777507, "loss": 1.0156, "step": 119 }, { "epoch": 0.04541154210028382, "grad_norm": 0.3104138119244144, "learning_rate": 0.0009993986678623285, "loss": 1.0493, "step": 120 }, { "epoch": 0.04578997161778619, "grad_norm": 88.27538973912124, "learning_rate": 0.000999368231835295, "loss": 9.5933, "step": 121 }, { "epoch": 0.046168401135288556, "grad_norm": 0.7235833072588395, "learning_rate": 0.0009993370449424153, "loss": 1.0692, "step": 122 }, { "epoch": 0.046546830652790916, "grad_norm": 0.34765965749377986, "learning_rate": 0.0009993051072305825, "loss": 1.0467, "step": 123 }, { "epoch": 0.04692526017029328, "grad_norm": 0.3251829975534252, "learning_rate": 0.0009992724187478197, "loss": 1.0526, "step": 124 }, { "epoch": 0.04730368968779565, "grad_norm": 0.2654610997714847, "learning_rate": 0.0009992389795432778, "loss": 1.028, "step": 125 }, { "epoch": 0.04768211920529802, "grad_norm": 0.21703554882463696, "learning_rate": 0.0009992047896672375, "loss": 1.0364, "step": 126 }, { "epoch": 0.04806054872280038, "grad_norm": 0.2570153396691132, "learning_rate": 0.0009991698491711072, "loss": 1.0362, "step": 127 }, { "epoch": 0.048438978240302744, "grad_norm": 0.1901948409937453, "learning_rate": 0.000999134158107425, "loss": 1.0409, "step": 128 }, { "epoch": 0.04881740775780511, "grad_norm": 0.17653857381291924, "learning_rate": 0.000999097716529857, "loss": 1.0083, "step": 129 }, { "epoch": 0.04919583727530748, "grad_norm": 0.18347197410169014, "learning_rate": 0.000999060524493198, "loss": 1.0026, "step": 130 }, { "epoch": 0.04957426679280984, "grad_norm": 0.17326234508853144, "learning_rate": 0.000999022582053371, "loss": 1.0271, "step": 131 }, { "epoch": 0.049952696310312204, "grad_norm": 0.17551756260557216, "learning_rate": 0.0009989838892674272, "loss": 0.9855, "step": 132 }, { "epoch": 0.05033112582781457, "grad_norm": 0.15992066455116483, "learning_rate": 0.0009989444461935468, "loss": 1.0109, "step": 133 }, { "epoch": 0.05070955534531694, "grad_norm": 0.1602953678777531, "learning_rate": 0.0009989042528910374, "loss": 0.9946, "step": 134 }, { "epoch": 0.0510879848628193, "grad_norm": 0.1481831592650484, "learning_rate": 0.0009988633094203348, "loss": 1.0285, "step": 135 }, { "epoch": 0.051466414380321665, "grad_norm": 0.16460907806166927, "learning_rate": 0.0009988216158430033, "loss": 1.0134, "step": 136 }, { "epoch": 0.05184484389782403, "grad_norm": 0.18366916609064127, "learning_rate": 0.000998779172221734, "loss": 1.0298, "step": 137 }, { "epoch": 0.0522232734153264, "grad_norm": 0.16807315597856015, "learning_rate": 0.0009987359786203472, "loss": 0.9922, "step": 138 }, { "epoch": 0.05260170293282876, "grad_norm": 0.1645693386491836, "learning_rate": 0.00099869203510379, "loss": 1.0129, "step": 139 }, { "epoch": 0.052980132450331126, "grad_norm": 0.207383953011589, "learning_rate": 0.0009986473417381366, "loss": 1.025, "step": 140 }, { "epoch": 0.05335856196783349, "grad_norm": 0.2009286799356453, "learning_rate": 0.00099860189859059, "loss": 1.0272, "step": 141 }, { "epoch": 0.05373699148533586, "grad_norm": 0.21701426389449885, "learning_rate": 0.00099855570572948, "loss": 1.0283, "step": 142 }, { "epoch": 0.05411542100283822, "grad_norm": 0.16236233794829585, "learning_rate": 0.0009985087632242633, "loss": 1.0002, "step": 143 }, { "epoch": 0.054493850520340587, "grad_norm": 0.18975485537527778, "learning_rate": 0.0009984610711455243, "loss": 0.9759, "step": 144 }, { "epoch": 0.05487228003784295, "grad_norm": 0.19013982377458333, "learning_rate": 0.0009984126295649745, "loss": 1.0069, "step": 145 }, { "epoch": 0.05525070955534532, "grad_norm": 0.216008014096495, "learning_rate": 0.0009983634385554517, "loss": 0.9916, "step": 146 }, { "epoch": 0.05562913907284768, "grad_norm": 0.23302117986458626, "learning_rate": 0.0009983134981909215, "loss": 1.0107, "step": 147 }, { "epoch": 0.05600756859035005, "grad_norm": 0.19536727137298193, "learning_rate": 0.0009982628085464758, "loss": 1.0059, "step": 148 }, { "epoch": 0.056385998107852414, "grad_norm": 0.24662328696006422, "learning_rate": 0.0009982113696983329, "loss": 1.0381, "step": 149 }, { "epoch": 0.05676442762535478, "grad_norm": 0.26775960963054063, "learning_rate": 0.0009981591817238378, "loss": 1.004, "step": 150 }, { "epoch": 0.05714285714285714, "grad_norm": 0.20908086100237003, "learning_rate": 0.0009981062447014623, "loss": 0.9676, "step": 151 }, { "epoch": 0.05752128666035951, "grad_norm": 0.254081108331452, "learning_rate": 0.000998052558710804, "loss": 0.9822, "step": 152 }, { "epoch": 0.057899716177861875, "grad_norm": 0.24714743876718917, "learning_rate": 0.0009979981238325871, "loss": 0.9881, "step": 153 }, { "epoch": 0.05827814569536424, "grad_norm": 0.26629219690139866, "learning_rate": 0.000997942940148661, "loss": 1.0048, "step": 154 }, { "epoch": 0.0586565752128666, "grad_norm": 0.2170696432759937, "learning_rate": 0.0009978870077420024, "loss": 0.9934, "step": 155 }, { "epoch": 0.05903500473036897, "grad_norm": 0.22991533430195515, "learning_rate": 0.0009978303266967125, "loss": 0.9948, "step": 156 }, { "epoch": 0.059413434247871336, "grad_norm": 0.2683852059760594, "learning_rate": 0.0009977728970980192, "loss": 0.9831, "step": 157 }, { "epoch": 0.0597918637653737, "grad_norm": 0.22936862297444383, "learning_rate": 0.000997714719032275, "loss": 0.9627, "step": 158 }, { "epoch": 0.06017029328287606, "grad_norm": 0.2599483936931921, "learning_rate": 0.000997655792586958, "loss": 0.9672, "step": 159 }, { "epoch": 0.06054872280037843, "grad_norm": 0.24831733826648372, "learning_rate": 0.0009975961178506727, "loss": 0.9871, "step": 160 }, { "epoch": 0.060927152317880796, "grad_norm": 0.24097928804218802, "learning_rate": 0.0009975356949131473, "loss": 0.9834, "step": 161 }, { "epoch": 0.06130558183538316, "grad_norm": 0.23203934517299232, "learning_rate": 0.0009974745238652358, "loss": 0.9872, "step": 162 }, { "epoch": 0.06168401135288552, "grad_norm": 0.26757676061574764, "learning_rate": 0.0009974126047989171, "loss": 0.9947, "step": 163 }, { "epoch": 0.06206244087038789, "grad_norm": 0.279492129919658, "learning_rate": 0.0009973499378072945, "loss": 0.9717, "step": 164 }, { "epoch": 0.06244087038789026, "grad_norm": 0.2673128860021323, "learning_rate": 0.0009972865229845962, "loss": 0.9898, "step": 165 }, { "epoch": 0.06281929990539262, "grad_norm": 0.25482723755611497, "learning_rate": 0.0009972223604261742, "loss": 0.9794, "step": 166 }, { "epoch": 0.06319772942289499, "grad_norm": 0.3169242715619418, "learning_rate": 0.0009971574502285062, "loss": 0.9603, "step": 167 }, { "epoch": 0.06357615894039735, "grad_norm": 0.2664835043171325, "learning_rate": 0.0009970917924891926, "loss": 0.9326, "step": 168 }, { "epoch": 0.06395458845789971, "grad_norm": 0.30873614056171816, "learning_rate": 0.0009970253873069588, "loss": 0.9233, "step": 169 }, { "epoch": 0.06433301797540208, "grad_norm": 0.3952414286659471, "learning_rate": 0.0009969582347816534, "loss": 0.9695, "step": 170 }, { "epoch": 0.06471144749290444, "grad_norm": 2.514358201545661, "learning_rate": 0.0009968903350142493, "loss": 0.9367, "step": 171 }, { "epoch": 0.06508987701040682, "grad_norm": 0.5252052462238805, "learning_rate": 0.0009968216881068425, "loss": 0.9655, "step": 172 }, { "epoch": 0.06546830652790918, "grad_norm": 0.5180629259113039, "learning_rate": 0.000996752294162653, "loss": 0.956, "step": 173 }, { "epoch": 0.06584673604541154, "grad_norm": 0.40990971013015093, "learning_rate": 0.0009966821532860235, "loss": 0.9573, "step": 174 }, { "epoch": 0.06622516556291391, "grad_norm": 0.321606719606868, "learning_rate": 0.00099661126558242, "loss": 0.9281, "step": 175 }, { "epoch": 0.06660359508041627, "grad_norm": 0.4231706080797162, "learning_rate": 0.0009965396311584318, "loss": 0.9484, "step": 176 }, { "epoch": 0.06698202459791863, "grad_norm": 0.3161928846779583, "learning_rate": 0.00099646725012177, "loss": 0.963, "step": 177 }, { "epoch": 0.067360454115421, "grad_norm": 0.32763555898901714, "learning_rate": 0.00099639412258127, "loss": 0.9362, "step": 178 }, { "epoch": 0.06773888363292337, "grad_norm": 0.28055197137226023, "learning_rate": 0.000996320248646888, "loss": 0.939, "step": 179 }, { "epoch": 0.06811731315042574, "grad_norm": 3.1675463675919135, "learning_rate": 0.0009962456284297032, "loss": 0.9743, "step": 180 }, { "epoch": 0.0684957426679281, "grad_norm": 0.4223886118442837, "learning_rate": 0.000996170262041917, "loss": 0.9331, "step": 181 }, { "epoch": 0.06887417218543046, "grad_norm": 0.4815455829438775, "learning_rate": 0.000996094149596853, "loss": 0.9733, "step": 182 }, { "epoch": 0.06925260170293283, "grad_norm": 0.3351052865715314, "learning_rate": 0.0009960172912089557, "loss": 0.9616, "step": 183 }, { "epoch": 0.0696310312204352, "grad_norm": 0.2761575827346799, "learning_rate": 0.0009959396869937925, "loss": 0.96, "step": 184 }, { "epoch": 0.07000946073793755, "grad_norm": 0.31347863027300643, "learning_rate": 0.0009958613370680508, "loss": 0.959, "step": 185 }, { "epoch": 0.07038789025543993, "grad_norm": 0.2846136810203173, "learning_rate": 0.0009957822415495405, "loss": 0.9465, "step": 186 }, { "epoch": 0.07076631977294229, "grad_norm": 0.6869957765466059, "learning_rate": 0.000995702400557192, "loss": 0.941, "step": 187 }, { "epoch": 0.07114474929044466, "grad_norm": 0.3040430148586518, "learning_rate": 0.0009956218142110568, "loss": 0.9234, "step": 188 }, { "epoch": 0.07152317880794702, "grad_norm": 0.33182265517228937, "learning_rate": 0.0009955404826323066, "loss": 0.9319, "step": 189 }, { "epoch": 0.07190160832544938, "grad_norm": 0.2699101176371807, "learning_rate": 0.000995458405943235, "loss": 0.9099, "step": 190 }, { "epoch": 0.07228003784295176, "grad_norm": 0.3441492817858343, "learning_rate": 0.0009953755842672545, "loss": 0.9385, "step": 191 }, { "epoch": 0.07265846736045412, "grad_norm": 0.3104409331025707, "learning_rate": 0.0009952920177288985, "loss": 0.9879, "step": 192 }, { "epoch": 0.07303689687795648, "grad_norm": 0.51625404134651, "learning_rate": 0.0009952077064538203, "loss": 0.9666, "step": 193 }, { "epoch": 0.07341532639545885, "grad_norm": 0.29524594621677475, "learning_rate": 0.000995122650568793, "loss": 0.9548, "step": 194 }, { "epoch": 0.07379375591296121, "grad_norm": 0.3235911534166989, "learning_rate": 0.0009950368502017095, "loss": 0.8962, "step": 195 }, { "epoch": 0.07417218543046358, "grad_norm": 0.37297976224676627, "learning_rate": 0.0009949503054815815, "loss": 0.934, "step": 196 }, { "epoch": 0.07455061494796594, "grad_norm": 0.3256552482291601, "learning_rate": 0.000994863016538541, "loss": 0.8951, "step": 197 }, { "epoch": 0.0749290444654683, "grad_norm": 0.3017485410383236, "learning_rate": 0.0009947749835038378, "loss": 0.9084, "step": 198 }, { "epoch": 0.07530747398297068, "grad_norm": 0.3506325608388548, "learning_rate": 0.0009946862065098414, "loss": 0.9074, "step": 199 }, { "epoch": 0.07568590350047304, "grad_norm": 0.5711287314246536, "learning_rate": 0.0009945966856900398, "loss": 0.9157, "step": 200 }, { "epoch": 0.0760643330179754, "grad_norm": 0.3309619177184321, "learning_rate": 0.0009945064211790394, "loss": 0.9389, "step": 201 }, { "epoch": 0.07644276253547777, "grad_norm": 0.3893509305063281, "learning_rate": 0.0009944154131125641, "loss": 0.9308, "step": 202 }, { "epoch": 0.07682119205298013, "grad_norm": 0.8248894299480475, "learning_rate": 0.0009943236616274575, "loss": 0.8966, "step": 203 }, { "epoch": 0.0771996215704825, "grad_norm": 0.3039574680464213, "learning_rate": 0.0009942311668616794, "loss": 0.9015, "step": 204 }, { "epoch": 0.07757805108798486, "grad_norm": 0.3610999546813725, "learning_rate": 0.000994137928954308, "loss": 0.9087, "step": 205 }, { "epoch": 0.07795648060548722, "grad_norm": 0.39280045320160356, "learning_rate": 0.0009940439480455386, "loss": 0.9185, "step": 206 }, { "epoch": 0.0783349101229896, "grad_norm": 0.295675968349268, "learning_rate": 0.000993949224276684, "loss": 0.9257, "step": 207 }, { "epoch": 0.07871333964049196, "grad_norm": 0.37119894042644874, "learning_rate": 0.000993853757790174, "loss": 0.9374, "step": 208 }, { "epoch": 0.07909176915799432, "grad_norm": 1.287142716990376, "learning_rate": 0.000993757548729555, "loss": 0.9259, "step": 209 }, { "epoch": 0.07947019867549669, "grad_norm": 0.4944772742105093, "learning_rate": 0.0009936605972394896, "loss": 0.9246, "step": 210 }, { "epoch": 0.07984862819299905, "grad_norm": 0.3947964522877753, "learning_rate": 0.0009935629034657576, "loss": 0.8993, "step": 211 }, { "epoch": 0.08022705771050143, "grad_norm": 0.3934399837570024, "learning_rate": 0.0009934644675552542, "loss": 0.9255, "step": 212 }, { "epoch": 0.08060548722800379, "grad_norm": 0.33736191989083153, "learning_rate": 0.0009933652896559908, "loss": 0.8928, "step": 213 }, { "epoch": 0.08098391674550615, "grad_norm": 0.32019599098207213, "learning_rate": 0.0009932653699170943, "loss": 0.93, "step": 214 }, { "epoch": 0.08136234626300852, "grad_norm": 0.3707400393019346, "learning_rate": 0.0009931647084888073, "loss": 0.8992, "step": 215 }, { "epoch": 0.08174077578051088, "grad_norm": 1.142949179366423, "learning_rate": 0.0009930633055224875, "loss": 0.8973, "step": 216 }, { "epoch": 0.08211920529801324, "grad_norm": 0.3480584785933283, "learning_rate": 0.0009929611611706076, "loss": 0.8866, "step": 217 }, { "epoch": 0.08249763481551561, "grad_norm": 0.4077644729103915, "learning_rate": 0.0009928582755867551, "loss": 0.9053, "step": 218 }, { "epoch": 0.08287606433301797, "grad_norm": 0.3939874508306684, "learning_rate": 0.000992754648925632, "loss": 0.8917, "step": 219 }, { "epoch": 0.08325449385052035, "grad_norm": 0.29022904156267854, "learning_rate": 0.0009926502813430544, "loss": 0.8687, "step": 220 }, { "epoch": 0.08363292336802271, "grad_norm": 0.37819278046491783, "learning_rate": 0.0009925451729959531, "loss": 0.9185, "step": 221 }, { "epoch": 0.08401135288552507, "grad_norm": 0.32336501815896596, "learning_rate": 0.000992439324042372, "loss": 0.8931, "step": 222 }, { "epoch": 0.08438978240302744, "grad_norm": 4.424040031831433, "learning_rate": 0.0009923327346414693, "loss": 0.9085, "step": 223 }, { "epoch": 0.0847682119205298, "grad_norm": 0.5538340705792439, "learning_rate": 0.0009922254049535156, "loss": 0.9078, "step": 224 }, { "epoch": 0.08514664143803216, "grad_norm": 0.49897073067996894, "learning_rate": 0.0009921173351398957, "loss": 0.8928, "step": 225 }, { "epoch": 0.08552507095553454, "grad_norm": 0.37172123911771043, "learning_rate": 0.0009920085253631067, "loss": 0.8836, "step": 226 }, { "epoch": 0.0859035004730369, "grad_norm": 0.4833131040527168, "learning_rate": 0.0009918989757867582, "loss": 0.9086, "step": 227 }, { "epoch": 0.08628192999053927, "grad_norm": 0.4327439925307359, "learning_rate": 0.0009917886865755726, "loss": 0.8774, "step": 228 }, { "epoch": 0.08666035950804163, "grad_norm": 0.36618383688786527, "learning_rate": 0.0009916776578953843, "loss": 0.9094, "step": 229 }, { "epoch": 0.08703878902554399, "grad_norm": 0.3775056337731959, "learning_rate": 0.0009915658899131393, "loss": 0.8633, "step": 230 }, { "epoch": 0.08741721854304636, "grad_norm": 0.38231334451579335, "learning_rate": 0.0009914533827968957, "loss": 0.8864, "step": 231 }, { "epoch": 0.08779564806054872, "grad_norm": 2.366392297914569, "learning_rate": 0.0009913401367158225, "loss": 0.8853, "step": 232 }, { "epoch": 0.08817407757805108, "grad_norm": 0.506972718938183, "learning_rate": 0.0009912261518402004, "loss": 0.8834, "step": 233 }, { "epoch": 0.08855250709555346, "grad_norm": 0.5128191407847085, "learning_rate": 0.0009911114283414205, "loss": 0.8722, "step": 234 }, { "epoch": 0.08893093661305582, "grad_norm": 0.2954121613237416, "learning_rate": 0.0009909959663919844, "loss": 0.8904, "step": 235 }, { "epoch": 0.08930936613055819, "grad_norm": 0.4787517354351093, "learning_rate": 0.0009908797661655048, "loss": 0.9007, "step": 236 }, { "epoch": 0.08968779564806055, "grad_norm": 0.366434481738974, "learning_rate": 0.0009907628278367038, "loss": 0.922, "step": 237 }, { "epoch": 0.09006622516556291, "grad_norm": 1.1688546485765035, "learning_rate": 0.0009906451515814137, "loss": 0.8769, "step": 238 }, { "epoch": 0.09044465468306528, "grad_norm": 0.3555625959326832, "learning_rate": 0.000990526737576576, "loss": 0.8543, "step": 239 }, { "epoch": 0.09082308420056764, "grad_norm": 0.39847261610676843, "learning_rate": 0.000990407586000242, "loss": 0.8708, "step": 240 }, { "epoch": 0.09120151371807, "grad_norm": 0.4631539412707904, "learning_rate": 0.0009902876970315714, "loss": 0.8643, "step": 241 }, { "epoch": 0.09157994323557238, "grad_norm": 0.5003871313087779, "learning_rate": 0.0009901670708508334, "loss": 0.9012, "step": 242 }, { "epoch": 0.09195837275307474, "grad_norm": 0.43161607905541816, "learning_rate": 0.0009900457076394054, "loss": 0.8599, "step": 243 }, { "epoch": 0.09233680227057711, "grad_norm": 0.33809571849793985, "learning_rate": 0.0009899236075797727, "loss": 0.8471, "step": 244 }, { "epoch": 0.09271523178807947, "grad_norm": 0.4183793591197299, "learning_rate": 0.0009898007708555288, "loss": 0.8687, "step": 245 }, { "epoch": 0.09309366130558183, "grad_norm": 0.30744042540521543, "learning_rate": 0.000989677197651375, "loss": 0.8859, "step": 246 }, { "epoch": 0.0934720908230842, "grad_norm": 0.298170725436774, "learning_rate": 0.0009895528881531197, "loss": 0.8741, "step": 247 }, { "epoch": 0.09385052034058657, "grad_norm": 0.33495347484583343, "learning_rate": 0.000989427842547679, "loss": 0.8831, "step": 248 }, { "epoch": 0.09422894985808893, "grad_norm": 1.155254347732971, "learning_rate": 0.000989302061023075, "loss": 0.8519, "step": 249 }, { "epoch": 0.0946073793755913, "grad_norm": 0.3451714818934184, "learning_rate": 0.0009891755437684369, "loss": 0.8453, "step": 250 }, { "epoch": 0.09498580889309366, "grad_norm": 0.2745623612076799, "learning_rate": 0.0009890482909739999, "loss": 0.8453, "step": 251 }, { "epoch": 0.09536423841059603, "grad_norm": 0.3565770363080431, "learning_rate": 0.0009889203028311058, "loss": 0.8732, "step": 252 }, { "epoch": 0.0957426679280984, "grad_norm": 0.32397406893514763, "learning_rate": 0.000988791579532201, "loss": 0.8508, "step": 253 }, { "epoch": 0.09612109744560075, "grad_norm": 0.39221109722015585, "learning_rate": 0.0009886621212708384, "loss": 0.8584, "step": 254 }, { "epoch": 0.09649952696310313, "grad_norm": 0.33411237706531033, "learning_rate": 0.0009885319282416753, "loss": 0.8731, "step": 255 }, { "epoch": 0.09687795648060549, "grad_norm": 0.30849375492959136, "learning_rate": 0.0009884010006404743, "loss": 0.8683, "step": 256 }, { "epoch": 0.09725638599810785, "grad_norm": 0.37642292427574514, "learning_rate": 0.0009882693386641017, "loss": 0.8672, "step": 257 }, { "epoch": 0.09763481551561022, "grad_norm": 0.3894524523577147, "learning_rate": 0.0009881369425105294, "loss": 0.8949, "step": 258 }, { "epoch": 0.09801324503311258, "grad_norm": 0.38487884869436806, "learning_rate": 0.0009880038123788316, "loss": 0.8375, "step": 259 }, { "epoch": 0.09839167455061495, "grad_norm": 0.37362254996595606, "learning_rate": 0.0009878699484691875, "loss": 0.8335, "step": 260 }, { "epoch": 0.09877010406811731, "grad_norm": 1.6131588794767149, "learning_rate": 0.000987735350982879, "loss": 0.8407, "step": 261 }, { "epoch": 0.09914853358561967, "grad_norm": 0.29163323222124543, "learning_rate": 0.0009876000201222911, "loss": 0.8476, "step": 262 }, { "epoch": 0.09952696310312205, "grad_norm": 0.2993082540126885, "learning_rate": 0.0009874639560909118, "loss": 0.8302, "step": 263 }, { "epoch": 0.09990539262062441, "grad_norm": 0.33113835270629915, "learning_rate": 0.0009873271590933308, "loss": 0.8274, "step": 264 }, { "epoch": 0.10028382213812677, "grad_norm": 0.3819610702864904, "learning_rate": 0.000987189629335241, "loss": 0.8535, "step": 265 }, { "epoch": 0.10066225165562914, "grad_norm": 0.3052011464068054, "learning_rate": 0.0009870513670234358, "loss": 0.8423, "step": 266 }, { "epoch": 0.1010406811731315, "grad_norm": 0.33456631205450255, "learning_rate": 0.000986912372365811, "loss": 0.8191, "step": 267 }, { "epoch": 0.10141911069063388, "grad_norm": 0.3259879753075977, "learning_rate": 0.0009867726455713636, "loss": 0.8489, "step": 268 }, { "epoch": 0.10179754020813624, "grad_norm": 0.37178469130337494, "learning_rate": 0.0009866321868501912, "loss": 0.8774, "step": 269 }, { "epoch": 0.1021759697256386, "grad_norm": 5.854085177287691, "learning_rate": 0.0009864909964134919, "loss": 0.8807, "step": 270 }, { "epoch": 0.10255439924314097, "grad_norm": 0.49246071997718127, "learning_rate": 0.000986349074473564, "loss": 0.8322, "step": 271 }, { "epoch": 0.10293282876064333, "grad_norm": 0.38206344422033856, "learning_rate": 0.0009862064212438058, "loss": 0.8311, "step": 272 }, { "epoch": 0.10331125827814569, "grad_norm": 0.4455565235712603, "learning_rate": 0.0009860630369387152, "loss": 0.8439, "step": 273 }, { "epoch": 0.10368968779564806, "grad_norm": 0.5725391242139228, "learning_rate": 0.0009859189217738895, "loss": 0.8531, "step": 274 }, { "epoch": 0.10406811731315042, "grad_norm": 0.37742369932264014, "learning_rate": 0.0009857740759660247, "loss": 0.854, "step": 275 }, { "epoch": 0.1044465468306528, "grad_norm": 0.5581176169100414, "learning_rate": 0.0009856284997329158, "loss": 0.8645, "step": 276 }, { "epoch": 0.10482497634815516, "grad_norm": 0.2999254412189382, "learning_rate": 0.0009854821932934554, "loss": 0.8517, "step": 277 }, { "epoch": 0.10520340586565752, "grad_norm": 0.40210318549302493, "learning_rate": 0.0009853351568676347, "loss": 0.8489, "step": 278 }, { "epoch": 0.10558183538315989, "grad_norm": 0.3764119191453421, "learning_rate": 0.0009851873906765423, "loss": 0.8305, "step": 279 }, { "epoch": 0.10596026490066225, "grad_norm": 2.973630086462215, "learning_rate": 0.0009850388949423643, "loss": 0.8532, "step": 280 }, { "epoch": 0.10633869441816461, "grad_norm": 0.43203527438339884, "learning_rate": 0.0009848896698883833, "loss": 0.8328, "step": 281 }, { "epoch": 0.10671712393566699, "grad_norm": 0.4953591321780892, "learning_rate": 0.0009847397157389788, "loss": 0.8387, "step": 282 }, { "epoch": 0.10709555345316935, "grad_norm": 0.37699245583703844, "learning_rate": 0.0009845890327196268, "loss": 0.8552, "step": 283 }, { "epoch": 0.10747398297067172, "grad_norm": 0.3783957985567119, "learning_rate": 0.000984437621056899, "loss": 0.8268, "step": 284 }, { "epoch": 0.10785241248817408, "grad_norm": 0.38365627253209894, "learning_rate": 0.0009842854809784625, "loss": 0.8577, "step": 285 }, { "epoch": 0.10823084200567644, "grad_norm": 0.34206269962647196, "learning_rate": 0.0009841326127130803, "loss": 0.8248, "step": 286 }, { "epoch": 0.10860927152317881, "grad_norm": 0.4349781993905943, "learning_rate": 0.0009839790164906097, "loss": 0.8331, "step": 287 }, { "epoch": 0.10898770104068117, "grad_norm": 0.37843631822951224, "learning_rate": 0.0009838246925420028, "loss": 0.8112, "step": 288 }, { "epoch": 0.10936613055818353, "grad_norm": 5.910783591952799, "learning_rate": 0.0009836696410993064, "loss": 0.8497, "step": 289 }, { "epoch": 0.1097445600756859, "grad_norm": 0.4408619149064092, "learning_rate": 0.0009835138623956602, "loss": 0.8357, "step": 290 }, { "epoch": 0.11012298959318827, "grad_norm": 0.3452040091922711, "learning_rate": 0.0009833573566652982, "loss": 0.8603, "step": 291 }, { "epoch": 0.11050141911069064, "grad_norm": 0.442788214115146, "learning_rate": 0.0009832001241435475, "loss": 0.8417, "step": 292 }, { "epoch": 0.110879848628193, "grad_norm": 0.3714087303955037, "learning_rate": 0.0009830421650668276, "loss": 0.8321, "step": 293 }, { "epoch": 0.11125827814569536, "grad_norm": 0.32106145663887387, "learning_rate": 0.000982883479672651, "loss": 0.8582, "step": 294 }, { "epoch": 0.11163670766319773, "grad_norm": 0.36827545099948766, "learning_rate": 0.0009827240681996219, "loss": 0.8322, "step": 295 }, { "epoch": 0.1120151371807001, "grad_norm": 0.3535929853403249, "learning_rate": 0.0009825639308874365, "loss": 0.8287, "step": 296 }, { "epoch": 0.11239356669820245, "grad_norm": 0.34962273684066825, "learning_rate": 0.0009824030679768823, "loss": 0.8076, "step": 297 }, { "epoch": 0.11277199621570483, "grad_norm": 0.3032511085024557, "learning_rate": 0.000982241479709838, "loss": 0.8194, "step": 298 }, { "epoch": 0.11315042573320719, "grad_norm": 0.36536152304330444, "learning_rate": 0.0009820791663292725, "loss": 0.8276, "step": 299 }, { "epoch": 0.11352885525070956, "grad_norm": 0.7592078272160782, "learning_rate": 0.0009819161280792458, "loss": 0.8342, "step": 300 }, { "epoch": 0.11390728476821192, "grad_norm": 0.41343252373639455, "learning_rate": 0.000981752365204907, "loss": 0.8483, "step": 301 }, { "epoch": 0.11428571428571428, "grad_norm": 0.3677970007561517, "learning_rate": 0.000981587877952495, "loss": 0.8133, "step": 302 }, { "epoch": 0.11466414380321666, "grad_norm": 0.37288170798058046, "learning_rate": 0.0009814226665693384, "loss": 0.8551, "step": 303 }, { "epoch": 0.11504257332071902, "grad_norm": 0.3703009116697511, "learning_rate": 0.000981256731303854, "loss": 0.8432, "step": 304 }, { "epoch": 0.11542100283822138, "grad_norm": 0.34801062192820137, "learning_rate": 0.0009810900724055477, "loss": 0.8111, "step": 305 }, { "epoch": 0.11579943235572375, "grad_norm": 0.31933309104654073, "learning_rate": 0.0009809226901250124, "loss": 0.8222, "step": 306 }, { "epoch": 0.11617786187322611, "grad_norm": 0.3552777751974471, "learning_rate": 0.00098075458471393, "loss": 0.8003, "step": 307 }, { "epoch": 0.11655629139072848, "grad_norm": 0.3004115917096367, "learning_rate": 0.0009805857564250688, "loss": 0.7953, "step": 308 }, { "epoch": 0.11693472090823084, "grad_norm": 0.27954162864118104, "learning_rate": 0.0009804162055122845, "loss": 0.8323, "step": 309 }, { "epoch": 0.1173131504257332, "grad_norm": 0.606662434949194, "learning_rate": 0.0009802459322305192, "loss": 0.8002, "step": 310 }, { "epoch": 0.11769157994323558, "grad_norm": 0.5399605830252874, "learning_rate": 0.0009800749368358008, "loss": 0.8079, "step": 311 }, { "epoch": 0.11807000946073794, "grad_norm": 0.3236275303558643, "learning_rate": 0.000979903219585244, "loss": 0.821, "step": 312 }, { "epoch": 0.1184484389782403, "grad_norm": 0.3199832666388051, "learning_rate": 0.000979730780737048, "loss": 0.811, "step": 313 }, { "epoch": 0.11882686849574267, "grad_norm": 0.3021458226826219, "learning_rate": 0.000979557620550497, "loss": 0.7929, "step": 314 }, { "epoch": 0.11920529801324503, "grad_norm": 0.2819368267765857, "learning_rate": 0.0009793837392859605, "loss": 0.8208, "step": 315 }, { "epoch": 0.1195837275307474, "grad_norm": 0.3291646291371955, "learning_rate": 0.000979209137204892, "loss": 0.8038, "step": 316 }, { "epoch": 0.11996215704824977, "grad_norm": 0.279737192140278, "learning_rate": 0.0009790338145698283, "loss": 0.8088, "step": 317 }, { "epoch": 0.12034058656575213, "grad_norm": 2.5989847328409854, "learning_rate": 0.0009788577716443902, "loss": 0.9021, "step": 318 }, { "epoch": 0.1207190160832545, "grad_norm": 0.6150018646242982, "learning_rate": 0.0009786810086932815, "loss": 0.8351, "step": 319 }, { "epoch": 0.12109744560075686, "grad_norm": 0.4205806798385763, "learning_rate": 0.0009785035259822882, "loss": 0.818, "step": 320 }, { "epoch": 0.12147587511825922, "grad_norm": 0.34591710208829274, "learning_rate": 0.0009783253237782795, "loss": 0.8387, "step": 321 }, { "epoch": 0.12185430463576159, "grad_norm": 0.33783853824001603, "learning_rate": 0.0009781464023492054, "loss": 0.8408, "step": 322 }, { "epoch": 0.12223273415326395, "grad_norm": 0.3382096520964054, "learning_rate": 0.0009779667619640982, "loss": 0.8019, "step": 323 }, { "epoch": 0.12261116367076633, "grad_norm": 0.30577343821110736, "learning_rate": 0.0009777864028930705, "loss": 0.7822, "step": 324 }, { "epoch": 0.12298959318826869, "grad_norm": 0.33190850625736207, "learning_rate": 0.0009776053254073158, "loss": 0.8097, "step": 325 }, { "epoch": 0.12336802270577105, "grad_norm": 0.32561722154900175, "learning_rate": 0.0009774235297791082, "loss": 0.8207, "step": 326 }, { "epoch": 0.12374645222327342, "grad_norm": 0.2723165423426138, "learning_rate": 0.0009772410162818014, "loss": 0.8157, "step": 327 }, { "epoch": 0.12412488174077578, "grad_norm": 0.2853322335236698, "learning_rate": 0.0009770577851898287, "loss": 0.8075, "step": 328 }, { "epoch": 0.12450331125827814, "grad_norm": 0.23060302540520286, "learning_rate": 0.0009768738367787013, "loss": 0.7811, "step": 329 }, { "epoch": 0.12488174077578051, "grad_norm": 0.3034988840560256, "learning_rate": 0.0009766891713250106, "loss": 0.8465, "step": 330 }, { "epoch": 0.12526017029328287, "grad_norm": 0.2987887038613879, "learning_rate": 0.000976503789106425, "loss": 0.762, "step": 331 }, { "epoch": 0.12563859981078523, "grad_norm": 0.2834812771469646, "learning_rate": 0.0009763176904016913, "loss": 0.7797, "step": 332 }, { "epoch": 0.1260170293282876, "grad_norm": 0.2786309767116715, "learning_rate": 0.0009761308754906333, "loss": 0.7766, "step": 333 }, { "epoch": 0.12639545884578998, "grad_norm": 0.3184089856020423, "learning_rate": 0.000975943344654152, "loss": 0.7776, "step": 334 }, { "epoch": 0.12677388836329234, "grad_norm": 0.24022872266865394, "learning_rate": 0.0009757550981742241, "loss": 0.7961, "step": 335 }, { "epoch": 0.1271523178807947, "grad_norm": 0.2930568562584728, "learning_rate": 0.0009755661363339037, "loss": 0.8017, "step": 336 }, { "epoch": 0.12753074739829706, "grad_norm": 0.275392654086815, "learning_rate": 0.0009753764594173192, "loss": 0.7952, "step": 337 }, { "epoch": 0.12790917691579942, "grad_norm": 0.34089574552915386, "learning_rate": 0.0009751860677096752, "loss": 0.7644, "step": 338 }, { "epoch": 0.1282876064333018, "grad_norm": 0.2987520208250515, "learning_rate": 0.0009749949614972505, "loss": 0.8059, "step": 339 }, { "epoch": 0.12866603595080417, "grad_norm": 0.291851328680214, "learning_rate": 0.0009748031410673985, "loss": 0.7965, "step": 340 }, { "epoch": 0.12904446546830653, "grad_norm": 0.2905731934343656, "learning_rate": 0.0009746106067085464, "loss": 0.7789, "step": 341 }, { "epoch": 0.1294228949858089, "grad_norm": 0.24547876991980858, "learning_rate": 0.000974417358710195, "loss": 0.7703, "step": 342 }, { "epoch": 0.12980132450331125, "grad_norm": 0.27528605268058354, "learning_rate": 0.0009742233973629179, "loss": 0.7756, "step": 343 }, { "epoch": 0.13017975402081364, "grad_norm": 0.698706199283159, "learning_rate": 0.000974028722958362, "loss": 0.7945, "step": 344 }, { "epoch": 0.130558183538316, "grad_norm": 0.3289953337696455, "learning_rate": 0.0009738333357892453, "loss": 0.7888, "step": 345 }, { "epoch": 0.13093661305581836, "grad_norm": 0.29598174480418865, "learning_rate": 0.0009736372361493584, "loss": 0.7893, "step": 346 }, { "epoch": 0.13131504257332072, "grad_norm": 0.29617086780148416, "learning_rate": 0.0009734404243335631, "loss": 0.763, "step": 347 }, { "epoch": 0.13169347209082308, "grad_norm": 0.32081050074236966, "learning_rate": 0.0009732429006377914, "loss": 0.7655, "step": 348 }, { "epoch": 0.13207190160832544, "grad_norm": 0.29992705590835306, "learning_rate": 0.0009730446653590466, "loss": 0.762, "step": 349 }, { "epoch": 0.13245033112582782, "grad_norm": 0.3282813766784278, "learning_rate": 0.0009728457187954012, "loss": 0.7773, "step": 350 }, { "epoch": 0.13282876064333018, "grad_norm": 0.26588795773110463, "learning_rate": 0.0009726460612459977, "loss": 0.7764, "step": 351 }, { "epoch": 0.13320719016083254, "grad_norm": 0.36389923855491446, "learning_rate": 0.0009724456930110475, "loss": 0.8093, "step": 352 }, { "epoch": 0.1335856196783349, "grad_norm": 0.2532918391994017, "learning_rate": 0.0009722446143918306, "loss": 0.7957, "step": 353 }, { "epoch": 0.13396404919583726, "grad_norm": 0.3361523739466377, "learning_rate": 0.0009720428256906953, "loss": 0.7644, "step": 354 }, { "epoch": 0.13434247871333965, "grad_norm": 1.6595732259797746, "learning_rate": 0.000971840327211057, "loss": 0.7869, "step": 355 }, { "epoch": 0.134720908230842, "grad_norm": 0.320112863992072, "learning_rate": 0.0009716371192573994, "loss": 0.7667, "step": 356 }, { "epoch": 0.13509933774834437, "grad_norm": 0.4516760910934822, "learning_rate": 0.0009714332021352721, "loss": 0.7672, "step": 357 }, { "epoch": 0.13547776726584673, "grad_norm": 3.242372365070362, "learning_rate": 0.0009712285761512913, "loss": 0.8033, "step": 358 }, { "epoch": 0.1358561967833491, "grad_norm": 0.558540341685041, "learning_rate": 0.000971023241613139, "loss": 0.7802, "step": 359 }, { "epoch": 0.13623462630085148, "grad_norm": 0.5895350071362426, "learning_rate": 0.000970817198829563, "loss": 0.779, "step": 360 }, { "epoch": 0.13661305581835384, "grad_norm": 0.40943691207734056, "learning_rate": 0.0009706104481103754, "loss": 0.7975, "step": 361 }, { "epoch": 0.1369914853358562, "grad_norm": 0.37601223070462125, "learning_rate": 0.0009704029897664536, "loss": 0.7869, "step": 362 }, { "epoch": 0.13736991485335856, "grad_norm": 0.34511888565137905, "learning_rate": 0.000970194824109738, "loss": 0.7606, "step": 363 }, { "epoch": 0.13774834437086092, "grad_norm": 0.26179181687430014, "learning_rate": 0.0009699859514532333, "loss": 0.7633, "step": 364 }, { "epoch": 0.13812677388836328, "grad_norm": 4.818123244556063, "learning_rate": 0.000969776372111007, "loss": 0.7947, "step": 365 }, { "epoch": 0.13850520340586567, "grad_norm": 0.45980446780242223, "learning_rate": 0.0009695660863981892, "loss": 0.7862, "step": 366 }, { "epoch": 0.13888363292336803, "grad_norm": 0.41169576996980684, "learning_rate": 0.0009693550946309722, "loss": 0.7731, "step": 367 }, { "epoch": 0.1392620624408704, "grad_norm": 0.29027496403197917, "learning_rate": 0.0009691433971266097, "loss": 0.7528, "step": 368 }, { "epoch": 0.13964049195837275, "grad_norm": 0.32339718148118574, "learning_rate": 0.000968930994203417, "loss": 0.7794, "step": 369 }, { "epoch": 0.1400189214758751, "grad_norm": 0.42251199739114287, "learning_rate": 0.0009687178861807697, "loss": 0.7807, "step": 370 }, { "epoch": 0.1403973509933775, "grad_norm": 0.2843635331451484, "learning_rate": 0.0009685040733791037, "loss": 0.7751, "step": 371 }, { "epoch": 0.14077578051087986, "grad_norm": 1.1196660111771672, "learning_rate": 0.0009682895561199149, "loss": 0.7532, "step": 372 }, { "epoch": 0.14115421002838222, "grad_norm": 0.614666679894674, "learning_rate": 0.000968074334725758, "loss": 0.746, "step": 373 }, { "epoch": 0.14153263954588458, "grad_norm": 0.4514004512267843, "learning_rate": 0.0009678584095202469, "loss": 0.7865, "step": 374 }, { "epoch": 0.14191106906338694, "grad_norm": 0.43851367139219705, "learning_rate": 0.0009676417808280534, "loss": 0.7666, "step": 375 }, { "epoch": 0.14228949858088932, "grad_norm": 0.7491418777186419, "learning_rate": 0.0009674244489749071, "loss": 0.7836, "step": 376 }, { "epoch": 0.14266792809839168, "grad_norm": 1.2215263825486247, "learning_rate": 0.0009672064142875954, "loss": 0.7516, "step": 377 }, { "epoch": 0.14304635761589404, "grad_norm": 0.6083619916601698, "learning_rate": 0.0009669876770939619, "loss": 0.7734, "step": 378 }, { "epoch": 0.1434247871333964, "grad_norm": 0.47389575698862435, "learning_rate": 0.0009667682377229069, "loss": 0.7588, "step": 379 }, { "epoch": 0.14380321665089876, "grad_norm": 0.35440180022878215, "learning_rate": 0.0009665480965043862, "loss": 0.7703, "step": 380 }, { "epoch": 0.14418164616840112, "grad_norm": 0.4018139359992483, "learning_rate": 0.0009663272537694112, "loss": 0.7607, "step": 381 }, { "epoch": 0.1445600756859035, "grad_norm": 0.4284804469702902, "learning_rate": 0.0009661057098500481, "loss": 0.762, "step": 382 }, { "epoch": 0.14493850520340587, "grad_norm": 3.2578891955525506, "learning_rate": 0.0009658834650794171, "loss": 0.782, "step": 383 }, { "epoch": 0.14531693472090823, "grad_norm": 0.5470036491572197, "learning_rate": 0.0009656605197916926, "loss": 0.7983, "step": 384 }, { "epoch": 0.1456953642384106, "grad_norm": 0.48163218865968865, "learning_rate": 0.0009654368743221021, "loss": 0.7817, "step": 385 }, { "epoch": 0.14607379375591295, "grad_norm": 0.3461382546668084, "learning_rate": 0.0009652125290069263, "loss": 0.784, "step": 386 }, { "epoch": 0.14645222327341534, "grad_norm": 0.3070941414132326, "learning_rate": 0.0009649874841834974, "loss": 0.7814, "step": 387 }, { "epoch": 0.1468306527909177, "grad_norm": 0.39181860714216266, "learning_rate": 0.0009647617401902002, "loss": 0.7885, "step": 388 }, { "epoch": 0.14720908230842006, "grad_norm": 0.29118060869363566, "learning_rate": 0.0009645352973664705, "loss": 0.7838, "step": 389 }, { "epoch": 0.14758751182592242, "grad_norm": 0.8644213850328436, "learning_rate": 0.0009643081560527948, "loss": 0.8103, "step": 390 }, { "epoch": 0.14796594134342478, "grad_norm": 0.2959850927188237, "learning_rate": 0.0009640803165907099, "loss": 0.7477, "step": 391 }, { "epoch": 0.14834437086092717, "grad_norm": 0.351473224748773, "learning_rate": 0.0009638517793228027, "loss": 0.7631, "step": 392 }, { "epoch": 0.14872280037842953, "grad_norm": 0.3291728567897442, "learning_rate": 0.0009636225445927087, "loss": 0.7457, "step": 393 }, { "epoch": 0.14910122989593189, "grad_norm": 0.3179061096875065, "learning_rate": 0.0009633926127451127, "loss": 0.7596, "step": 394 }, { "epoch": 0.14947965941343425, "grad_norm": 0.3857072838552675, "learning_rate": 0.0009631619841257475, "loss": 0.7704, "step": 395 }, { "epoch": 0.1498580889309366, "grad_norm": 0.3296680724942003, "learning_rate": 0.0009629306590813934, "loss": 0.7407, "step": 396 }, { "epoch": 0.15023651844843897, "grad_norm": 0.2953685384391499, "learning_rate": 0.0009626986379598782, "loss": 0.7689, "step": 397 }, { "epoch": 0.15061494796594135, "grad_norm": 0.4122656937035812, "learning_rate": 0.0009624659211100761, "loss": 0.7424, "step": 398 }, { "epoch": 0.1509933774834437, "grad_norm": 0.27564464514219217, "learning_rate": 0.0009622325088819076, "loss": 0.7512, "step": 399 }, { "epoch": 0.15137180700094607, "grad_norm": 0.29509820782405827, "learning_rate": 0.0009619984016263386, "loss": 0.7532, "step": 400 }, { "epoch": 0.15175023651844843, "grad_norm": 0.8030996860418512, "learning_rate": 0.0009617635996953802, "loss": 0.7406, "step": 401 }, { "epoch": 0.1521286660359508, "grad_norm": 0.35111568455410197, "learning_rate": 0.000961528103442088, "loss": 0.7508, "step": 402 }, { "epoch": 0.15250709555345318, "grad_norm": 0.3005641674067682, "learning_rate": 0.0009612919132205616, "loss": 0.751, "step": 403 }, { "epoch": 0.15288552507095554, "grad_norm": 0.2680564002999373, "learning_rate": 0.0009610550293859441, "loss": 0.7645, "step": 404 }, { "epoch": 0.1532639545884579, "grad_norm": 0.2925634759920494, "learning_rate": 0.0009608174522944215, "loss": 0.7525, "step": 405 }, { "epoch": 0.15364238410596026, "grad_norm": 0.3040967496222814, "learning_rate": 0.0009605791823032222, "loss": 0.7633, "step": 406 }, { "epoch": 0.15402081362346262, "grad_norm": 0.31675978324673176, "learning_rate": 0.0009603402197706166, "loss": 0.7168, "step": 407 }, { "epoch": 0.154399243140965, "grad_norm": 0.2854091620096729, "learning_rate": 0.0009601005650559161, "loss": 0.752, "step": 408 }, { "epoch": 0.15477767265846737, "grad_norm": 0.3194937593062523, "learning_rate": 0.0009598602185194733, "loss": 0.7325, "step": 409 }, { "epoch": 0.15515610217596973, "grad_norm": 0.25856396944450516, "learning_rate": 0.0009596191805226808, "loss": 0.7377, "step": 410 }, { "epoch": 0.1555345316934721, "grad_norm": 0.3711712806422301, "learning_rate": 0.0009593774514279707, "loss": 0.7764, "step": 411 }, { "epoch": 0.15591296121097445, "grad_norm": 0.28154251993059526, "learning_rate": 0.000959135031598815, "loss": 0.7721, "step": 412 }, { "epoch": 0.1562913907284768, "grad_norm": 0.27559143212327003, "learning_rate": 0.0009588919213997232, "loss": 0.7364, "step": 413 }, { "epoch": 0.1566698202459792, "grad_norm": 0.8567218399339995, "learning_rate": 0.000958648121196244, "loss": 0.7423, "step": 414 }, { "epoch": 0.15704824976348156, "grad_norm": 0.3068679889957997, "learning_rate": 0.0009584036313549629, "loss": 0.7489, "step": 415 }, { "epoch": 0.15742667928098392, "grad_norm": 0.2970697993588201, "learning_rate": 0.0009581584522435024, "loss": 0.7569, "step": 416 }, { "epoch": 0.15780510879848628, "grad_norm": 0.28122231559133504, "learning_rate": 0.0009579125842305217, "loss": 0.77, "step": 417 }, { "epoch": 0.15818353831598864, "grad_norm": 0.24648851188658188, "learning_rate": 0.0009576660276857157, "loss": 0.7357, "step": 418 }, { "epoch": 0.15856196783349102, "grad_norm": 0.4527419872154144, "learning_rate": 0.0009574187829798147, "loss": 0.7536, "step": 419 }, { "epoch": 0.15894039735099338, "grad_norm": 0.30722798749104574, "learning_rate": 0.0009571708504845835, "loss": 0.7695, "step": 420 }, { "epoch": 0.15931882686849574, "grad_norm": 0.2768742792107218, "learning_rate": 0.0009569222305728213, "loss": 0.7522, "step": 421 }, { "epoch": 0.1596972563859981, "grad_norm": 0.27307334902535907, "learning_rate": 0.0009566729236183607, "loss": 0.7162, "step": 422 }, { "epoch": 0.16007568590350046, "grad_norm": 0.3895006018867788, "learning_rate": 0.0009564229299960679, "loss": 0.752, "step": 423 }, { "epoch": 0.16045411542100285, "grad_norm": 0.2965463777024702, "learning_rate": 0.0009561722500818407, "loss": 0.7363, "step": 424 }, { "epoch": 0.1608325449385052, "grad_norm": 0.36970813461462315, "learning_rate": 0.0009559208842526097, "loss": 0.726, "step": 425 }, { "epoch": 0.16121097445600757, "grad_norm": 0.25760972111245406, "learning_rate": 0.0009556688328863363, "loss": 0.7376, "step": 426 }, { "epoch": 0.16158940397350993, "grad_norm": 0.3580829572714847, "learning_rate": 0.0009554160963620132, "loss": 0.7615, "step": 427 }, { "epoch": 0.1619678334910123, "grad_norm": 0.2719281226908318, "learning_rate": 0.0009551626750596628, "loss": 0.7264, "step": 428 }, { "epoch": 0.16234626300851465, "grad_norm": 0.2880314662320415, "learning_rate": 0.0009549085693603371, "loss": 0.7422, "step": 429 }, { "epoch": 0.16272469252601704, "grad_norm": 0.23281772146834182, "learning_rate": 0.0009546537796461179, "loss": 0.7392, "step": 430 }, { "epoch": 0.1631031220435194, "grad_norm": 0.3124729860073785, "learning_rate": 0.0009543983063001149, "loss": 0.733, "step": 431 }, { "epoch": 0.16348155156102176, "grad_norm": 0.2451978755839451, "learning_rate": 0.0009541421497064659, "loss": 0.7378, "step": 432 }, { "epoch": 0.16385998107852412, "grad_norm": 0.24325712943141217, "learning_rate": 0.0009538853102503361, "loss": 0.7544, "step": 433 }, { "epoch": 0.16423841059602648, "grad_norm": 0.23481366908402015, "learning_rate": 0.0009536277883179173, "loss": 0.758, "step": 434 }, { "epoch": 0.16461684011352887, "grad_norm": 0.2943807652591247, "learning_rate": 0.0009533695842964276, "loss": 0.7478, "step": 435 }, { "epoch": 0.16499526963103123, "grad_norm": 0.2947479139194325, "learning_rate": 0.000953110698574111, "loss": 0.7399, "step": 436 }, { "epoch": 0.1653736991485336, "grad_norm": 0.2721973360914397, "learning_rate": 0.0009528511315402357, "loss": 0.7423, "step": 437 }, { "epoch": 0.16575212866603595, "grad_norm": 0.28141181317580544, "learning_rate": 0.0009525908835850955, "loss": 0.7275, "step": 438 }, { "epoch": 0.1661305581835383, "grad_norm": 0.3141448126577589, "learning_rate": 0.0009523299551000071, "loss": 0.7431, "step": 439 }, { "epoch": 0.1665089877010407, "grad_norm": 0.2069799431618452, "learning_rate": 0.0009520683464773109, "loss": 0.7555, "step": 440 }, { "epoch": 0.16688741721854305, "grad_norm": 0.24383849713804154, "learning_rate": 0.0009518060581103698, "loss": 0.7416, "step": 441 }, { "epoch": 0.16726584673604541, "grad_norm": 0.24398579074572238, "learning_rate": 0.0009515430903935692, "loss": 0.7256, "step": 442 }, { "epoch": 0.16764427625354777, "grad_norm": 0.2865649140650446, "learning_rate": 0.0009512794437223152, "loss": 0.7152, "step": 443 }, { "epoch": 0.16802270577105013, "grad_norm": 0.2414304992693735, "learning_rate": 0.0009510151184930353, "loss": 0.7266, "step": 444 }, { "epoch": 0.1684011352885525, "grad_norm": 0.26604658218379323, "learning_rate": 0.0009507501151031776, "loss": 0.7449, "step": 445 }, { "epoch": 0.16877956480605488, "grad_norm": 0.2782869553235475, "learning_rate": 0.0009504844339512095, "loss": 0.7368, "step": 446 }, { "epoch": 0.16915799432355724, "grad_norm": 0.2525019914246391, "learning_rate": 0.0009502180754366175, "loss": 0.7361, "step": 447 }, { "epoch": 0.1695364238410596, "grad_norm": 0.2971652017966633, "learning_rate": 0.0009499510399599066, "loss": 0.7393, "step": 448 }, { "epoch": 0.16991485335856196, "grad_norm": 0.7781814585353098, "learning_rate": 0.0009496833279225998, "loss": 0.7372, "step": 449 }, { "epoch": 0.17029328287606432, "grad_norm": 0.2750411284642318, "learning_rate": 0.0009494149397272374, "loss": 0.739, "step": 450 }, { "epoch": 0.1706717123935667, "grad_norm": 0.30601318214265366, "learning_rate": 0.0009491458757773766, "loss": 0.727, "step": 451 }, { "epoch": 0.17105014191106907, "grad_norm": 0.3032715983675422, "learning_rate": 0.0009488761364775899, "loss": 0.7048, "step": 452 }, { "epoch": 0.17142857142857143, "grad_norm": 0.328705319896619, "learning_rate": 0.0009486057222334663, "loss": 0.7215, "step": 453 }, { "epoch": 0.1718070009460738, "grad_norm": 0.2725301128337919, "learning_rate": 0.0009483346334516091, "loss": 0.7277, "step": 454 }, { "epoch": 0.17218543046357615, "grad_norm": 0.31953375052893696, "learning_rate": 0.0009480628705396359, "loss": 0.7257, "step": 455 }, { "epoch": 0.17256385998107854, "grad_norm": 0.33239237276764616, "learning_rate": 0.000947790433906178, "loss": 0.7321, "step": 456 }, { "epoch": 0.1729422894985809, "grad_norm": 0.7993445930334834, "learning_rate": 0.0009475173239608796, "loss": 0.7422, "step": 457 }, { "epoch": 0.17332071901608326, "grad_norm": 0.3785512185068186, "learning_rate": 0.0009472435411143978, "loss": 0.7334, "step": 458 }, { "epoch": 0.17369914853358562, "grad_norm": 0.8933921791262094, "learning_rate": 0.0009469690857784007, "loss": 0.7573, "step": 459 }, { "epoch": 0.17407757805108798, "grad_norm": 1.3685445535534202, "learning_rate": 0.0009466939583655685, "loss": 0.7334, "step": 460 }, { "epoch": 0.17445600756859034, "grad_norm": 0.9510125686745562, "learning_rate": 0.000946418159289591, "loss": 0.7472, "step": 461 }, { "epoch": 0.17483443708609273, "grad_norm": 0.35703498454404964, "learning_rate": 0.0009461416889651687, "loss": 0.7497, "step": 462 }, { "epoch": 0.17521286660359509, "grad_norm": 0.5597680825860326, "learning_rate": 0.000945864547808011, "loss": 0.6962, "step": 463 }, { "epoch": 0.17559129612109745, "grad_norm": 0.41000585069203566, "learning_rate": 0.000945586736234836, "loss": 0.7229, "step": 464 }, { "epoch": 0.1759697256385998, "grad_norm": 0.38465546710890514, "learning_rate": 0.0009453082546633702, "loss": 0.7201, "step": 465 }, { "epoch": 0.17634815515610217, "grad_norm": 0.3492319957943449, "learning_rate": 0.000945029103512347, "loss": 0.7348, "step": 466 }, { "epoch": 0.17672658467360455, "grad_norm": 0.8224292148404972, "learning_rate": 0.0009447492832015072, "loss": 0.7324, "step": 467 }, { "epoch": 0.1771050141911069, "grad_norm": 0.585043976100825, "learning_rate": 0.0009444687941515973, "loss": 0.7568, "step": 468 }, { "epoch": 0.17748344370860927, "grad_norm": 0.4037086889077928, "learning_rate": 0.0009441876367843694, "loss": 0.7299, "step": 469 }, { "epoch": 0.17786187322611163, "grad_norm": 0.4760104562167927, "learning_rate": 0.0009439058115225807, "loss": 0.7328, "step": 470 }, { "epoch": 0.178240302743614, "grad_norm": 0.4224833235186354, "learning_rate": 0.0009436233187899927, "loss": 0.7137, "step": 471 }, { "epoch": 0.17861873226111638, "grad_norm": 0.44433296957177576, "learning_rate": 0.00094334015901137, "loss": 0.7661, "step": 472 }, { "epoch": 0.17899716177861874, "grad_norm": 0.40737116599583206, "learning_rate": 0.000943056332612481, "loss": 0.7078, "step": 473 }, { "epoch": 0.1793755912961211, "grad_norm": 1.5608861599747224, "learning_rate": 0.0009427718400200958, "loss": 0.7314, "step": 474 }, { "epoch": 0.17975402081362346, "grad_norm": 1.084420496181323, "learning_rate": 0.0009424866816619866, "loss": 0.7397, "step": 475 }, { "epoch": 0.18013245033112582, "grad_norm": 0.5804327962632815, "learning_rate": 0.0009422008579669265, "loss": 0.734, "step": 476 }, { "epoch": 0.18051087984862818, "grad_norm": 0.49805125814266893, "learning_rate": 0.0009419143693646888, "loss": 0.7481, "step": 477 }, { "epoch": 0.18088930936613057, "grad_norm": 0.37887490526309536, "learning_rate": 0.0009416272162860471, "loss": 0.7412, "step": 478 }, { "epoch": 0.18126773888363293, "grad_norm": 0.37045900368228496, "learning_rate": 0.0009413393991627737, "loss": 0.7223, "step": 479 }, { "epoch": 0.1816461684011353, "grad_norm": 0.41672215886933406, "learning_rate": 0.0009410509184276395, "loss": 0.7564, "step": 480 }, { "epoch": 0.18202459791863765, "grad_norm": 0.3516889662568837, "learning_rate": 0.0009407617745144135, "loss": 0.7488, "step": 481 }, { "epoch": 0.18240302743614, "grad_norm": 0.3626806359924242, "learning_rate": 0.0009404719678578611, "loss": 0.729, "step": 482 }, { "epoch": 0.1827814569536424, "grad_norm": 0.34200130343746515, "learning_rate": 0.0009401814988937452, "loss": 0.7312, "step": 483 }, { "epoch": 0.18315988647114476, "grad_norm": 0.27937847892677453, "learning_rate": 0.000939890368058824, "loss": 0.7394, "step": 484 }, { "epoch": 0.18353831598864712, "grad_norm": 1.174650795767306, "learning_rate": 0.0009395985757908509, "loss": 0.7449, "step": 485 }, { "epoch": 0.18391674550614948, "grad_norm": 0.34610617036254193, "learning_rate": 0.0009393061225285742, "loss": 0.7377, "step": 486 }, { "epoch": 0.18429517502365184, "grad_norm": 0.41262797407911844, "learning_rate": 0.0009390130087117356, "loss": 0.7338, "step": 487 }, { "epoch": 0.18467360454115422, "grad_norm": 0.5456569458216602, "learning_rate": 0.0009387192347810703, "loss": 0.7271, "step": 488 }, { "epoch": 0.18505203405865658, "grad_norm": 0.8368315861945903, "learning_rate": 0.0009384248011783063, "loss": 0.7372, "step": 489 }, { "epoch": 0.18543046357615894, "grad_norm": 0.35363952825859335, "learning_rate": 0.0009381297083461632, "loss": 0.7221, "step": 490 }, { "epoch": 0.1858088930936613, "grad_norm": 0.29456001465070863, "learning_rate": 0.0009378339567283518, "loss": 0.7278, "step": 491 }, { "epoch": 0.18618732261116366, "grad_norm": 0.32215314115807275, "learning_rate": 0.0009375375467695735, "loss": 0.7109, "step": 492 }, { "epoch": 0.18656575212866602, "grad_norm": 0.45456396371371577, "learning_rate": 0.0009372404789155198, "loss": 0.7122, "step": 493 }, { "epoch": 0.1869441816461684, "grad_norm": 0.3506798578550395, "learning_rate": 0.0009369427536128714, "loss": 0.7129, "step": 494 }, { "epoch": 0.18732261116367077, "grad_norm": 0.44769754335842654, "learning_rate": 0.0009366443713092974, "loss": 0.7173, "step": 495 }, { "epoch": 0.18770104068117313, "grad_norm": 2.490620374495783, "learning_rate": 0.0009363453324534546, "loss": 0.7554, "step": 496 }, { "epoch": 0.1880794701986755, "grad_norm": 0.5434451860232015, "learning_rate": 0.0009360456374949877, "loss": 0.7606, "step": 497 }, { "epoch": 0.18845789971617785, "grad_norm": 0.5008507881148448, "learning_rate": 0.0009357452868845273, "loss": 0.7296, "step": 498 }, { "epoch": 0.18883632923368024, "grad_norm": 0.4611123610036576, "learning_rate": 0.0009354442810736899, "loss": 0.7203, "step": 499 }, { "epoch": 0.1892147587511826, "grad_norm": 0.3921082374752347, "learning_rate": 0.0009351426205150777, "loss": 0.7096, "step": 500 }, { "epoch": 0.18959318826868496, "grad_norm": 0.2994501459185999, "learning_rate": 0.0009348403056622768, "loss": 0.7123, "step": 501 }, { "epoch": 0.18997161778618732, "grad_norm": 0.39244426513809144, "learning_rate": 0.0009345373369698573, "loss": 0.7584, "step": 502 }, { "epoch": 0.19035004730368968, "grad_norm": 0.31902177621766903, "learning_rate": 0.0009342337148933726, "loss": 0.7152, "step": 503 }, { "epoch": 0.19072847682119207, "grad_norm": 0.3402804057783167, "learning_rate": 0.0009339294398893586, "loss": 0.7239, "step": 504 }, { "epoch": 0.19110690633869443, "grad_norm": 0.4067458647791497, "learning_rate": 0.0009336245124153325, "loss": 0.7544, "step": 505 }, { "epoch": 0.1914853358561968, "grad_norm": 0.2982804509307565, "learning_rate": 0.0009333189329297931, "loss": 0.6988, "step": 506 }, { "epoch": 0.19186376537369915, "grad_norm": 0.3275507602411181, "learning_rate": 0.0009330127018922195, "loss": 0.7344, "step": 507 }, { "epoch": 0.1922421948912015, "grad_norm": 0.27976731381603004, "learning_rate": 0.0009327058197630697, "loss": 0.7232, "step": 508 }, { "epoch": 0.19262062440870387, "grad_norm": 0.3522241512070939, "learning_rate": 0.0009323982870037823, "loss": 0.7449, "step": 509 }, { "epoch": 0.19299905392620625, "grad_norm": 1.1142987336543964, "learning_rate": 0.0009320901040767726, "loss": 0.739, "step": 510 }, { "epoch": 0.19337748344370861, "grad_norm": 0.32594714160531324, "learning_rate": 0.0009317812714454343, "loss": 0.7054, "step": 511 }, { "epoch": 0.19375591296121097, "grad_norm": 0.35747000365025344, "learning_rate": 0.0009314717895741383, "loss": 0.703, "step": 512 }, { "epoch": 0.19413434247871333, "grad_norm": 0.4197928651866732, "learning_rate": 0.0009311616589282307, "loss": 0.7328, "step": 513 }, { "epoch": 0.1945127719962157, "grad_norm": 0.29544367324225446, "learning_rate": 0.0009308508799740341, "loss": 0.7552, "step": 514 }, { "epoch": 0.19489120151371808, "grad_norm": 0.28501261398776917, "learning_rate": 0.0009305394531788456, "loss": 0.7213, "step": 515 }, { "epoch": 0.19526963103122044, "grad_norm": 0.26872342444030817, "learning_rate": 0.0009302273790109361, "loss": 0.7378, "step": 516 }, { "epoch": 0.1956480605487228, "grad_norm": 0.30530168301368027, "learning_rate": 0.0009299146579395503, "loss": 0.7325, "step": 517 }, { "epoch": 0.19602649006622516, "grad_norm": 0.310462628339248, "learning_rate": 0.0009296012904349057, "loss": 0.7172, "step": 518 }, { "epoch": 0.19640491958372752, "grad_norm": 0.26642174997293777, "learning_rate": 0.0009292872769681912, "loss": 0.725, "step": 519 }, { "epoch": 0.1967833491012299, "grad_norm": 0.24209743794066804, "learning_rate": 0.0009289726180115677, "loss": 0.7366, "step": 520 }, { "epoch": 0.19716177861873227, "grad_norm": 0.2299634518169195, "learning_rate": 0.0009286573140381662, "loss": 0.7207, "step": 521 }, { "epoch": 0.19754020813623463, "grad_norm": 0.2946642325025308, "learning_rate": 0.0009283413655220876, "loss": 0.7148, "step": 522 }, { "epoch": 0.197918637653737, "grad_norm": 0.25236592741553854, "learning_rate": 0.0009280247729384024, "loss": 0.7304, "step": 523 }, { "epoch": 0.19829706717123935, "grad_norm": 0.2927185388847136, "learning_rate": 0.0009277075367631486, "loss": 0.7417, "step": 524 }, { "epoch": 0.1986754966887417, "grad_norm": 0.24175599327271996, "learning_rate": 0.0009273896574733334, "loss": 0.7317, "step": 525 }, { "epoch": 0.1990539262062441, "grad_norm": 0.21537524284973444, "learning_rate": 0.0009270711355469294, "loss": 0.7186, "step": 526 }, { "epoch": 0.19943235572374646, "grad_norm": 0.8067250426965202, "learning_rate": 0.0009267519714628765, "loss": 0.7119, "step": 527 }, { "epoch": 0.19981078524124882, "grad_norm": 0.28820266944888545, "learning_rate": 0.00092643216570108, "loss": 0.7327, "step": 528 }, { "epoch": 0.20018921475875118, "grad_norm": 0.28553652555441955, "learning_rate": 0.0009261117187424096, "loss": 0.7157, "step": 529 }, { "epoch": 0.20056764427625354, "grad_norm": 0.2804569348155077, "learning_rate": 0.0009257906310686999, "loss": 0.7106, "step": 530 }, { "epoch": 0.20094607379375592, "grad_norm": 0.24515121345379196, "learning_rate": 0.0009254689031627482, "loss": 0.7071, "step": 531 }, { "epoch": 0.20132450331125828, "grad_norm": 0.2610779076135333, "learning_rate": 0.0009251465355083148, "loss": 0.7021, "step": 532 }, { "epoch": 0.20170293282876064, "grad_norm": 0.3053910129688108, "learning_rate": 0.000924823528590122, "loss": 0.7194, "step": 533 }, { "epoch": 0.202081362346263, "grad_norm": 0.2544885414078993, "learning_rate": 0.0009244998828938531, "loss": 0.7276, "step": 534 }, { "epoch": 0.20245979186376536, "grad_norm": 0.23932670846923215, "learning_rate": 0.000924175598906152, "loss": 0.7211, "step": 535 }, { "epoch": 0.20283822138126775, "grad_norm": 0.8426939395261552, "learning_rate": 0.0009238506771146222, "loss": 0.7758, "step": 536 }, { "epoch": 0.2032166508987701, "grad_norm": 0.2990108006433971, "learning_rate": 0.0009235251180078268, "loss": 0.6854, "step": 537 }, { "epoch": 0.20359508041627247, "grad_norm": 5.204567282624486, "learning_rate": 0.0009231989220752862, "loss": 0.7203, "step": 538 }, { "epoch": 0.20397350993377483, "grad_norm": 0.5186341937765667, "learning_rate": 0.0009228720898074793, "loss": 0.7634, "step": 539 }, { "epoch": 0.2043519394512772, "grad_norm": 0.4915522457054819, "learning_rate": 0.0009225446216958413, "loss": 0.7283, "step": 540 }, { "epoch": 0.20473036896877955, "grad_norm": 0.4011192576995025, "learning_rate": 0.0009222165182327636, "loss": 0.7072, "step": 541 }, { "epoch": 0.20510879848628194, "grad_norm": 0.282005236474753, "learning_rate": 0.0009218877799115928, "loss": 0.7231, "step": 542 }, { "epoch": 0.2054872280037843, "grad_norm": 2.643251816726128, "learning_rate": 0.0009215584072266305, "loss": 0.7455, "step": 543 }, { "epoch": 0.20586565752128666, "grad_norm": 0.2647447023879314, "learning_rate": 0.0009212284006731318, "loss": 0.7241, "step": 544 }, { "epoch": 0.20624408703878902, "grad_norm": 0.24396804050094656, "learning_rate": 0.0009208977607473046, "loss": 0.749, "step": 545 }, { "epoch": 0.20662251655629138, "grad_norm": 0.356512779226154, "learning_rate": 0.00092056648794631, "loss": 0.7058, "step": 546 }, { "epoch": 0.20700094607379377, "grad_norm": 0.2805440440363066, "learning_rate": 0.0009202345827682601, "loss": 0.7236, "step": 547 }, { "epoch": 0.20737937559129613, "grad_norm": 0.2730427071794903, "learning_rate": 0.000919902045712218, "loss": 0.6792, "step": 548 }, { "epoch": 0.2077578051087985, "grad_norm": 0.3277537198092039, "learning_rate": 0.000919568877278197, "loss": 0.7128, "step": 549 }, { "epoch": 0.20813623462630085, "grad_norm": 0.347423872276074, "learning_rate": 0.0009192350779671594, "loss": 0.6873, "step": 550 }, { "epoch": 0.2085146641438032, "grad_norm": 0.26609488607400417, "learning_rate": 0.0009189006482810167, "loss": 0.7065, "step": 551 }, { "epoch": 0.2088930936613056, "grad_norm": 0.25582132711775785, "learning_rate": 0.0009185655887226278, "loss": 0.7278, "step": 552 }, { "epoch": 0.20927152317880796, "grad_norm": 0.4118079398327244, "learning_rate": 0.0009182298997957989, "loss": 0.6974, "step": 553 }, { "epoch": 0.20964995269631032, "grad_norm": 0.3216714062345679, "learning_rate": 0.0009178935820052823, "loss": 0.7143, "step": 554 }, { "epoch": 0.21002838221381268, "grad_norm": 0.23528990195026728, "learning_rate": 0.0009175566358567764, "loss": 0.708, "step": 555 }, { "epoch": 0.21040681173131504, "grad_norm": 0.3121982190198337, "learning_rate": 0.0009172190618569236, "loss": 0.7392, "step": 556 }, { "epoch": 0.2107852412488174, "grad_norm": 1.2905217090127177, "learning_rate": 0.0009168808605133112, "loss": 0.6892, "step": 557 }, { "epoch": 0.21116367076631978, "grad_norm": 0.24447707834531593, "learning_rate": 0.0009165420323344693, "loss": 0.6953, "step": 558 }, { "epoch": 0.21154210028382214, "grad_norm": 0.2783071414441072, "learning_rate": 0.0009162025778298708, "loss": 0.7166, "step": 559 }, { "epoch": 0.2119205298013245, "grad_norm": 0.35425557628703547, "learning_rate": 0.0009158624975099299, "loss": 0.7288, "step": 560 }, { "epoch": 0.21229895931882686, "grad_norm": 0.2508784892909177, "learning_rate": 0.0009155217918860024, "loss": 0.6914, "step": 561 }, { "epoch": 0.21267738883632922, "grad_norm": 19.1140176992547, "learning_rate": 0.0009151804614703839, "loss": 0.7789, "step": 562 }, { "epoch": 0.2130558183538316, "grad_norm": 1.4608927837199082, "learning_rate": 0.0009148385067763095, "loss": 0.7062, "step": 563 }, { "epoch": 0.21343424787133397, "grad_norm": 0.6283673886304277, "learning_rate": 0.0009144959283179533, "loss": 0.7182, "step": 564 }, { "epoch": 0.21381267738883633, "grad_norm": 0.5667020880156339, "learning_rate": 0.0009141527266104269, "loss": 0.7658, "step": 565 }, { "epoch": 0.2141911069063387, "grad_norm": 0.4404514079013945, "learning_rate": 0.0009138089021697793, "loss": 0.7273, "step": 566 }, { "epoch": 0.21456953642384105, "grad_norm": 0.32396232851923745, "learning_rate": 0.0009134644555129958, "loss": 0.6837, "step": 567 }, { "epoch": 0.21494796594134344, "grad_norm": 0.33195151325282546, "learning_rate": 0.0009131193871579975, "loss": 0.7142, "step": 568 }, { "epoch": 0.2153263954588458, "grad_norm": 0.5840577554688643, "learning_rate": 0.0009127736976236396, "loss": 0.7337, "step": 569 }, { "epoch": 0.21570482497634816, "grad_norm": 0.3761587717026099, "learning_rate": 0.0009124273874297122, "loss": 0.7293, "step": 570 }, { "epoch": 0.21608325449385052, "grad_norm": 0.3328639192215923, "learning_rate": 0.0009120804570969381, "loss": 0.6816, "step": 571 }, { "epoch": 0.21646168401135288, "grad_norm": 0.25950046859728304, "learning_rate": 0.0009117329071469726, "loss": 0.7158, "step": 572 }, { "epoch": 0.21684011352885524, "grad_norm": 1.5972096161644342, "learning_rate": 0.000911384738102403, "loss": 0.7058, "step": 573 }, { "epoch": 0.21721854304635763, "grad_norm": 0.22169299652882682, "learning_rate": 0.0009110359504867472, "loss": 0.7087, "step": 574 }, { "epoch": 0.21759697256385999, "grad_norm": 0.5373666597913932, "learning_rate": 0.000910686544824453, "loss": 0.7038, "step": 575 }, { "epoch": 0.21797540208136235, "grad_norm": 0.40347745111038724, "learning_rate": 0.0009103365216408983, "loss": 0.7174, "step": 576 }, { "epoch": 0.2183538315988647, "grad_norm": 0.2401263141278395, "learning_rate": 0.0009099858814623886, "loss": 0.7192, "step": 577 }, { "epoch": 0.21873226111636707, "grad_norm": 0.23431441795661043, "learning_rate": 0.0009096346248161578, "loss": 0.7357, "step": 578 }, { "epoch": 0.21911069063386945, "grad_norm": 0.28663654708144953, "learning_rate": 0.0009092827522303662, "loss": 0.7102, "step": 579 }, { "epoch": 0.2194891201513718, "grad_norm": 0.24055426319567197, "learning_rate": 0.0009089302642341008, "loss": 0.679, "step": 580 }, { "epoch": 0.21986754966887417, "grad_norm": 0.2502980988178069, "learning_rate": 0.0009085771613573737, "loss": 0.6916, "step": 581 }, { "epoch": 0.22024597918637653, "grad_norm": 1.1379534121632966, "learning_rate": 0.0009082234441311213, "loss": 0.707, "step": 582 }, { "epoch": 0.2206244087038789, "grad_norm": 0.32482637205700965, "learning_rate": 0.0009078691130872044, "loss": 0.6908, "step": 583 }, { "epoch": 0.22100283822138128, "grad_norm": 0.599663101423535, "learning_rate": 0.0009075141687584057, "loss": 0.711, "step": 584 }, { "epoch": 0.22138126773888364, "grad_norm": 0.312669124138043, "learning_rate": 0.0009071586116784312, "loss": 0.7186, "step": 585 }, { "epoch": 0.221759697256386, "grad_norm": 0.2938633294329555, "learning_rate": 0.0009068024423819078, "loss": 0.6941, "step": 586 }, { "epoch": 0.22213812677388836, "grad_norm": 0.2784599489309487, "learning_rate": 0.0009064456614043825, "loss": 0.7056, "step": 587 }, { "epoch": 0.22251655629139072, "grad_norm": 0.5808343806273306, "learning_rate": 0.0009060882692823229, "loss": 0.7236, "step": 588 }, { "epoch": 0.22289498580889308, "grad_norm": 0.2973680793362583, "learning_rate": 0.0009057302665531149, "loss": 0.7051, "step": 589 }, { "epoch": 0.22327341532639547, "grad_norm": 1.1153935253738578, "learning_rate": 0.0009053716537550625, "loss": 0.7057, "step": 590 }, { "epoch": 0.22365184484389783, "grad_norm": 0.4588263381619234, "learning_rate": 0.0009050124314273876, "loss": 0.7048, "step": 591 }, { "epoch": 0.2240302743614002, "grad_norm": 0.41542997014978117, "learning_rate": 0.0009046526001102279, "loss": 0.6766, "step": 592 }, { "epoch": 0.22440870387890255, "grad_norm": 0.30348356512119057, "learning_rate": 0.0009042921603446374, "loss": 0.6857, "step": 593 }, { "epoch": 0.2247871333964049, "grad_norm": 0.8488364727032865, "learning_rate": 0.0009039311126725847, "loss": 0.7021, "step": 594 }, { "epoch": 0.2251655629139073, "grad_norm": 0.3333561437471218, "learning_rate": 0.0009035694576369522, "loss": 0.7157, "step": 595 }, { "epoch": 0.22554399243140966, "grad_norm": 0.376000103894686, "learning_rate": 0.0009032071957815361, "loss": 0.7065, "step": 596 }, { "epoch": 0.22592242194891202, "grad_norm": 1.5338798827350935, "learning_rate": 0.0009028443276510447, "loss": 0.7331, "step": 597 }, { "epoch": 0.22630085146641438, "grad_norm": 0.30424359581130644, "learning_rate": 0.000902480853791098, "loss": 0.7107, "step": 598 }, { "epoch": 0.22667928098391674, "grad_norm": 0.38204758966057767, "learning_rate": 0.0009021167747482267, "loss": 0.7198, "step": 599 }, { "epoch": 0.22705771050141912, "grad_norm": 0.33387105257335065, "learning_rate": 0.0009017520910698716, "loss": 0.6903, "step": 600 }, { "epoch": 0.22743614001892148, "grad_norm": 0.2783907750931062, "learning_rate": 0.0009013868033043827, "loss": 0.6915, "step": 601 }, { "epoch": 0.22781456953642384, "grad_norm": 0.48936832725487056, "learning_rate": 0.000901020912001018, "loss": 0.69, "step": 602 }, { "epoch": 0.2281929990539262, "grad_norm": 0.31144002915928043, "learning_rate": 0.0009006544177099434, "loss": 0.6768, "step": 603 }, { "epoch": 0.22857142857142856, "grad_norm": 0.3111360107947153, "learning_rate": 0.0009002873209822312, "loss": 0.6906, "step": 604 }, { "epoch": 0.22894985808893092, "grad_norm": 1.8973351563029177, "learning_rate": 0.0008999196223698598, "loss": 0.7169, "step": 605 }, { "epoch": 0.2293282876064333, "grad_norm": 0.35128188059831417, "learning_rate": 0.0008995513224257122, "loss": 0.7392, "step": 606 }, { "epoch": 0.22970671712393567, "grad_norm": 0.829071871132628, "learning_rate": 0.0008991824217035759, "loss": 0.7187, "step": 607 }, { "epoch": 0.23008514664143803, "grad_norm": 0.5383458003302197, "learning_rate": 0.0008988129207581417, "loss": 0.7058, "step": 608 }, { "epoch": 0.2304635761589404, "grad_norm": 0.4184028118532764, "learning_rate": 0.000898442820145003, "loss": 0.7186, "step": 609 }, { "epoch": 0.23084200567644275, "grad_norm": 0.6420818139796153, "learning_rate": 0.0008980721204206548, "loss": 0.7236, "step": 610 }, { "epoch": 0.23122043519394514, "grad_norm": 0.41666669448216664, "learning_rate": 0.0008977008221424927, "loss": 0.6991, "step": 611 }, { "epoch": 0.2315988647114475, "grad_norm": 0.27783822587386486, "learning_rate": 0.0008973289258688126, "loss": 0.7074, "step": 612 }, { "epoch": 0.23197729422894986, "grad_norm": 0.3780403665138832, "learning_rate": 0.0008969564321588095, "loss": 0.6964, "step": 613 }, { "epoch": 0.23235572374645222, "grad_norm": 0.2831948903958402, "learning_rate": 0.0008965833415725768, "loss": 0.6866, "step": 614 }, { "epoch": 0.23273415326395458, "grad_norm": 0.2839455311693417, "learning_rate": 0.0008962096546711051, "loss": 0.6844, "step": 615 }, { "epoch": 0.23311258278145697, "grad_norm": 4.0158869438975735, "learning_rate": 0.0008958353720162819, "loss": 0.7392, "step": 616 }, { "epoch": 0.23349101229895933, "grad_norm": 1.547046228044839, "learning_rate": 0.0008954604941708906, "loss": 0.7265, "step": 617 }, { "epoch": 0.2338694418164617, "grad_norm": 0.4069733781730824, "learning_rate": 0.0008950850216986091, "loss": 0.6807, "step": 618 }, { "epoch": 0.23424787133396405, "grad_norm": 0.3992355805971877, "learning_rate": 0.00089470895516401, "loss": 0.6993, "step": 619 }, { "epoch": 0.2346263008514664, "grad_norm": 0.49444290384428824, "learning_rate": 0.0008943322951325583, "loss": 0.7419, "step": 620 }, { "epoch": 0.23500473036896877, "grad_norm": 0.3928997942449429, "learning_rate": 0.0008939550421706124, "loss": 0.6897, "step": 621 }, { "epoch": 0.23538315988647115, "grad_norm": 8.093025558041038, "learning_rate": 0.0008935771968454216, "loss": 0.7227, "step": 622 }, { "epoch": 0.23576158940397351, "grad_norm": 0.5608961233535892, "learning_rate": 0.0008931987597251261, "loss": 0.7052, "step": 623 }, { "epoch": 0.23614001892147587, "grad_norm": 0.4344431658987114, "learning_rate": 0.0008928197313787558, "loss": 0.7199, "step": 624 }, { "epoch": 0.23651844843897823, "grad_norm": 0.3562052924563004, "learning_rate": 0.0008924401123762298, "loss": 0.677, "step": 625 }, { "epoch": 0.2368968779564806, "grad_norm": 0.27795463766348716, "learning_rate": 0.0008920599032883553, "loss": 0.6738, "step": 626 }, { "epoch": 0.23727530747398298, "grad_norm": 0.37545583755952866, "learning_rate": 0.0008916791046868264, "loss": 0.7103, "step": 627 }, { "epoch": 0.23765373699148534, "grad_norm": 0.43926194164867755, "learning_rate": 0.0008912977171442242, "loss": 0.7126, "step": 628 }, { "epoch": 0.2380321665089877, "grad_norm": 0.2549512632395183, "learning_rate": 0.000890915741234015, "loss": 0.7148, "step": 629 }, { "epoch": 0.23841059602649006, "grad_norm": 0.2584686599083744, "learning_rate": 0.0008905331775305496, "loss": 0.7277, "step": 630 }, { "epoch": 0.23878902554399242, "grad_norm": 0.20055136303696453, "learning_rate": 0.0008901500266090632, "loss": 0.7079, "step": 631 }, { "epoch": 0.2391674550614948, "grad_norm": 2.556475745495092, "learning_rate": 0.0008897662890456735, "loss": 0.7146, "step": 632 }, { "epoch": 0.23954588457899717, "grad_norm": 0.2540459761929498, "learning_rate": 0.0008893819654173803, "loss": 0.682, "step": 633 }, { "epoch": 0.23992431409649953, "grad_norm": 0.28445973967729826, "learning_rate": 0.000888997056302065, "loss": 0.7077, "step": 634 }, { "epoch": 0.2403027436140019, "grad_norm": 0.5027307802203694, "learning_rate": 0.0008886115622784889, "loss": 0.7319, "step": 635 }, { "epoch": 0.24068117313150425, "grad_norm": 0.22359251300568228, "learning_rate": 0.000888225483926293, "loss": 0.7058, "step": 636 }, { "epoch": 0.2410596026490066, "grad_norm": 0.8604270737382697, "learning_rate": 0.000887838821825997, "loss": 0.6848, "step": 637 }, { "epoch": 0.241438032166509, "grad_norm": 0.29953421902167604, "learning_rate": 0.0008874515765589981, "loss": 0.705, "step": 638 }, { "epoch": 0.24181646168401136, "grad_norm": 0.21209843257828928, "learning_rate": 0.0008870637487075708, "loss": 0.6944, "step": 639 }, { "epoch": 0.24219489120151372, "grad_norm": 0.2660699511607692, "learning_rate": 0.0008866753388548649, "loss": 0.709, "step": 640 }, { "epoch": 0.24257332071901608, "grad_norm": 0.21318263404479892, "learning_rate": 0.0008862863475849061, "loss": 0.7139, "step": 641 }, { "epoch": 0.24295175023651844, "grad_norm": 1.545697390121761, "learning_rate": 0.0008858967754825939, "loss": 0.7065, "step": 642 }, { "epoch": 0.24333017975402083, "grad_norm": 0.41458005768074535, "learning_rate": 0.0008855066231337009, "loss": 0.6783, "step": 643 }, { "epoch": 0.24370860927152319, "grad_norm": 0.3839161980178078, "learning_rate": 0.0008851158911248728, "loss": 0.6799, "step": 644 }, { "epoch": 0.24408703878902555, "grad_norm": 0.33892342488310545, "learning_rate": 0.0008847245800436266, "loss": 0.7111, "step": 645 }, { "epoch": 0.2444654683065279, "grad_norm": 0.262931589843143, "learning_rate": 0.0008843326904783498, "loss": 0.7214, "step": 646 }, { "epoch": 0.24484389782403027, "grad_norm": 0.31554992793307074, "learning_rate": 0.0008839402230183001, "loss": 0.6969, "step": 647 }, { "epoch": 0.24522232734153265, "grad_norm": 0.3017898468494826, "learning_rate": 0.0008835471782536038, "loss": 0.6756, "step": 648 }, { "epoch": 0.245600756859035, "grad_norm": 0.28520432158482306, "learning_rate": 0.0008831535567752557, "loss": 0.6848, "step": 649 }, { "epoch": 0.24597918637653737, "grad_norm": 0.23913010283398112, "learning_rate": 0.0008827593591751172, "loss": 0.6899, "step": 650 }, { "epoch": 0.24635761589403973, "grad_norm": 0.24229533348249946, "learning_rate": 0.0008823645860459164, "loss": 0.7048, "step": 651 }, { "epoch": 0.2467360454115421, "grad_norm": 0.22785576485418108, "learning_rate": 0.0008819692379812467, "loss": 0.7189, "step": 652 }, { "epoch": 0.24711447492904445, "grad_norm": 0.3114266080656945, "learning_rate": 0.0008815733155755658, "loss": 0.7124, "step": 653 }, { "epoch": 0.24749290444654684, "grad_norm": 0.22955736087070286, "learning_rate": 0.0008811768194241952, "loss": 0.6881, "step": 654 }, { "epoch": 0.2478713339640492, "grad_norm": 0.25510979553644064, "learning_rate": 0.000880779750123319, "loss": 0.6824, "step": 655 }, { "epoch": 0.24824976348155156, "grad_norm": 0.23542263809721484, "learning_rate": 0.0008803821082699832, "loss": 0.7075, "step": 656 }, { "epoch": 0.24862819299905392, "grad_norm": 0.24255675192521542, "learning_rate": 0.0008799838944620946, "loss": 0.6843, "step": 657 }, { "epoch": 0.24900662251655628, "grad_norm": 0.2891289789203938, "learning_rate": 0.0008795851092984203, "loss": 0.7098, "step": 658 }, { "epoch": 0.24938505203405867, "grad_norm": 0.24507167287206547, "learning_rate": 0.0008791857533785859, "loss": 0.722, "step": 659 }, { "epoch": 0.24976348155156103, "grad_norm": 0.21410931854870882, "learning_rate": 0.0008787858273030757, "loss": 0.6877, "step": 660 }, { "epoch": 0.25014191106906336, "grad_norm": 0.4972567255582753, "learning_rate": 0.0008783853316732313, "loss": 0.7126, "step": 661 }, { "epoch": 0.25052034058656575, "grad_norm": 0.18990381452333582, "learning_rate": 0.0008779842670912504, "loss": 0.6996, "step": 662 }, { "epoch": 0.25089877010406814, "grad_norm": 0.26228844144250707, "learning_rate": 0.0008775826341601866, "loss": 0.7239, "step": 663 }, { "epoch": 0.25127719962157047, "grad_norm": 1.6689001694450543, "learning_rate": 0.0008771804334839476, "loss": 0.7172, "step": 664 }, { "epoch": 0.25165562913907286, "grad_norm": 0.27725602495304696, "learning_rate": 0.0008767776656672952, "loss": 0.6889, "step": 665 }, { "epoch": 0.2520340586565752, "grad_norm": 0.205340843857023, "learning_rate": 0.0008763743313158439, "loss": 0.6941, "step": 666 }, { "epoch": 0.2524124881740776, "grad_norm": 0.22568273738102715, "learning_rate": 0.0008759704310360597, "loss": 0.6839, "step": 667 }, { "epoch": 0.25279091769157996, "grad_norm": 0.22821057981011408, "learning_rate": 0.0008755659654352599, "loss": 0.6761, "step": 668 }, { "epoch": 0.2531693472090823, "grad_norm": 0.22934624921812063, "learning_rate": 0.000875160935121612, "loss": 0.714, "step": 669 }, { "epoch": 0.2535477767265847, "grad_norm": 0.24538792707969503, "learning_rate": 0.0008747553407041321, "loss": 0.6953, "step": 670 }, { "epoch": 0.253926206244087, "grad_norm": 0.2207775496856464, "learning_rate": 0.0008743491827926849, "loss": 0.6928, "step": 671 }, { "epoch": 0.2543046357615894, "grad_norm": 0.21734743404488477, "learning_rate": 0.0008739424619979824, "loss": 0.7201, "step": 672 }, { "epoch": 0.2546830652790918, "grad_norm": 0.24426370951276594, "learning_rate": 0.0008735351789315824, "loss": 0.6816, "step": 673 }, { "epoch": 0.2550614947965941, "grad_norm": 0.18741335456310848, "learning_rate": 0.000873127334205889, "loss": 0.6826, "step": 674 }, { "epoch": 0.2554399243140965, "grad_norm": 0.20122405181075587, "learning_rate": 0.0008727189284341502, "loss": 0.7405, "step": 675 }, { "epoch": 0.25581835383159884, "grad_norm": 0.20754310025279926, "learning_rate": 0.0008723099622304578, "loss": 0.7117, "step": 676 }, { "epoch": 0.25619678334910123, "grad_norm": 0.24132336764649504, "learning_rate": 0.0008719004362097465, "loss": 0.702, "step": 677 }, { "epoch": 0.2565752128666036, "grad_norm": 0.20132259167375077, "learning_rate": 0.0008714903509877925, "loss": 0.6971, "step": 678 }, { "epoch": 0.25695364238410595, "grad_norm": 0.1970717733459467, "learning_rate": 0.0008710797071812126, "loss": 0.6718, "step": 679 }, { "epoch": 0.25733207190160834, "grad_norm": 0.22202237567173197, "learning_rate": 0.0008706685054074644, "loss": 0.7198, "step": 680 }, { "epoch": 0.25771050141911067, "grad_norm": 0.20510691486091284, "learning_rate": 0.0008702567462848432, "loss": 0.6978, "step": 681 }, { "epoch": 0.25808893093661306, "grad_norm": 0.2848327578517784, "learning_rate": 0.0008698444304324835, "loss": 0.7201, "step": 682 }, { "epoch": 0.25846736045411545, "grad_norm": 0.22437889879393727, "learning_rate": 0.0008694315584703562, "loss": 0.7188, "step": 683 }, { "epoch": 0.2588457899716178, "grad_norm": 0.20597556705910733, "learning_rate": 0.0008690181310192686, "loss": 0.6895, "step": 684 }, { "epoch": 0.25922421948912017, "grad_norm": 0.21729906016237396, "learning_rate": 0.0008686041487008636, "loss": 0.7193, "step": 685 }, { "epoch": 0.2596026490066225, "grad_norm": 1.0258672278718608, "learning_rate": 0.0008681896121376176, "loss": 0.6832, "step": 686 }, { "epoch": 0.2599810785241249, "grad_norm": 0.24761486730588297, "learning_rate": 0.0008677745219528414, "loss": 0.7393, "step": 687 }, { "epoch": 0.2603595080416273, "grad_norm": 0.2441925298828393, "learning_rate": 0.0008673588787706773, "loss": 0.672, "step": 688 }, { "epoch": 0.2607379375591296, "grad_norm": 0.2465113774666491, "learning_rate": 0.0008669426832160996, "loss": 0.67, "step": 689 }, { "epoch": 0.261116367076632, "grad_norm": 0.2509788483393139, "learning_rate": 0.0008665259359149131, "loss": 0.719, "step": 690 }, { "epoch": 0.2614947965941343, "grad_norm": 0.2831093205285011, "learning_rate": 0.0008661086374937524, "loss": 0.7018, "step": 691 }, { "epoch": 0.2618732261116367, "grad_norm": 0.2250055054150011, "learning_rate": 0.0008656907885800805, "loss": 0.6866, "step": 692 }, { "epoch": 0.26225165562913905, "grad_norm": 0.19896092280022598, "learning_rate": 0.000865272389802188, "loss": 0.6986, "step": 693 }, { "epoch": 0.26263008514664143, "grad_norm": 0.29425165935379066, "learning_rate": 0.0008648534417891926, "loss": 0.6941, "step": 694 }, { "epoch": 0.2630085146641438, "grad_norm": 0.21082011241383836, "learning_rate": 0.0008644339451710382, "loss": 0.6887, "step": 695 }, { "epoch": 0.26338694418164615, "grad_norm": 0.19383310796689582, "learning_rate": 0.0008640139005784924, "loss": 0.6946, "step": 696 }, { "epoch": 0.26376537369914854, "grad_norm": 0.19362471898540595, "learning_rate": 0.0008635933086431481, "loss": 0.6877, "step": 697 }, { "epoch": 0.2641438032166509, "grad_norm": 0.20935754347380933, "learning_rate": 0.0008631721699974204, "loss": 0.6842, "step": 698 }, { "epoch": 0.26452223273415326, "grad_norm": 0.2114959142360183, "learning_rate": 0.0008627504852745468, "loss": 0.6762, "step": 699 }, { "epoch": 0.26490066225165565, "grad_norm": 0.23319882272301304, "learning_rate": 0.0008623282551085856, "loss": 0.6732, "step": 700 }, { "epoch": 0.265279091769158, "grad_norm": 0.2041714562288951, "learning_rate": 0.0008619054801344155, "loss": 0.6857, "step": 701 }, { "epoch": 0.26565752128666037, "grad_norm": 0.21483998728838394, "learning_rate": 0.0008614821609877343, "loss": 0.7086, "step": 702 }, { "epoch": 0.2660359508041627, "grad_norm": 0.2324871324239393, "learning_rate": 0.0008610582983050581, "loss": 0.6861, "step": 703 }, { "epoch": 0.2664143803216651, "grad_norm": 0.20217040306573794, "learning_rate": 0.00086063389272372, "loss": 0.6853, "step": 704 }, { "epoch": 0.2667928098391675, "grad_norm": 0.2278947955493039, "learning_rate": 0.00086020894488187, "loss": 0.7071, "step": 705 }, { "epoch": 0.2671712393566698, "grad_norm": 0.23906142290658094, "learning_rate": 0.0008597834554184729, "loss": 0.6745, "step": 706 }, { "epoch": 0.2675496688741722, "grad_norm": 0.25175796757375885, "learning_rate": 0.0008593574249733079, "loss": 0.694, "step": 707 }, { "epoch": 0.26792809839167453, "grad_norm": 0.21502859660084445, "learning_rate": 0.0008589308541869682, "loss": 0.6916, "step": 708 }, { "epoch": 0.2683065279091769, "grad_norm": 0.20463601549591218, "learning_rate": 0.0008585037437008589, "loss": 0.695, "step": 709 }, { "epoch": 0.2686849574266793, "grad_norm": 0.2019937725222575, "learning_rate": 0.0008580760941571966, "loss": 0.6893, "step": 710 }, { "epoch": 0.26906338694418164, "grad_norm": 0.18397747682893922, "learning_rate": 0.0008576479061990092, "loss": 0.6685, "step": 711 }, { "epoch": 0.269441816461684, "grad_norm": 0.19836485877312812, "learning_rate": 0.0008572191804701333, "loss": 0.6804, "step": 712 }, { "epoch": 0.26982024597918636, "grad_norm": 0.18037548214748178, "learning_rate": 0.0008567899176152144, "loss": 0.6835, "step": 713 }, { "epoch": 0.27019867549668874, "grad_norm": 0.19391242802051237, "learning_rate": 0.0008563601182797058, "loss": 0.672, "step": 714 }, { "epoch": 0.27057710501419113, "grad_norm": 1.0549807670611908, "learning_rate": 0.0008559297831098674, "loss": 0.7269, "step": 715 }, { "epoch": 0.27095553453169346, "grad_norm": 0.2559559181302699, "learning_rate": 0.0008554989127527648, "loss": 0.6987, "step": 716 }, { "epoch": 0.27133396404919585, "grad_norm": 0.21944699622097746, "learning_rate": 0.0008550675078562679, "loss": 0.6964, "step": 717 }, { "epoch": 0.2717123935666982, "grad_norm": 0.20141286211724033, "learning_rate": 0.0008546355690690513, "loss": 0.7007, "step": 718 }, { "epoch": 0.2720908230842006, "grad_norm": 0.20268655187200965, "learning_rate": 0.0008542030970405913, "loss": 0.6649, "step": 719 }, { "epoch": 0.27246925260170296, "grad_norm": 0.2652779162796573, "learning_rate": 0.0008537700924211666, "loss": 0.6916, "step": 720 }, { "epoch": 0.2728476821192053, "grad_norm": 0.25532443785568654, "learning_rate": 0.000853336555861857, "loss": 0.6748, "step": 721 }, { "epoch": 0.2732261116367077, "grad_norm": 0.2300351476765343, "learning_rate": 0.0008529024880145414, "loss": 0.7074, "step": 722 }, { "epoch": 0.27360454115421, "grad_norm": 0.23520057235181027, "learning_rate": 0.000852467889531898, "loss": 0.658, "step": 723 }, { "epoch": 0.2739829706717124, "grad_norm": 0.25960682195279977, "learning_rate": 0.0008520327610674027, "loss": 0.6844, "step": 724 }, { "epoch": 0.27436140018921473, "grad_norm": 0.258070787458878, "learning_rate": 0.0008515971032753288, "loss": 0.6787, "step": 725 }, { "epoch": 0.2747398297067171, "grad_norm": 0.1985935986425992, "learning_rate": 0.0008511609168107447, "loss": 0.6986, "step": 726 }, { "epoch": 0.2751182592242195, "grad_norm": 0.22243986155517645, "learning_rate": 0.0008507242023295143, "loss": 0.7091, "step": 727 }, { "epoch": 0.27549668874172184, "grad_norm": 0.26742933828474785, "learning_rate": 0.0008502869604882952, "loss": 0.6883, "step": 728 }, { "epoch": 0.27587511825922423, "grad_norm": 0.200165993008908, "learning_rate": 0.0008498491919445383, "loss": 0.6836, "step": 729 }, { "epoch": 0.27625354777672656, "grad_norm": 0.25499249433242277, "learning_rate": 0.000849410897356486, "loss": 0.6886, "step": 730 }, { "epoch": 0.27663197729422895, "grad_norm": 0.25200872768793275, "learning_rate": 0.0008489720773831717, "loss": 0.6982, "step": 731 }, { "epoch": 0.27701040681173134, "grad_norm": 0.20328999200740874, "learning_rate": 0.0008485327326844195, "loss": 0.7033, "step": 732 }, { "epoch": 0.27738883632923367, "grad_norm": 0.2287810324897248, "learning_rate": 0.0008480928639208414, "loss": 0.6932, "step": 733 }, { "epoch": 0.27776726584673606, "grad_norm": 0.20816805760720886, "learning_rate": 0.0008476524717538384, "loss": 0.7046, "step": 734 }, { "epoch": 0.2781456953642384, "grad_norm": 0.26390387606865856, "learning_rate": 0.0008472115568455978, "loss": 0.6928, "step": 735 }, { "epoch": 0.2785241248817408, "grad_norm": 0.19951288164110778, "learning_rate": 0.0008467701198590934, "loss": 0.6643, "step": 736 }, { "epoch": 0.27890255439924316, "grad_norm": 0.2488192600126296, "learning_rate": 0.0008463281614580836, "loss": 0.6713, "step": 737 }, { "epoch": 0.2792809839167455, "grad_norm": 0.22548926127317043, "learning_rate": 0.0008458856823071111, "loss": 0.6784, "step": 738 }, { "epoch": 0.2796594134342479, "grad_norm": 0.225096961163091, "learning_rate": 0.0008454426830715014, "loss": 0.6838, "step": 739 }, { "epoch": 0.2800378429517502, "grad_norm": 0.23772490543656852, "learning_rate": 0.0008449991644173624, "loss": 0.6989, "step": 740 }, { "epoch": 0.2804162724692526, "grad_norm": 0.24818958472474098, "learning_rate": 0.0008445551270115825, "loss": 0.6835, "step": 741 }, { "epoch": 0.280794701986755, "grad_norm": 0.2518022958122864, "learning_rate": 0.0008441105715218304, "loss": 0.6881, "step": 742 }, { "epoch": 0.2811731315042573, "grad_norm": 0.17657831467214155, "learning_rate": 0.0008436654986165541, "loss": 0.6925, "step": 743 }, { "epoch": 0.2815515610217597, "grad_norm": 0.17662285030748173, "learning_rate": 0.000843219908964979, "loss": 0.7204, "step": 744 }, { "epoch": 0.28192999053926204, "grad_norm": 0.44053775451170485, "learning_rate": 0.0008427738032371077, "loss": 0.6786, "step": 745 }, { "epoch": 0.28230842005676443, "grad_norm": 0.24120153184846355, "learning_rate": 0.0008423271821037191, "loss": 0.7022, "step": 746 }, { "epoch": 0.2826868495742668, "grad_norm": 0.20849526291264037, "learning_rate": 0.0008418800462363667, "loss": 0.6887, "step": 747 }, { "epoch": 0.28306527909176915, "grad_norm": 0.24947817552697493, "learning_rate": 0.000841432396307378, "loss": 0.6697, "step": 748 }, { "epoch": 0.28344370860927154, "grad_norm": 0.25463223472058927, "learning_rate": 0.0008409842329898538, "loss": 0.6995, "step": 749 }, { "epoch": 0.28382213812677387, "grad_norm": 0.22418379893357254, "learning_rate": 0.0008405355569576666, "loss": 0.6675, "step": 750 }, { "epoch": 0.28420056764427626, "grad_norm": 0.2104554142228094, "learning_rate": 0.0008400863688854596, "loss": 0.6797, "step": 751 }, { "epoch": 0.28457899716177865, "grad_norm": 0.23482601956702845, "learning_rate": 0.0008396366694486466, "loss": 0.6826, "step": 752 }, { "epoch": 0.284957426679281, "grad_norm": 0.2440840541148025, "learning_rate": 0.0008391864593234094, "loss": 0.6517, "step": 753 }, { "epoch": 0.28533585619678337, "grad_norm": 0.23732910501578008, "learning_rate": 0.0008387357391866986, "loss": 0.6913, "step": 754 }, { "epoch": 0.2857142857142857, "grad_norm": 0.25198924374993453, "learning_rate": 0.0008382845097162308, "loss": 0.6758, "step": 755 }, { "epoch": 0.2860927152317881, "grad_norm": 0.3004584747993546, "learning_rate": 0.0008378327715904894, "loss": 0.7171, "step": 756 }, { "epoch": 0.2864711447492904, "grad_norm": 0.2454990512859675, "learning_rate": 0.0008373805254887215, "loss": 0.6845, "step": 757 }, { "epoch": 0.2868495742667928, "grad_norm": 0.22892297807312767, "learning_rate": 0.0008369277720909391, "loss": 0.6802, "step": 758 }, { "epoch": 0.2872280037842952, "grad_norm": 0.2350344599152239, "learning_rate": 0.0008364745120779164, "loss": 0.7004, "step": 759 }, { "epoch": 0.2876064333017975, "grad_norm": 0.2139616805399487, "learning_rate": 0.0008360207461311895, "loss": 0.6889, "step": 760 }, { "epoch": 0.2879848628192999, "grad_norm": 0.2775970430635097, "learning_rate": 0.0008355664749330551, "loss": 0.6738, "step": 761 }, { "epoch": 0.28836329233680225, "grad_norm": 0.2226279391309467, "learning_rate": 0.0008351116991665697, "loss": 0.6943, "step": 762 }, { "epoch": 0.28874172185430463, "grad_norm": 0.2384375934800631, "learning_rate": 0.0008346564195155486, "loss": 0.6971, "step": 763 }, { "epoch": 0.289120151371807, "grad_norm": 0.22808734177988302, "learning_rate": 0.0008342006366645645, "loss": 0.6727, "step": 764 }, { "epoch": 0.28949858088930935, "grad_norm": 0.23124960983088802, "learning_rate": 0.0008337443512989472, "loss": 0.6625, "step": 765 }, { "epoch": 0.28987701040681174, "grad_norm": 0.28759799303122524, "learning_rate": 0.0008332875641047817, "loss": 0.6563, "step": 766 }, { "epoch": 0.2902554399243141, "grad_norm": 0.21114980645120066, "learning_rate": 0.0008328302757689074, "loss": 0.6825, "step": 767 }, { "epoch": 0.29063386944181646, "grad_norm": 0.22314870484036303, "learning_rate": 0.0008323724869789176, "loss": 0.6714, "step": 768 }, { "epoch": 0.29101229895931885, "grad_norm": 0.19197961022906532, "learning_rate": 0.0008319141984231582, "loss": 0.6546, "step": 769 }, { "epoch": 0.2913907284768212, "grad_norm": 0.19014186137042963, "learning_rate": 0.0008314554107907261, "loss": 0.6899, "step": 770 }, { "epoch": 0.29176915799432357, "grad_norm": 0.1956733993218678, "learning_rate": 0.0008309961247714691, "loss": 0.6565, "step": 771 }, { "epoch": 0.2921475875118259, "grad_norm": 0.21548466884351525, "learning_rate": 0.0008305363410559839, "loss": 0.6974, "step": 772 }, { "epoch": 0.2925260170293283, "grad_norm": 0.2110274264479149, "learning_rate": 0.0008300760603356159, "loss": 0.6574, "step": 773 }, { "epoch": 0.2929044465468307, "grad_norm": 0.18680886205026145, "learning_rate": 0.0008296152833024577, "loss": 0.6659, "step": 774 }, { "epoch": 0.293282876064333, "grad_norm": 0.18096640779234074, "learning_rate": 0.0008291540106493485, "loss": 0.6578, "step": 775 }, { "epoch": 0.2936613055818354, "grad_norm": 0.18497981707240949, "learning_rate": 0.0008286922430698722, "loss": 0.6962, "step": 776 }, { "epoch": 0.29403973509933773, "grad_norm": 0.2318052639550537, "learning_rate": 0.000828229981258357, "loss": 0.6912, "step": 777 }, { "epoch": 0.2944181646168401, "grad_norm": 0.2128390590966841, "learning_rate": 0.0008277672259098746, "loss": 0.6393, "step": 778 }, { "epoch": 0.2947965941343425, "grad_norm": 0.21347700958344742, "learning_rate": 0.0008273039777202383, "loss": 0.6851, "step": 779 }, { "epoch": 0.29517502365184484, "grad_norm": 0.20513421154551215, "learning_rate": 0.000826840237386003, "loss": 0.672, "step": 780 }, { "epoch": 0.2955534531693472, "grad_norm": 0.24452965830324494, "learning_rate": 0.0008263760056044632, "loss": 0.7292, "step": 781 }, { "epoch": 0.29593188268684956, "grad_norm": 0.2148940547657484, "learning_rate": 0.0008259112830736521, "loss": 0.6541, "step": 782 }, { "epoch": 0.29631031220435194, "grad_norm": 0.1908503654186657, "learning_rate": 0.0008254460704923419, "loss": 0.6662, "step": 783 }, { "epoch": 0.29668874172185433, "grad_norm": 0.22670599898295843, "learning_rate": 0.0008249803685600403, "loss": 0.6706, "step": 784 }, { "epoch": 0.29706717123935666, "grad_norm": 0.21094260386190958, "learning_rate": 0.0008245141779769919, "loss": 0.6858, "step": 785 }, { "epoch": 0.29744560075685905, "grad_norm": 0.2597054002975567, "learning_rate": 0.0008240474994441753, "loss": 0.696, "step": 786 }, { "epoch": 0.2978240302743614, "grad_norm": 0.246769046681706, "learning_rate": 0.0008235803336633032, "loss": 0.6528, "step": 787 }, { "epoch": 0.29820245979186377, "grad_norm": 0.2105029539820703, "learning_rate": 0.0008231126813368206, "loss": 0.6861, "step": 788 }, { "epoch": 0.2985808893093661, "grad_norm": 0.24209539138574326, "learning_rate": 0.0008226445431679046, "loss": 0.6772, "step": 789 }, { "epoch": 0.2989593188268685, "grad_norm": 0.24059965693789445, "learning_rate": 0.0008221759198604624, "loss": 0.6984, "step": 790 }, { "epoch": 0.2993377483443709, "grad_norm": 0.22508441613040567, "learning_rate": 0.0008217068121191307, "loss": 0.6571, "step": 791 }, { "epoch": 0.2997161778618732, "grad_norm": 0.2314451676380332, "learning_rate": 0.000821237220649275, "loss": 0.6618, "step": 792 }, { "epoch": 0.3000946073793756, "grad_norm": 0.20014850720595292, "learning_rate": 0.0008207671461569877, "loss": 0.6633, "step": 793 }, { "epoch": 0.30047303689687793, "grad_norm": 0.2730391871034465, "learning_rate": 0.0008202965893490876, "loss": 0.6738, "step": 794 }, { "epoch": 0.3008514664143803, "grad_norm": 0.26411822650833805, "learning_rate": 0.0008198255509331189, "loss": 0.6905, "step": 795 }, { "epoch": 0.3012298959318827, "grad_norm": 0.32407271301965146, "learning_rate": 0.0008193540316173498, "loss": 0.6789, "step": 796 }, { "epoch": 0.30160832544938504, "grad_norm": 0.20670665630623256, "learning_rate": 0.0008188820321107717, "loss": 0.6742, "step": 797 }, { "epoch": 0.3019867549668874, "grad_norm": 0.24952643961321547, "learning_rate": 0.0008184095531230978, "loss": 0.6762, "step": 798 }, { "epoch": 0.30236518448438976, "grad_norm": 0.2223326774649571, "learning_rate": 0.0008179365953647624, "loss": 0.674, "step": 799 }, { "epoch": 0.30274361400189215, "grad_norm": 0.23580727046220087, "learning_rate": 0.00081746315954692, "loss": 0.653, "step": 800 }, { "epoch": 0.30312204351939454, "grad_norm": 0.9507396115265361, "learning_rate": 0.0008169892463814433, "loss": 0.6791, "step": 801 }, { "epoch": 0.30350047303689687, "grad_norm": 0.21035180193675804, "learning_rate": 0.0008165148565809236, "loss": 0.6698, "step": 802 }, { "epoch": 0.30387890255439926, "grad_norm": 0.1934047992409655, "learning_rate": 0.0008160399908586679, "loss": 0.6808, "step": 803 }, { "epoch": 0.3042573320719016, "grad_norm": 0.2482040988325467, "learning_rate": 0.0008155646499286995, "loss": 0.6682, "step": 804 }, { "epoch": 0.304635761589404, "grad_norm": 0.24285936990686494, "learning_rate": 0.000815088834505756, "loss": 0.6709, "step": 805 }, { "epoch": 0.30501419110690636, "grad_norm": 0.2569397981040288, "learning_rate": 0.0008146125453052885, "loss": 0.6804, "step": 806 }, { "epoch": 0.3053926206244087, "grad_norm": 0.1952279957114141, "learning_rate": 0.0008141357830434606, "loss": 0.7037, "step": 807 }, { "epoch": 0.3057710501419111, "grad_norm": 0.2876387665515686, "learning_rate": 0.000813658548437147, "loss": 0.6804, "step": 808 }, { "epoch": 0.3061494796594134, "grad_norm": 0.29663036958630473, "learning_rate": 0.0008131808422039326, "loss": 0.675, "step": 809 }, { "epoch": 0.3065279091769158, "grad_norm": 0.2751252268556719, "learning_rate": 0.0008127026650621118, "loss": 0.6703, "step": 810 }, { "epoch": 0.3069063386944182, "grad_norm": 0.21016514628132915, "learning_rate": 0.0008122240177306868, "loss": 0.6766, "step": 811 }, { "epoch": 0.3072847682119205, "grad_norm": 0.29536345561984984, "learning_rate": 0.0008117449009293668, "loss": 0.6792, "step": 812 }, { "epoch": 0.3076631977294229, "grad_norm": 0.23425124397883898, "learning_rate": 0.0008112653153785669, "loss": 0.6794, "step": 813 }, { "epoch": 0.30804162724692524, "grad_norm": 0.26262669571188824, "learning_rate": 0.0008107852617994073, "loss": 0.6822, "step": 814 }, { "epoch": 0.30842005676442763, "grad_norm": 0.23590321607529977, "learning_rate": 0.0008103047409137114, "loss": 0.6651, "step": 815 }, { "epoch": 0.30879848628193, "grad_norm": 0.2093449361055458, "learning_rate": 0.0008098237534440058, "loss": 0.694, "step": 816 }, { "epoch": 0.30917691579943235, "grad_norm": 0.2683557085344811, "learning_rate": 0.0008093423001135185, "loss": 0.676, "step": 817 }, { "epoch": 0.30955534531693474, "grad_norm": 0.21244964073013414, "learning_rate": 0.0008088603816461778, "loss": 0.6641, "step": 818 }, { "epoch": 0.30993377483443707, "grad_norm": 0.22952947110810473, "learning_rate": 0.0008083779987666115, "loss": 0.669, "step": 819 }, { "epoch": 0.31031220435193946, "grad_norm": 0.18659550338148748, "learning_rate": 0.0008078951522001458, "loss": 0.6628, "step": 820 }, { "epoch": 0.3106906338694418, "grad_norm": 0.2079529398863514, "learning_rate": 0.0008074118426728043, "loss": 0.6693, "step": 821 }, { "epoch": 0.3110690633869442, "grad_norm": 0.2060000993638123, "learning_rate": 0.0008069280709113061, "loss": 0.6824, "step": 822 }, { "epoch": 0.31144749290444657, "grad_norm": 0.2234667198708806, "learning_rate": 0.0008064438376430656, "loss": 0.7002, "step": 823 }, { "epoch": 0.3118259224219489, "grad_norm": 0.19633109558384043, "learning_rate": 0.0008059591435961917, "loss": 0.6772, "step": 824 }, { "epoch": 0.3122043519394513, "grad_norm": 0.23608807912311064, "learning_rate": 0.0008054739894994854, "loss": 0.6554, "step": 825 }, { "epoch": 0.3125827814569536, "grad_norm": 1.854817486958587, "learning_rate": 0.0008049883760824397, "loss": 0.7007, "step": 826 }, { "epoch": 0.312961210974456, "grad_norm": 0.20446640697421192, "learning_rate": 0.0008045023040752384, "loss": 0.6637, "step": 827 }, { "epoch": 0.3133396404919584, "grad_norm": 0.24113755132955322, "learning_rate": 0.0008040157742087547, "loss": 0.7005, "step": 828 }, { "epoch": 0.3137180700094607, "grad_norm": 0.276449184587352, "learning_rate": 0.0008035287872145502, "loss": 0.6787, "step": 829 }, { "epoch": 0.3140964995269631, "grad_norm": 0.26398088220891813, "learning_rate": 0.000803041343824874, "loss": 0.6789, "step": 830 }, { "epoch": 0.31447492904446545, "grad_norm": 0.29008961246674864, "learning_rate": 0.0008025534447726612, "loss": 0.692, "step": 831 }, { "epoch": 0.31485335856196783, "grad_norm": 0.26012985966261554, "learning_rate": 0.0008020650907915325, "loss": 0.668, "step": 832 }, { "epoch": 0.3152317880794702, "grad_norm": 0.20433788216230137, "learning_rate": 0.0008015762826157922, "loss": 0.6619, "step": 833 }, { "epoch": 0.31561021759697255, "grad_norm": 0.22044807652116447, "learning_rate": 0.0008010870209804277, "loss": 0.68, "step": 834 }, { "epoch": 0.31598864711447494, "grad_norm": 3.352433524955507, "learning_rate": 0.0008005973066211084, "loss": 0.6897, "step": 835 }, { "epoch": 0.3163670766319773, "grad_norm": 0.30030430718082296, "learning_rate": 0.0008001071402741842, "loss": 0.6876, "step": 836 }, { "epoch": 0.31674550614947966, "grad_norm": 0.2142189825266794, "learning_rate": 0.0007996165226766845, "loss": 0.6756, "step": 837 }, { "epoch": 0.31712393566698205, "grad_norm": 0.23339781501718931, "learning_rate": 0.0007991254545663178, "loss": 0.6844, "step": 838 }, { "epoch": 0.3175023651844844, "grad_norm": 0.28677327731571756, "learning_rate": 0.0007986339366814693, "loss": 0.6474, "step": 839 }, { "epoch": 0.31788079470198677, "grad_norm": 0.27027219019752396, "learning_rate": 0.0007981419697612009, "loss": 0.6713, "step": 840 }, { "epoch": 0.3182592242194891, "grad_norm": 0.25258250323865766, "learning_rate": 0.0007976495545452498, "loss": 0.682, "step": 841 }, { "epoch": 0.3186376537369915, "grad_norm": 0.2510356610847418, "learning_rate": 0.0007971566917740268, "loss": 0.6955, "step": 842 }, { "epoch": 0.3190160832544939, "grad_norm": 0.2768519971518866, "learning_rate": 0.000796663382188616, "loss": 0.6603, "step": 843 }, { "epoch": 0.3193945127719962, "grad_norm": 0.27422567940241005, "learning_rate": 0.0007961696265307734, "loss": 0.6427, "step": 844 }, { "epoch": 0.3197729422894986, "grad_norm": 0.43362227337470144, "learning_rate": 0.0007956754255429255, "loss": 0.6405, "step": 845 }, { "epoch": 0.32015137180700093, "grad_norm": 0.23825792292243272, "learning_rate": 0.0007951807799681684, "loss": 0.6433, "step": 846 }, { "epoch": 0.3205298013245033, "grad_norm": 0.24881729120357388, "learning_rate": 0.0007946856905502671, "loss": 0.6684, "step": 847 }, { "epoch": 0.3209082308420057, "grad_norm": 0.2401833829028102, "learning_rate": 0.0007941901580336535, "loss": 0.6807, "step": 848 }, { "epoch": 0.32128666035950804, "grad_norm": 0.22323861841000334, "learning_rate": 0.0007936941831634259, "loss": 0.674, "step": 849 }, { "epoch": 0.3216650898770104, "grad_norm": 0.22499140716409996, "learning_rate": 0.0007931977666853478, "loss": 0.6982, "step": 850 }, { "epoch": 0.32204351939451276, "grad_norm": 0.243231109200235, "learning_rate": 0.0007927009093458469, "loss": 0.6644, "step": 851 }, { "epoch": 0.32242194891201514, "grad_norm": 0.19568624975730425, "learning_rate": 0.0007922036118920134, "loss": 0.6583, "step": 852 }, { "epoch": 0.3228003784295175, "grad_norm": 0.1968980707547269, "learning_rate": 0.0007917058750715996, "loss": 0.6747, "step": 853 }, { "epoch": 0.32317880794701986, "grad_norm": 0.279428877681056, "learning_rate": 0.000791207699633018, "loss": 0.6692, "step": 854 }, { "epoch": 0.32355723746452225, "grad_norm": 0.23371049139103625, "learning_rate": 0.0007907090863253412, "loss": 0.6579, "step": 855 }, { "epoch": 0.3239356669820246, "grad_norm": 0.30787789332741333, "learning_rate": 0.0007902100358982998, "loss": 0.6733, "step": 856 }, { "epoch": 0.32431409649952697, "grad_norm": 0.2104493603628439, "learning_rate": 0.0007897105491022818, "loss": 0.6829, "step": 857 }, { "epoch": 0.3246925260170293, "grad_norm": 0.20756884718186824, "learning_rate": 0.0007892106266883315, "loss": 0.6702, "step": 858 }, { "epoch": 0.3250709555345317, "grad_norm": 0.21473772727416462, "learning_rate": 0.0007887102694081478, "loss": 0.663, "step": 859 }, { "epoch": 0.3254493850520341, "grad_norm": 0.22531217006283544, "learning_rate": 0.0007882094780140838, "loss": 0.6628, "step": 860 }, { "epoch": 0.3258278145695364, "grad_norm": 0.28053136248871546, "learning_rate": 0.0007877082532591453, "loss": 0.6968, "step": 861 }, { "epoch": 0.3262062440870388, "grad_norm": 0.215608380547956, "learning_rate": 0.0007872065958969897, "loss": 0.6569, "step": 862 }, { "epoch": 0.32658467360454113, "grad_norm": 0.24437459796862693, "learning_rate": 0.0007867045066819245, "loss": 0.6822, "step": 863 }, { "epoch": 0.3269631031220435, "grad_norm": 0.19773736578444068, "learning_rate": 0.0007862019863689074, "loss": 0.6767, "step": 864 }, { "epoch": 0.3273415326395459, "grad_norm": 0.20913338028419112, "learning_rate": 0.0007856990357135436, "loss": 0.6836, "step": 865 }, { "epoch": 0.32771996215704824, "grad_norm": 0.20696275405688486, "learning_rate": 0.0007851956554720856, "loss": 0.674, "step": 866 }, { "epoch": 0.3280983916745506, "grad_norm": 0.2836928507100033, "learning_rate": 0.0007846918464014318, "loss": 0.6697, "step": 867 }, { "epoch": 0.32847682119205296, "grad_norm": 0.2291920586448095, "learning_rate": 0.0007841876092591253, "loss": 0.6791, "step": 868 }, { "epoch": 0.32885525070955535, "grad_norm": 0.1786073061119343, "learning_rate": 0.0007836829448033533, "loss": 0.6386, "step": 869 }, { "epoch": 0.32923368022705773, "grad_norm": 0.20361359228023493, "learning_rate": 0.0007831778537929451, "loss": 0.6804, "step": 870 }, { "epoch": 0.32961210974456007, "grad_norm": 0.2064687897786223, "learning_rate": 0.0007826723369873714, "loss": 0.6702, "step": 871 }, { "epoch": 0.32999053926206245, "grad_norm": 0.20907953359723638, "learning_rate": 0.0007821663951467434, "loss": 0.6683, "step": 872 }, { "epoch": 0.3303689687795648, "grad_norm": 0.21942591125322533, "learning_rate": 0.000781660029031811, "loss": 0.6549, "step": 873 }, { "epoch": 0.3307473982970672, "grad_norm": 0.18869982723630974, "learning_rate": 0.0007811532394039624, "loss": 0.6707, "step": 874 }, { "epoch": 0.33112582781456956, "grad_norm": 0.23991819059026642, "learning_rate": 0.0007806460270252227, "loss": 0.6576, "step": 875 }, { "epoch": 0.3315042573320719, "grad_norm": 3.348276799662872, "learning_rate": 0.0007801383926582521, "loss": 0.6947, "step": 876 }, { "epoch": 0.3318826868495743, "grad_norm": 0.3407261234630749, "learning_rate": 0.0007796303370663458, "loss": 0.7152, "step": 877 }, { "epoch": 0.3322611163670766, "grad_norm": 0.19127480216735912, "learning_rate": 0.0007791218610134323, "loss": 0.6921, "step": 878 }, { "epoch": 0.332639545884579, "grad_norm": 0.22550252718260042, "learning_rate": 0.0007786129652640724, "loss": 0.6604, "step": 879 }, { "epoch": 0.3330179754020814, "grad_norm": 0.22409378251030426, "learning_rate": 0.0007781036505834575, "loss": 0.6589, "step": 880 }, { "epoch": 0.3333964049195837, "grad_norm": 0.24893248898076362, "learning_rate": 0.0007775939177374093, "loss": 0.6973, "step": 881 }, { "epoch": 0.3337748344370861, "grad_norm": 0.24050568557847105, "learning_rate": 0.0007770837674923783, "loss": 0.6579, "step": 882 }, { "epoch": 0.33415326395458844, "grad_norm": 0.24877806898119142, "learning_rate": 0.0007765732006154427, "loss": 0.6584, "step": 883 }, { "epoch": 0.33453169347209083, "grad_norm": 0.22411037244895934, "learning_rate": 0.0007760622178743066, "loss": 0.6537, "step": 884 }, { "epoch": 0.33491012298959316, "grad_norm": 0.2156823696995395, "learning_rate": 0.0007755508200373001, "loss": 0.668, "step": 885 }, { "epoch": 0.33528855250709555, "grad_norm": 0.21370779917550994, "learning_rate": 0.0007750390078733769, "loss": 0.6401, "step": 886 }, { "epoch": 0.33566698202459794, "grad_norm": 0.21569000383845485, "learning_rate": 0.0007745267821521142, "loss": 0.6622, "step": 887 }, { "epoch": 0.33604541154210027, "grad_norm": 0.23032930972738713, "learning_rate": 0.0007740141436437105, "loss": 0.6687, "step": 888 }, { "epoch": 0.33642384105960266, "grad_norm": 0.22004024579112444, "learning_rate": 0.0007735010931189855, "loss": 0.6488, "step": 889 }, { "epoch": 0.336802270577105, "grad_norm": 0.2094686903500319, "learning_rate": 0.0007729876313493781, "loss": 0.6557, "step": 890 }, { "epoch": 0.3371807000946074, "grad_norm": 0.24023180084089937, "learning_rate": 0.0007724737591069455, "loss": 0.6689, "step": 891 }, { "epoch": 0.33755912961210977, "grad_norm": 0.2164005488804971, "learning_rate": 0.0007719594771643623, "loss": 0.6657, "step": 892 }, { "epoch": 0.3379375591296121, "grad_norm": 0.23357190983716655, "learning_rate": 0.0007714447862949192, "loss": 0.6566, "step": 893 }, { "epoch": 0.3383159886471145, "grad_norm": 0.19703398219368895, "learning_rate": 0.0007709296872725215, "loss": 0.6547, "step": 894 }, { "epoch": 0.3386944181646168, "grad_norm": 0.745494262156418, "learning_rate": 0.0007704141808716885, "loss": 0.658, "step": 895 }, { "epoch": 0.3390728476821192, "grad_norm": 0.277202005834671, "learning_rate": 0.0007698982678675517, "loss": 0.6548, "step": 896 }, { "epoch": 0.3394512771996216, "grad_norm": 0.33961141251785826, "learning_rate": 0.0007693819490358544, "loss": 0.6834, "step": 897 }, { "epoch": 0.3398297067171239, "grad_norm": 0.3165901026690292, "learning_rate": 0.0007688652251529498, "loss": 0.6721, "step": 898 }, { "epoch": 0.3402081362346263, "grad_norm": 0.2767192364111685, "learning_rate": 0.0007683480969958004, "loss": 0.6957, "step": 899 }, { "epoch": 0.34058656575212864, "grad_norm": 0.224912231930253, "learning_rate": 0.000767830565341976, "loss": 0.666, "step": 900 }, { "epoch": 0.34096499526963103, "grad_norm": 0.21480472383639315, "learning_rate": 0.0007673126309696539, "loss": 0.6513, "step": 901 }, { "epoch": 0.3413434247871334, "grad_norm": 0.3192789802735685, "learning_rate": 0.0007667942946576168, "loss": 0.6478, "step": 902 }, { "epoch": 0.34172185430463575, "grad_norm": 0.23692649563386428, "learning_rate": 0.0007662755571852509, "loss": 0.6769, "step": 903 }, { "epoch": 0.34210028382213814, "grad_norm": 0.24689561148202652, "learning_rate": 0.0007657564193325468, "loss": 0.6691, "step": 904 }, { "epoch": 0.3424787133396405, "grad_norm": 0.2034357518239654, "learning_rate": 0.0007652368818800964, "loss": 0.6851, "step": 905 }, { "epoch": 0.34285714285714286, "grad_norm": 0.2216204005208952, "learning_rate": 0.0007647169456090926, "loss": 0.6792, "step": 906 }, { "epoch": 0.34323557237464525, "grad_norm": 0.29779697605913735, "learning_rate": 0.0007641966113013281, "loss": 0.6765, "step": 907 }, { "epoch": 0.3436140018921476, "grad_norm": 0.2365658644055243, "learning_rate": 0.0007636758797391938, "loss": 0.6533, "step": 908 }, { "epoch": 0.34399243140964997, "grad_norm": 0.23377190720565608, "learning_rate": 0.0007631547517056783, "loss": 0.6587, "step": 909 }, { "epoch": 0.3443708609271523, "grad_norm": 0.21927154050850417, "learning_rate": 0.0007626332279843661, "loss": 0.6439, "step": 910 }, { "epoch": 0.3447492904446547, "grad_norm": 0.2414691793646043, "learning_rate": 0.0007621113093594367, "loss": 0.6628, "step": 911 }, { "epoch": 0.3451277199621571, "grad_norm": 0.21877827631892796, "learning_rate": 0.0007615889966156635, "loss": 0.6713, "step": 912 }, { "epoch": 0.3455061494796594, "grad_norm": 0.21466370754751962, "learning_rate": 0.0007610662905384125, "loss": 0.6761, "step": 913 }, { "epoch": 0.3458845789971618, "grad_norm": 0.2498526958774928, "learning_rate": 0.0007605431919136409, "loss": 0.6942, "step": 914 }, { "epoch": 0.34626300851466413, "grad_norm": 0.2822388643005827, "learning_rate": 0.0007600197015278964, "loss": 0.6601, "step": 915 }, { "epoch": 0.3466414380321665, "grad_norm": 0.213293420702876, "learning_rate": 0.0007594958201683158, "loss": 0.6571, "step": 916 }, { "epoch": 0.34701986754966885, "grad_norm": 0.224344063674276, "learning_rate": 0.0007589715486226235, "loss": 0.6707, "step": 917 }, { "epoch": 0.34739829706717124, "grad_norm": 0.2532203947849705, "learning_rate": 0.0007584468876791306, "loss": 0.6679, "step": 918 }, { "epoch": 0.3477767265846736, "grad_norm": 0.25412316959827097, "learning_rate": 0.0007579218381267343, "loss": 0.6443, "step": 919 }, { "epoch": 0.34815515610217596, "grad_norm": 0.20977972327793007, "learning_rate": 0.0007573964007549155, "loss": 0.6605, "step": 920 }, { "epoch": 0.34853358561967834, "grad_norm": 0.19993503677792157, "learning_rate": 0.0007568705763537383, "loss": 0.6732, "step": 921 }, { "epoch": 0.3489120151371807, "grad_norm": 0.20587897379340014, "learning_rate": 0.0007563443657138489, "loss": 0.6518, "step": 922 }, { "epoch": 0.34929044465468306, "grad_norm": 0.22996349231563726, "learning_rate": 0.0007558177696264743, "loss": 0.6478, "step": 923 }, { "epoch": 0.34966887417218545, "grad_norm": 0.19980581587158663, "learning_rate": 0.0007552907888834211, "loss": 0.6481, "step": 924 }, { "epoch": 0.3500473036896878, "grad_norm": 0.21332686463963502, "learning_rate": 0.000754763424277074, "loss": 0.6482, "step": 925 }, { "epoch": 0.35042573320719017, "grad_norm": 0.20405587227940122, "learning_rate": 0.0007542356766003953, "loss": 0.6702, "step": 926 }, { "epoch": 0.3508041627246925, "grad_norm": 0.2402620377159603, "learning_rate": 0.0007537075466469228, "loss": 0.6835, "step": 927 }, { "epoch": 0.3511825922421949, "grad_norm": 0.21565069031745313, "learning_rate": 0.0007531790352107696, "loss": 0.6732, "step": 928 }, { "epoch": 0.3515610217596973, "grad_norm": 2.0026258873391987, "learning_rate": 0.0007526501430866222, "loss": 0.6647, "step": 929 }, { "epoch": 0.3519394512771996, "grad_norm": 0.2041779516630533, "learning_rate": 0.0007521208710697393, "loss": 0.6581, "step": 930 }, { "epoch": 0.352317880794702, "grad_norm": 0.26684640114737446, "learning_rate": 0.0007515912199559514, "loss": 0.6326, "step": 931 }, { "epoch": 0.35269631031220433, "grad_norm": 0.21494984174759393, "learning_rate": 0.0007510611905416582, "loss": 0.6633, "step": 932 }, { "epoch": 0.3530747398297067, "grad_norm": 0.23082538264114905, "learning_rate": 0.0007505307836238289, "loss": 0.6656, "step": 933 }, { "epoch": 0.3534531693472091, "grad_norm": 0.24244692966466516, "learning_rate": 0.00075, "loss": 0.6685, "step": 934 }, { "epoch": 0.35383159886471144, "grad_norm": 0.23990862556077408, "learning_rate": 0.0007494688404682747, "loss": 0.6752, "step": 935 }, { "epoch": 0.3542100283822138, "grad_norm": 0.22553962793655605, "learning_rate": 0.0007489373058273211, "loss": 0.6347, "step": 936 }, { "epoch": 0.35458845789971616, "grad_norm": 0.21431521513501964, "learning_rate": 0.0007484053968763714, "loss": 0.6364, "step": 937 }, { "epoch": 0.35496688741721855, "grad_norm": 0.2110927681437712, "learning_rate": 0.000747873114415221, "loss": 0.6316, "step": 938 }, { "epoch": 0.35534531693472093, "grad_norm": 0.27018272062826765, "learning_rate": 0.0007473404592442263, "loss": 0.6671, "step": 939 }, { "epoch": 0.35572374645222327, "grad_norm": 0.3098993568326954, "learning_rate": 0.0007468074321643047, "loss": 0.68, "step": 940 }, { "epoch": 0.35610217596972565, "grad_norm": 0.22718732276342896, "learning_rate": 0.0007462740339769323, "loss": 0.672, "step": 941 }, { "epoch": 0.356480605487228, "grad_norm": 0.23902515855254672, "learning_rate": 0.0007457402654841436, "loss": 0.6749, "step": 942 }, { "epoch": 0.3568590350047304, "grad_norm": 0.227057040548698, "learning_rate": 0.0007452061274885298, "loss": 0.6321, "step": 943 }, { "epoch": 0.35723746452223276, "grad_norm": 0.2823312426669447, "learning_rate": 0.0007446716207932375, "loss": 0.6365, "step": 944 }, { "epoch": 0.3576158940397351, "grad_norm": 0.22513780215186277, "learning_rate": 0.000744136746201968, "loss": 0.6607, "step": 945 }, { "epoch": 0.3579943235572375, "grad_norm": 0.2151116867844519, "learning_rate": 0.0007436015045189756, "loss": 0.6618, "step": 946 }, { "epoch": 0.3583727530747398, "grad_norm": 0.22144109851476826, "learning_rate": 0.0007430658965490667, "loss": 0.6442, "step": 947 }, { "epoch": 0.3587511825922422, "grad_norm": 0.2492607631368218, "learning_rate": 0.0007425299230975982, "loss": 0.6673, "step": 948 }, { "epoch": 0.35912961210974453, "grad_norm": 0.21468719500446448, "learning_rate": 0.0007419935849704766, "loss": 0.6604, "step": 949 }, { "epoch": 0.3595080416272469, "grad_norm": 0.23207169286281745, "learning_rate": 0.0007414568829741572, "loss": 0.6649, "step": 950 }, { "epoch": 0.3598864711447493, "grad_norm": 0.18820575173881807, "learning_rate": 0.0007409198179156419, "loss": 0.633, "step": 951 }, { "epoch": 0.36026490066225164, "grad_norm": 0.23359926115188204, "learning_rate": 0.0007403823906024786, "loss": 0.67, "step": 952 }, { "epoch": 0.36064333017975403, "grad_norm": 0.25602888359914083, "learning_rate": 0.00073984460184276, "loss": 0.6563, "step": 953 }, { "epoch": 0.36102175969725636, "grad_norm": 0.23275210858177633, "learning_rate": 0.0007393064524451224, "loss": 0.6735, "step": 954 }, { "epoch": 0.36140018921475875, "grad_norm": 0.7436496186034277, "learning_rate": 0.0007387679432187442, "loss": 0.65, "step": 955 }, { "epoch": 0.36177861873226114, "grad_norm": 0.19237355631137254, "learning_rate": 0.0007382290749733447, "loss": 0.6605, "step": 956 }, { "epoch": 0.36215704824976347, "grad_norm": 0.2203822377261059, "learning_rate": 0.0007376898485191834, "loss": 0.6499, "step": 957 }, { "epoch": 0.36253547776726586, "grad_norm": 0.2507108773815716, "learning_rate": 0.0007371502646670582, "loss": 0.6997, "step": 958 }, { "epoch": 0.3629139072847682, "grad_norm": 0.2279098992473354, "learning_rate": 0.0007366103242283044, "loss": 0.6374, "step": 959 }, { "epoch": 0.3632923368022706, "grad_norm": 0.21700841017271114, "learning_rate": 0.0007360700280147936, "loss": 0.6509, "step": 960 }, { "epoch": 0.36367076631977296, "grad_norm": 0.25582003108947066, "learning_rate": 0.0007355293768389321, "loss": 0.6382, "step": 961 }, { "epoch": 0.3640491958372753, "grad_norm": 0.22361846944464628, "learning_rate": 0.00073498837151366, "loss": 0.6506, "step": 962 }, { "epoch": 0.3644276253547777, "grad_norm": 0.208216871054741, "learning_rate": 0.0007344470128524503, "loss": 0.6609, "step": 963 }, { "epoch": 0.36480605487228, "grad_norm": 0.22398322358914874, "learning_rate": 0.0007339053016693068, "loss": 0.6429, "step": 964 }, { "epoch": 0.3651844843897824, "grad_norm": 0.21480601586241385, "learning_rate": 0.0007333632387787635, "loss": 0.656, "step": 965 }, { "epoch": 0.3655629139072848, "grad_norm": 0.2167953456790355, "learning_rate": 0.0007328208249958834, "loss": 0.6443, "step": 966 }, { "epoch": 0.3659413434247871, "grad_norm": 0.22114921318977657, "learning_rate": 0.0007322780611362568, "loss": 0.6625, "step": 967 }, { "epoch": 0.3663197729422895, "grad_norm": 0.2281922065032165, "learning_rate": 0.0007317349480160008, "loss": 0.6698, "step": 968 }, { "epoch": 0.36669820245979184, "grad_norm": 0.2390496574642358, "learning_rate": 0.0007311914864517575, "loss": 0.6543, "step": 969 }, { "epoch": 0.36707663197729423, "grad_norm": 0.26505546659615453, "learning_rate": 0.0007306476772606926, "loss": 0.6683, "step": 970 }, { "epoch": 0.3674550614947966, "grad_norm": 0.222061215418185, "learning_rate": 0.000730103521260495, "loss": 0.6713, "step": 971 }, { "epoch": 0.36783349101229895, "grad_norm": 0.25450960511029647, "learning_rate": 0.0007295590192693747, "loss": 0.6584, "step": 972 }, { "epoch": 0.36821192052980134, "grad_norm": 0.2446011205795513, "learning_rate": 0.0007290141721060622, "loss": 0.6614, "step": 973 }, { "epoch": 0.36859035004730367, "grad_norm": 0.19684164355032555, "learning_rate": 0.0007284689805898069, "loss": 0.6638, "step": 974 }, { "epoch": 0.36896877956480606, "grad_norm": 0.20731921232614198, "learning_rate": 0.0007279234455403759, "loss": 0.6555, "step": 975 }, { "epoch": 0.36934720908230845, "grad_norm": 0.3143130991723886, "learning_rate": 0.000727377567778053, "loss": 0.6818, "step": 976 }, { "epoch": 0.3697256385998108, "grad_norm": 0.2826264923609695, "learning_rate": 0.0007268313481236372, "loss": 0.6798, "step": 977 }, { "epoch": 0.37010406811731317, "grad_norm": 0.2313301266574286, "learning_rate": 0.0007262847873984416, "loss": 0.664, "step": 978 }, { "epoch": 0.3704824976348155, "grad_norm": 0.3013245765688757, "learning_rate": 0.0007257378864242923, "loss": 0.6555, "step": 979 }, { "epoch": 0.3708609271523179, "grad_norm": 0.22817668068113126, "learning_rate": 0.0007251906460235267, "loss": 0.6833, "step": 980 }, { "epoch": 0.3712393566698202, "grad_norm": 0.23001491993216305, "learning_rate": 0.0007246430670189931, "loss": 0.7035, "step": 981 }, { "epoch": 0.3716177861873226, "grad_norm": 0.2167116977284043, "learning_rate": 0.0007240951502340482, "loss": 0.6261, "step": 982 }, { "epoch": 0.371996215704825, "grad_norm": 0.21393467434937524, "learning_rate": 0.000723546896492557, "loss": 0.6702, "step": 983 }, { "epoch": 0.3723746452223273, "grad_norm": 0.285917858845838, "learning_rate": 0.0007229983066188914, "loss": 0.6746, "step": 984 }, { "epoch": 0.3727530747398297, "grad_norm": 0.23941841185022553, "learning_rate": 0.0007224493814379282, "loss": 0.6703, "step": 985 }, { "epoch": 0.37313150425733205, "grad_norm": 0.20689250687572378, "learning_rate": 0.000721900121775049, "loss": 0.6636, "step": 986 }, { "epoch": 0.37350993377483444, "grad_norm": 0.24460164601613538, "learning_rate": 0.0007213505284561375, "loss": 0.6484, "step": 987 }, { "epoch": 0.3738883632923368, "grad_norm": 0.20569919818588378, "learning_rate": 0.00072080060230758, "loss": 0.6546, "step": 988 }, { "epoch": 0.37426679280983915, "grad_norm": 0.2042906583347083, "learning_rate": 0.0007202503441562625, "loss": 0.6875, "step": 989 }, { "epoch": 0.37464522232734154, "grad_norm": 0.752424889741565, "learning_rate": 0.0007196997548295708, "loss": 0.6593, "step": 990 }, { "epoch": 0.3750236518448439, "grad_norm": 0.2048651452170197, "learning_rate": 0.0007191488351553882, "loss": 0.6565, "step": 991 }, { "epoch": 0.37540208136234626, "grad_norm": 0.25291625469578516, "learning_rate": 0.0007185975859620952, "loss": 0.6497, "step": 992 }, { "epoch": 0.37578051087984865, "grad_norm": 0.2696061514012576, "learning_rate": 0.0007180460080785671, "loss": 0.6581, "step": 993 }, { "epoch": 0.376158940397351, "grad_norm": 0.21377493553945806, "learning_rate": 0.0007174941023341741, "loss": 0.6603, "step": 994 }, { "epoch": 0.37653736991485337, "grad_norm": 0.20624711330879034, "learning_rate": 0.0007169418695587791, "loss": 0.6632, "step": 995 }, { "epoch": 0.3769157994323557, "grad_norm": 0.22044512767481847, "learning_rate": 0.0007163893105827366, "loss": 0.645, "step": 996 }, { "epoch": 0.3772942289498581, "grad_norm": 0.2021579648044074, "learning_rate": 0.0007158364262368919, "loss": 0.62, "step": 997 }, { "epoch": 0.3776726584673605, "grad_norm": 0.2724525969948071, "learning_rate": 0.0007152832173525793, "loss": 0.6532, "step": 998 }, { "epoch": 0.3780510879848628, "grad_norm": 0.2097351279368481, "learning_rate": 0.000714729684761621, "loss": 0.6771, "step": 999 }, { "epoch": 0.3784295175023652, "grad_norm": 0.194206789128711, "learning_rate": 0.0007141758292963263, "loss": 0.6642, "step": 1000 }, { "epoch": 0.37880794701986753, "grad_norm": 0.23121270832737695, "learning_rate": 0.0007136216517894898, "loss": 0.6667, "step": 1001 }, { "epoch": 0.3791863765373699, "grad_norm": 0.22938577335637647, "learning_rate": 0.0007130671530743901, "loss": 0.6685, "step": 1002 }, { "epoch": 0.3795648060548723, "grad_norm": 0.2131921924599144, "learning_rate": 0.0007125123339847893, "loss": 0.6577, "step": 1003 }, { "epoch": 0.37994323557237464, "grad_norm": 0.22541287211563701, "learning_rate": 0.0007119571953549304, "loss": 0.6509, "step": 1004 }, { "epoch": 0.380321665089877, "grad_norm": 0.1935396991242371, "learning_rate": 0.0007114017380195378, "loss": 0.6543, "step": 1005 }, { "epoch": 0.38070009460737936, "grad_norm": 0.23606428283096956, "learning_rate": 0.0007108459628138144, "loss": 0.6634, "step": 1006 }, { "epoch": 0.38107852412488175, "grad_norm": 0.27336446604167575, "learning_rate": 0.0007102898705734417, "loss": 0.6745, "step": 1007 }, { "epoch": 0.38145695364238413, "grad_norm": 0.20849242195987872, "learning_rate": 0.0007097334621345774, "loss": 0.6737, "step": 1008 }, { "epoch": 0.38183538315988647, "grad_norm": 0.20898824313853198, "learning_rate": 0.0007091767383338546, "loss": 0.6694, "step": 1009 }, { "epoch": 0.38221381267738885, "grad_norm": 0.20434826447079138, "learning_rate": 0.0007086197000083812, "loss": 0.6648, "step": 1010 }, { "epoch": 0.3825922421948912, "grad_norm": 0.2657511776075352, "learning_rate": 0.0007080623479957372, "loss": 0.6606, "step": 1011 }, { "epoch": 0.3829706717123936, "grad_norm": 0.2859578557924595, "learning_rate": 0.0007075046831339751, "loss": 0.6812, "step": 1012 }, { "epoch": 0.3833491012298959, "grad_norm": 0.22392131070624977, "learning_rate": 0.000706946706261617, "loss": 0.6405, "step": 1013 }, { "epoch": 0.3837275307473983, "grad_norm": 0.20328731653894486, "learning_rate": 0.0007063884182176548, "loss": 0.63, "step": 1014 }, { "epoch": 0.3841059602649007, "grad_norm": 0.22384528515395963, "learning_rate": 0.0007058298198415478, "loss": 0.6667, "step": 1015 }, { "epoch": 0.384484389782403, "grad_norm": 0.24034991238512904, "learning_rate": 0.0007052709119732226, "loss": 0.6341, "step": 1016 }, { "epoch": 0.3848628192999054, "grad_norm": 0.2605608477413124, "learning_rate": 0.0007047116954530704, "loss": 0.6463, "step": 1017 }, { "epoch": 0.38524124881740773, "grad_norm": 0.309579722745813, "learning_rate": 0.0007041521711219468, "loss": 0.6591, "step": 1018 }, { "epoch": 0.3856196783349101, "grad_norm": 0.21683691703177935, "learning_rate": 0.0007035923398211702, "loss": 0.6567, "step": 1019 }, { "epoch": 0.3859981078524125, "grad_norm": 0.21823251345366199, "learning_rate": 0.000703032202392521, "loss": 0.6548, "step": 1020 }, { "epoch": 0.38637653736991484, "grad_norm": 0.19508453416665233, "learning_rate": 0.000702471759678239, "loss": 0.6838, "step": 1021 }, { "epoch": 0.38675496688741723, "grad_norm": 0.23946287748256562, "learning_rate": 0.0007019110125210243, "loss": 0.656, "step": 1022 }, { "epoch": 0.38713339640491956, "grad_norm": 0.19843810599527417, "learning_rate": 0.0007013499617640333, "loss": 0.6481, "step": 1023 }, { "epoch": 0.38751182592242195, "grad_norm": 0.23173096488916373, "learning_rate": 0.00070078860825088, "loss": 0.6179, "step": 1024 }, { "epoch": 0.38789025543992434, "grad_norm": 0.20417428432253693, "learning_rate": 0.0007002269528256334, "loss": 0.6619, "step": 1025 }, { "epoch": 0.38826868495742667, "grad_norm": 0.2189796468474038, "learning_rate": 0.0006996649963328159, "loss": 0.6622, "step": 1026 }, { "epoch": 0.38864711447492906, "grad_norm": 0.20830959045697053, "learning_rate": 0.0006991027396174032, "loss": 0.6627, "step": 1027 }, { "epoch": 0.3890255439924314, "grad_norm": 0.23714451034629502, "learning_rate": 0.0006985401835248226, "loss": 0.6631, "step": 1028 }, { "epoch": 0.3894039735099338, "grad_norm": 0.2058043020801087, "learning_rate": 0.0006979773289009508, "loss": 0.6619, "step": 1029 }, { "epoch": 0.38978240302743616, "grad_norm": 0.1997447456246483, "learning_rate": 0.0006974141765921139, "loss": 0.6601, "step": 1030 }, { "epoch": 0.3901608325449385, "grad_norm": 0.19520730046501478, "learning_rate": 0.0006968507274450857, "loss": 0.6518, "step": 1031 }, { "epoch": 0.3905392620624409, "grad_norm": 0.2036497783197479, "learning_rate": 0.000696286982307086, "loss": 0.637, "step": 1032 }, { "epoch": 0.3909176915799432, "grad_norm": 0.238451693501121, "learning_rate": 0.0006957229420257796, "loss": 0.6711, "step": 1033 }, { "epoch": 0.3912961210974456, "grad_norm": 0.22401083101755925, "learning_rate": 0.0006951586074492755, "loss": 0.6496, "step": 1034 }, { "epoch": 0.391674550614948, "grad_norm": 0.2219270175079365, "learning_rate": 0.000694593979426125, "loss": 0.6696, "step": 1035 }, { "epoch": 0.3920529801324503, "grad_norm": 0.22860484541524526, "learning_rate": 0.0006940290588053205, "loss": 0.6808, "step": 1036 }, { "epoch": 0.3924314096499527, "grad_norm": 0.21581803402625754, "learning_rate": 0.0006934638464362946, "loss": 0.6554, "step": 1037 }, { "epoch": 0.39280983916745504, "grad_norm": 0.22266883145306032, "learning_rate": 0.0006928983431689185, "loss": 0.6402, "step": 1038 }, { "epoch": 0.39318826868495743, "grad_norm": 0.20408549998472728, "learning_rate": 0.0006923325498535006, "loss": 0.6493, "step": 1039 }, { "epoch": 0.3935666982024598, "grad_norm": 0.19422386043021306, "learning_rate": 0.0006917664673407858, "loss": 0.626, "step": 1040 }, { "epoch": 0.39394512771996215, "grad_norm": 0.19769938152884114, "learning_rate": 0.0006912000964819536, "loss": 0.6515, "step": 1041 }, { "epoch": 0.39432355723746454, "grad_norm": 0.19879200987711937, "learning_rate": 0.000690633438128617, "loss": 0.6652, "step": 1042 }, { "epoch": 0.39470198675496687, "grad_norm": 0.1946117801280272, "learning_rate": 0.0006900664931328214, "loss": 0.6465, "step": 1043 }, { "epoch": 0.39508041627246926, "grad_norm": 0.20989916580306128, "learning_rate": 0.0006894992623470433, "loss": 0.6338, "step": 1044 }, { "epoch": 0.3954588457899716, "grad_norm": 0.21244460814464497, "learning_rate": 0.000688931746624189, "loss": 0.6605, "step": 1045 }, { "epoch": 0.395837275307474, "grad_norm": 0.1832861149102408, "learning_rate": 0.0006883639468175926, "loss": 0.6635, "step": 1046 }, { "epoch": 0.39621570482497637, "grad_norm": 0.2125983638164642, "learning_rate": 0.0006877958637810162, "loss": 0.6436, "step": 1047 }, { "epoch": 0.3965941343424787, "grad_norm": 0.23421707395751715, "learning_rate": 0.0006872274983686472, "loss": 0.6554, "step": 1048 }, { "epoch": 0.3969725638599811, "grad_norm": 0.2486700280348931, "learning_rate": 0.000686658851435098, "loss": 0.6726, "step": 1049 }, { "epoch": 0.3973509933774834, "grad_norm": 0.21699061629226948, "learning_rate": 0.0006860899238354039, "loss": 0.6446, "step": 1050 }, { "epoch": 0.3977294228949858, "grad_norm": 0.2500847434419687, "learning_rate": 0.0006855207164250226, "loss": 0.6481, "step": 1051 }, { "epoch": 0.3981078524124882, "grad_norm": 0.2349319516232334, "learning_rate": 0.000684951230059832, "loss": 0.6292, "step": 1052 }, { "epoch": 0.3984862819299905, "grad_norm": 0.2101065906718322, "learning_rate": 0.0006843814655961301, "loss": 0.6661, "step": 1053 }, { "epoch": 0.3988647114474929, "grad_norm": 0.23790116560811977, "learning_rate": 0.0006838114238906327, "loss": 0.6475, "step": 1054 }, { "epoch": 0.39924314096499525, "grad_norm": 0.21652402117611116, "learning_rate": 0.0006832411058004725, "loss": 0.615, "step": 1055 }, { "epoch": 0.39962157048249763, "grad_norm": 0.24390092144997735, "learning_rate": 0.0006826705121831977, "loss": 0.6588, "step": 1056 }, { "epoch": 0.4, "grad_norm": 0.23611899651811122, "learning_rate": 0.0006820996438967708, "loss": 0.6562, "step": 1057 }, { "epoch": 0.40037842951750235, "grad_norm": 0.19072350552121262, "learning_rate": 0.0006815285017995676, "loss": 0.6292, "step": 1058 }, { "epoch": 0.40075685903500474, "grad_norm": 0.21733367892475042, "learning_rate": 0.0006809570867503754, "loss": 0.6423, "step": 1059 }, { "epoch": 0.4011352885525071, "grad_norm": 0.21746943513916073, "learning_rate": 0.0006803853996083918, "loss": 0.6443, "step": 1060 }, { "epoch": 0.40151371807000946, "grad_norm": 0.20148003687779292, "learning_rate": 0.0006798134412332235, "loss": 0.6333, "step": 1061 }, { "epoch": 0.40189214758751185, "grad_norm": 0.24301401740821335, "learning_rate": 0.0006792412124848856, "loss": 0.6573, "step": 1062 }, { "epoch": 0.4022705771050142, "grad_norm": 0.23062576990335037, "learning_rate": 0.000678668714223799, "loss": 0.6728, "step": 1063 }, { "epoch": 0.40264900662251657, "grad_norm": 0.20368632371790482, "learning_rate": 0.0006780959473107902, "loss": 0.6506, "step": 1064 }, { "epoch": 0.4030274361400189, "grad_norm": 0.21091889369990338, "learning_rate": 0.0006775229126070899, "loss": 0.6577, "step": 1065 }, { "epoch": 0.4034058656575213, "grad_norm": 0.24258675275807332, "learning_rate": 0.0006769496109743308, "loss": 0.6539, "step": 1066 }, { "epoch": 0.4037842951750237, "grad_norm": 0.22435484022569027, "learning_rate": 0.0006763760432745475, "loss": 0.64, "step": 1067 }, { "epoch": 0.404162724692526, "grad_norm": 0.20931679144259072, "learning_rate": 0.0006758022103701744, "loss": 0.6332, "step": 1068 }, { "epoch": 0.4045411542100284, "grad_norm": 0.19994563839141005, "learning_rate": 0.0006752281131240451, "loss": 0.666, "step": 1069 }, { "epoch": 0.40491958372753073, "grad_norm": 0.30870068785662147, "learning_rate": 0.00067465375239939, "loss": 0.6379, "step": 1070 }, { "epoch": 0.4052980132450331, "grad_norm": 0.27415858964607653, "learning_rate": 0.0006740791290598361, "loss": 0.6336, "step": 1071 }, { "epoch": 0.4056764427625355, "grad_norm": 0.2616878035739243, "learning_rate": 0.0006735042439694054, "loss": 0.6557, "step": 1072 }, { "epoch": 0.40605487228003784, "grad_norm": 0.1947611082726169, "learning_rate": 0.0006729290979925131, "loss": 0.6541, "step": 1073 }, { "epoch": 0.4064333017975402, "grad_norm": 0.21285182704338537, "learning_rate": 0.0006723536919939669, "loss": 0.6446, "step": 1074 }, { "epoch": 0.40681173131504256, "grad_norm": 1.1178562323611376, "learning_rate": 0.0006717780268389654, "loss": 0.6613, "step": 1075 }, { "epoch": 0.40719016083254495, "grad_norm": 0.26431068461009627, "learning_rate": 0.0006712021033930972, "loss": 0.6433, "step": 1076 }, { "epoch": 0.4075685903500473, "grad_norm": 0.20654934451676307, "learning_rate": 0.0006706259225223386, "loss": 0.6394, "step": 1077 }, { "epoch": 0.40794701986754967, "grad_norm": 0.20474957987983278, "learning_rate": 0.0006700494850930534, "loss": 0.6702, "step": 1078 }, { "epoch": 0.40832544938505205, "grad_norm": 0.20407737698150297, "learning_rate": 0.0006694727919719913, "loss": 0.6529, "step": 1079 }, { "epoch": 0.4087038789025544, "grad_norm": 0.2688866384382385, "learning_rate": 0.0006688958440262861, "loss": 0.6567, "step": 1080 }, { "epoch": 0.4090823084200568, "grad_norm": 0.22114176751087392, "learning_rate": 0.0006683186421234552, "loss": 0.6713, "step": 1081 }, { "epoch": 0.4094607379375591, "grad_norm": 0.2764708523382016, "learning_rate": 0.0006677411871313974, "loss": 0.6176, "step": 1082 }, { "epoch": 0.4098391674550615, "grad_norm": 0.22656180940720114, "learning_rate": 0.0006671634799183924, "loss": 0.6193, "step": 1083 }, { "epoch": 0.4102175969725639, "grad_norm": 45.010250026209306, "learning_rate": 0.000666585521353099, "loss": 0.7048, "step": 1084 }, { "epoch": 0.4105960264900662, "grad_norm": 2.2838765530737812, "learning_rate": 0.0006660073123045537, "loss": 0.658, "step": 1085 }, { "epoch": 0.4109744560075686, "grad_norm": 0.3376594149305728, "learning_rate": 0.0006654288536421704, "loss": 0.6669, "step": 1086 }, { "epoch": 0.41135288552507093, "grad_norm": 0.2759683244667578, "learning_rate": 0.0006648501462357373, "loss": 0.6649, "step": 1087 }, { "epoch": 0.4117313150425733, "grad_norm": 0.2059650678228818, "learning_rate": 0.0006642711909554174, "loss": 0.6479, "step": 1088 }, { "epoch": 0.4121097445600757, "grad_norm": 0.21034882666775395, "learning_rate": 0.0006636919886717461, "loss": 0.6544, "step": 1089 }, { "epoch": 0.41248817407757804, "grad_norm": 0.22749609807619245, "learning_rate": 0.0006631125402556303, "loss": 0.6506, "step": 1090 }, { "epoch": 0.41286660359508043, "grad_norm": 0.2866816026286053, "learning_rate": 0.0006625328465783469, "loss": 0.6499, "step": 1091 }, { "epoch": 0.41324503311258276, "grad_norm": 0.25036775838670366, "learning_rate": 0.0006619529085115414, "loss": 0.6348, "step": 1092 }, { "epoch": 0.41362346263008515, "grad_norm": 0.2202964935845974, "learning_rate": 0.0006613727269272275, "loss": 0.6542, "step": 1093 }, { "epoch": 0.41400189214758754, "grad_norm": 0.2452411338236911, "learning_rate": 0.0006607923026977842, "loss": 0.6617, "step": 1094 }, { "epoch": 0.41438032166508987, "grad_norm": 0.22518708941093746, "learning_rate": 0.0006602116366959557, "loss": 0.6308, "step": 1095 }, { "epoch": 0.41475875118259226, "grad_norm": 0.21098750016342246, "learning_rate": 0.0006596307297948498, "loss": 0.659, "step": 1096 }, { "epoch": 0.4151371807000946, "grad_norm": 0.2031598061304769, "learning_rate": 0.0006590495828679365, "loss": 0.6433, "step": 1097 }, { "epoch": 0.415515610217597, "grad_norm": 0.23963955378524612, "learning_rate": 0.0006584681967890466, "loss": 0.6597, "step": 1098 }, { "epoch": 0.41589403973509936, "grad_norm": 0.23393180587793422, "learning_rate": 0.0006578865724323706, "loss": 0.6376, "step": 1099 }, { "epoch": 0.4162724692526017, "grad_norm": 0.2882061852830121, "learning_rate": 0.0006573047106724573, "loss": 0.6393, "step": 1100 }, { "epoch": 0.4166508987701041, "grad_norm": 0.23977628311925184, "learning_rate": 0.0006567226123842124, "loss": 0.6309, "step": 1101 }, { "epoch": 0.4170293282876064, "grad_norm": 0.20640002595756704, "learning_rate": 0.0006561402784428974, "loss": 0.6307, "step": 1102 }, { "epoch": 0.4174077578051088, "grad_norm": 0.3299906688151207, "learning_rate": 0.0006555577097241277, "loss": 0.6461, "step": 1103 }, { "epoch": 0.4177861873226112, "grad_norm": 0.19163664750311477, "learning_rate": 0.0006549749071038723, "loss": 0.6769, "step": 1104 }, { "epoch": 0.4181646168401135, "grad_norm": 0.24581683943395588, "learning_rate": 0.0006543918714584515, "loss": 0.6554, "step": 1105 }, { "epoch": 0.4185430463576159, "grad_norm": 0.3475931731876635, "learning_rate": 0.0006538086036645362, "loss": 0.6268, "step": 1106 }, { "epoch": 0.41892147587511824, "grad_norm": 0.20917591337330776, "learning_rate": 0.0006532251045991462, "loss": 0.6458, "step": 1107 }, { "epoch": 0.41929990539262063, "grad_norm": 0.25218875719251044, "learning_rate": 0.0006526413751396492, "loss": 0.6324, "step": 1108 }, { "epoch": 0.41967833491012296, "grad_norm": 0.23335287215958472, "learning_rate": 0.000652057416163759, "loss": 0.6657, "step": 1109 }, { "epoch": 0.42005676442762535, "grad_norm": 0.23277303469988417, "learning_rate": 0.000651473228549535, "loss": 0.6629, "step": 1110 }, { "epoch": 0.42043519394512774, "grad_norm": 0.18366309134810616, "learning_rate": 0.0006508888131753803, "loss": 0.6531, "step": 1111 }, { "epoch": 0.42081362346263007, "grad_norm": 0.21587680922966618, "learning_rate": 0.0006503041709200399, "loss": 0.6378, "step": 1112 }, { "epoch": 0.42119205298013246, "grad_norm": 0.2026460484574509, "learning_rate": 0.0006497193026626007, "loss": 0.6566, "step": 1113 }, { "epoch": 0.4215704824976348, "grad_norm": 0.23562832146992557, "learning_rate": 0.000649134209282489, "loss": 0.6377, "step": 1114 }, { "epoch": 0.4219489120151372, "grad_norm": 0.20773303549034472, "learning_rate": 0.0006485488916594697, "loss": 0.6375, "step": 1115 }, { "epoch": 0.42232734153263957, "grad_norm": 0.26926060232258175, "learning_rate": 0.0006479633506736446, "loss": 0.6342, "step": 1116 }, { "epoch": 0.4227057710501419, "grad_norm": 7.18352232623207, "learning_rate": 0.0006473775872054521, "loss": 0.6502, "step": 1117 }, { "epoch": 0.4230842005676443, "grad_norm": 0.2850861920558444, "learning_rate": 0.0006467916021356643, "loss": 0.6823, "step": 1118 }, { "epoch": 0.4234626300851466, "grad_norm": 0.20919187886020504, "learning_rate": 0.000646205396345387, "loss": 0.6278, "step": 1119 }, { "epoch": 0.423841059602649, "grad_norm": 0.23283764972152562, "learning_rate": 0.0006456189707160577, "loss": 0.6602, "step": 1120 }, { "epoch": 0.4242194891201514, "grad_norm": 0.23939325041458248, "learning_rate": 0.0006450323261294444, "loss": 0.6958, "step": 1121 }, { "epoch": 0.4245979186376537, "grad_norm": 0.24114727616479248, "learning_rate": 0.0006444454634676447, "loss": 0.6351, "step": 1122 }, { "epoch": 0.4249763481551561, "grad_norm": 0.24401244804218014, "learning_rate": 0.0006438583836130834, "loss": 0.6471, "step": 1123 }, { "epoch": 0.42535477767265845, "grad_norm": 0.21540305421739486, "learning_rate": 0.0006432710874485128, "loss": 0.6553, "step": 1124 }, { "epoch": 0.42573320719016083, "grad_norm": 0.21670208826352733, "learning_rate": 0.0006426835758570098, "loss": 0.6493, "step": 1125 }, { "epoch": 0.4261116367076632, "grad_norm": 0.25631082906041164, "learning_rate": 0.0006420958497219752, "loss": 0.6595, "step": 1126 }, { "epoch": 0.42649006622516555, "grad_norm": 0.3039969786906372, "learning_rate": 0.0006415079099271327, "loss": 0.6369, "step": 1127 }, { "epoch": 0.42686849574266794, "grad_norm": 0.19693095078227518, "learning_rate": 0.0006409197573565272, "loss": 0.6561, "step": 1128 }, { "epoch": 0.4272469252601703, "grad_norm": 0.21464486019357362, "learning_rate": 0.0006403313928945235, "loss": 0.6676, "step": 1129 }, { "epoch": 0.42762535477767266, "grad_norm": 0.210658047259702, "learning_rate": 0.0006397428174258048, "loss": 0.6483, "step": 1130 }, { "epoch": 0.42800378429517505, "grad_norm": 0.19957560012750566, "learning_rate": 0.0006391540318353719, "loss": 0.656, "step": 1131 }, { "epoch": 0.4283822138126774, "grad_norm": 0.19907830467542362, "learning_rate": 0.0006385650370085414, "loss": 0.6191, "step": 1132 }, { "epoch": 0.42876064333017977, "grad_norm": 0.2116564402087292, "learning_rate": 0.0006379758338309445, "loss": 0.6386, "step": 1133 }, { "epoch": 0.4291390728476821, "grad_norm": 0.2260179058161613, "learning_rate": 0.0006373864231885259, "loss": 0.6705, "step": 1134 }, { "epoch": 0.4295175023651845, "grad_norm": 0.20894053703494875, "learning_rate": 0.0006367968059675417, "loss": 0.6532, "step": 1135 }, { "epoch": 0.4298959318826869, "grad_norm": 0.2396659794910547, "learning_rate": 0.0006362069830545594, "loss": 0.644, "step": 1136 }, { "epoch": 0.4302743614001892, "grad_norm": 0.23639659835416194, "learning_rate": 0.000635616955336455, "loss": 0.6177, "step": 1137 }, { "epoch": 0.4306527909176916, "grad_norm": 0.21732930795664512, "learning_rate": 0.000635026723700413, "loss": 0.6608, "step": 1138 }, { "epoch": 0.43103122043519393, "grad_norm": 0.22295243657346897, "learning_rate": 0.0006344362890339242, "loss": 0.6369, "step": 1139 }, { "epoch": 0.4314096499526963, "grad_norm": 0.25970267486762766, "learning_rate": 0.0006338456522247851, "loss": 0.6419, "step": 1140 }, { "epoch": 0.43178807947019865, "grad_norm": 0.2697577699456119, "learning_rate": 0.0006332548141610954, "loss": 0.6231, "step": 1141 }, { "epoch": 0.43216650898770104, "grad_norm": 0.2281034289573507, "learning_rate": 0.0006326637757312582, "loss": 0.6367, "step": 1142 }, { "epoch": 0.4325449385052034, "grad_norm": 0.22103888366689828, "learning_rate": 0.0006320725378239775, "loss": 0.6185, "step": 1143 }, { "epoch": 0.43292336802270576, "grad_norm": 0.18141019212314966, "learning_rate": 0.0006314811013282573, "loss": 0.648, "step": 1144 }, { "epoch": 0.43330179754020814, "grad_norm": 0.1926336251035288, "learning_rate": 0.0006308894671334002, "loss": 0.6504, "step": 1145 }, { "epoch": 0.4336802270577105, "grad_norm": 0.26384359348847763, "learning_rate": 0.0006302976361290061, "loss": 0.657, "step": 1146 }, { "epoch": 0.43405865657521286, "grad_norm": 0.23807551147564623, "learning_rate": 0.0006297056092049707, "loss": 0.65, "step": 1147 }, { "epoch": 0.43443708609271525, "grad_norm": 0.19456659938024015, "learning_rate": 0.0006291133872514843, "loss": 0.6789, "step": 1148 }, { "epoch": 0.4348155156102176, "grad_norm": 0.21406669542976625, "learning_rate": 0.0006285209711590306, "loss": 0.6622, "step": 1149 }, { "epoch": 0.43519394512771997, "grad_norm": 0.20905156704265823, "learning_rate": 0.0006279283618183853, "loss": 0.6422, "step": 1150 }, { "epoch": 0.4355723746452223, "grad_norm": 0.21786914313318165, "learning_rate": 0.0006273355601206143, "loss": 0.6956, "step": 1151 }, { "epoch": 0.4359508041627247, "grad_norm": 0.18264713143853764, "learning_rate": 0.0006267425669570732, "loss": 0.6799, "step": 1152 }, { "epoch": 0.4363292336802271, "grad_norm": 0.19247106280441545, "learning_rate": 0.0006261493832194052, "loss": 0.6252, "step": 1153 }, { "epoch": 0.4367076631977294, "grad_norm": 0.2210387993991227, "learning_rate": 0.0006255560097995399, "loss": 0.6366, "step": 1154 }, { "epoch": 0.4370860927152318, "grad_norm": 0.21323740127651086, "learning_rate": 0.0006249624475896925, "loss": 0.6284, "step": 1155 }, { "epoch": 0.43746452223273413, "grad_norm": 0.19203528369736306, "learning_rate": 0.0006243686974823617, "loss": 0.6622, "step": 1156 }, { "epoch": 0.4378429517502365, "grad_norm": 0.1817855257000669, "learning_rate": 0.0006237747603703291, "loss": 0.6423, "step": 1157 }, { "epoch": 0.4382213812677389, "grad_norm": 0.18519810473113923, "learning_rate": 0.0006231806371466574, "loss": 0.6306, "step": 1158 }, { "epoch": 0.43859981078524124, "grad_norm": 0.2509232104402681, "learning_rate": 0.0006225863287046887, "loss": 0.6279, "step": 1159 }, { "epoch": 0.4389782403027436, "grad_norm": 0.2392118137283396, "learning_rate": 0.0006219918359380443, "loss": 0.6587, "step": 1160 }, { "epoch": 0.43935666982024596, "grad_norm": 0.22332369166765714, "learning_rate": 0.0006213971597406221, "loss": 0.6484, "step": 1161 }, { "epoch": 0.43973509933774835, "grad_norm": 0.2169780060454296, "learning_rate": 0.000620802301006596, "loss": 0.6295, "step": 1162 }, { "epoch": 0.44011352885525074, "grad_norm": 0.2640548750098149, "learning_rate": 0.0006202072606304144, "loss": 0.6425, "step": 1163 }, { "epoch": 0.44049195837275307, "grad_norm": 0.22857094771577288, "learning_rate": 0.000619612039506799, "loss": 0.6585, "step": 1164 }, { "epoch": 0.44087038789025546, "grad_norm": 0.27828189517786545, "learning_rate": 0.0006190166385307427, "loss": 0.6765, "step": 1165 }, { "epoch": 0.4412488174077578, "grad_norm": 0.20777685031716456, "learning_rate": 0.0006184210585975096, "loss": 0.67, "step": 1166 }, { "epoch": 0.4416272469252602, "grad_norm": 0.22054414171125764, "learning_rate": 0.0006178253006026323, "loss": 0.6375, "step": 1167 }, { "epoch": 0.44200567644276256, "grad_norm": 0.19089174212444002, "learning_rate": 0.0006172293654419116, "loss": 0.6477, "step": 1168 }, { "epoch": 0.4423841059602649, "grad_norm": 0.2203364998345959, "learning_rate": 0.0006166332540114139, "loss": 0.6384, "step": 1169 }, { "epoch": 0.4427625354777673, "grad_norm": 0.20833629764991093, "learning_rate": 0.0006160369672074717, "loss": 0.6362, "step": 1170 }, { "epoch": 0.4431409649952696, "grad_norm": 0.26033143397115405, "learning_rate": 0.0006154405059266803, "loss": 0.6523, "step": 1171 }, { "epoch": 0.443519394512772, "grad_norm": 0.20088371452841433, "learning_rate": 0.0006148438710658979, "loss": 0.635, "step": 1172 }, { "epoch": 0.44389782403027433, "grad_norm": 0.1686387395093308, "learning_rate": 0.0006142470635222435, "loss": 0.6418, "step": 1173 }, { "epoch": 0.4442762535477767, "grad_norm": 0.23969516096591806, "learning_rate": 0.0006136500841930957, "loss": 0.6494, "step": 1174 }, { "epoch": 0.4446546830652791, "grad_norm": 0.20368747594364442, "learning_rate": 0.0006130529339760917, "loss": 0.6586, "step": 1175 }, { "epoch": 0.44503311258278144, "grad_norm": 2.080360185725023, "learning_rate": 0.0006124556137691252, "loss": 0.695, "step": 1176 }, { "epoch": 0.44541154210028383, "grad_norm": 0.24087324713132033, "learning_rate": 0.0006118581244703458, "loss": 0.6498, "step": 1177 }, { "epoch": 0.44578997161778616, "grad_norm": 0.2414254921586684, "learning_rate": 0.0006112604669781572, "loss": 0.6574, "step": 1178 }, { "epoch": 0.44616840113528855, "grad_norm": 0.24051521137474338, "learning_rate": 0.0006106626421912162, "loss": 0.6086, "step": 1179 }, { "epoch": 0.44654683065279094, "grad_norm": 0.22163940265306353, "learning_rate": 0.0006100646510084312, "loss": 0.6364, "step": 1180 }, { "epoch": 0.44692526017029327, "grad_norm": 0.2443554429060421, "learning_rate": 0.0006094664943289604, "loss": 0.6388, "step": 1181 }, { "epoch": 0.44730368968779566, "grad_norm": 0.24085111459727962, "learning_rate": 0.000608868173052211, "loss": 0.6619, "step": 1182 }, { "epoch": 0.447682119205298, "grad_norm": 0.20193051372097373, "learning_rate": 0.0006082696880778379, "loss": 0.6413, "step": 1183 }, { "epoch": 0.4480605487228004, "grad_norm": 0.24559226484052302, "learning_rate": 0.000607671040305742, "loss": 0.6402, "step": 1184 }, { "epoch": 0.44843897824030277, "grad_norm": 0.2503619851621073, "learning_rate": 0.000607072230636069, "loss": 0.6277, "step": 1185 }, { "epoch": 0.4488174077578051, "grad_norm": 0.19655091579064274, "learning_rate": 0.0006064732599692079, "loss": 0.6301, "step": 1186 }, { "epoch": 0.4491958372753075, "grad_norm": 0.2608046153339578, "learning_rate": 0.0006058741292057901, "loss": 0.6443, "step": 1187 }, { "epoch": 0.4495742667928098, "grad_norm": 0.23354327922851897, "learning_rate": 0.0006052748392466876, "loss": 0.6096, "step": 1188 }, { "epoch": 0.4499526963103122, "grad_norm": 0.23394244332559716, "learning_rate": 0.0006046753909930115, "loss": 0.6566, "step": 1189 }, { "epoch": 0.4503311258278146, "grad_norm": 0.1952855854563458, "learning_rate": 0.0006040757853461113, "loss": 0.6443, "step": 1190 }, { "epoch": 0.4507095553453169, "grad_norm": 0.22319860658755458, "learning_rate": 0.0006034760232075729, "loss": 0.6421, "step": 1191 }, { "epoch": 0.4510879848628193, "grad_norm": 0.21198370883364887, "learning_rate": 0.0006028761054792176, "loss": 0.6441, "step": 1192 }, { "epoch": 0.45146641438032165, "grad_norm": 0.2182422603663727, "learning_rate": 0.0006022760330631005, "loss": 0.6595, "step": 1193 }, { "epoch": 0.45184484389782403, "grad_norm": 0.24760645717823346, "learning_rate": 0.0006016758068615097, "loss": 0.6397, "step": 1194 }, { "epoch": 0.4522232734153264, "grad_norm": 0.22072697400895314, "learning_rate": 0.0006010754277769641, "loss": 0.6617, "step": 1195 }, { "epoch": 0.45260170293282875, "grad_norm": 0.20270212795336104, "learning_rate": 0.0006004748967122128, "loss": 0.6582, "step": 1196 }, { "epoch": 0.45298013245033114, "grad_norm": 0.22565041483450654, "learning_rate": 0.0005998742145702331, "loss": 0.6652, "step": 1197 }, { "epoch": 0.4533585619678335, "grad_norm": 0.23359863920421692, "learning_rate": 0.0005992733822542299, "loss": 0.6485, "step": 1198 }, { "epoch": 0.45373699148533586, "grad_norm": 0.22988223753981207, "learning_rate": 0.0005986724006676333, "loss": 0.6535, "step": 1199 }, { "epoch": 0.45411542100283825, "grad_norm": 1.053280775282011, "learning_rate": 0.0005980712707140985, "loss": 0.6389, "step": 1200 }, { "epoch": 0.4544938505203406, "grad_norm": 0.22445706718214703, "learning_rate": 0.0005974699932975034, "loss": 0.6481, "step": 1201 }, { "epoch": 0.45487228003784297, "grad_norm": 0.19128042094439404, "learning_rate": 0.0005968685693219476, "loss": 0.6371, "step": 1202 }, { "epoch": 0.4552507095553453, "grad_norm": 0.24989750370348832, "learning_rate": 0.0005962669996917514, "loss": 0.6423, "step": 1203 }, { "epoch": 0.4556291390728477, "grad_norm": 0.21915877839918183, "learning_rate": 0.0005956652853114537, "loss": 0.6601, "step": 1204 }, { "epoch": 0.45600756859035, "grad_norm": 0.2421801781316668, "learning_rate": 0.0005950634270858112, "loss": 0.6267, "step": 1205 }, { "epoch": 0.4563859981078524, "grad_norm": 0.2169347015652061, "learning_rate": 0.0005944614259197972, "loss": 0.6383, "step": 1206 }, { "epoch": 0.4567644276253548, "grad_norm": 4.748406393859944, "learning_rate": 0.0005938592827185994, "loss": 0.6447, "step": 1207 }, { "epoch": 0.45714285714285713, "grad_norm": 0.33657864446003577, "learning_rate": 0.0005932569983876194, "loss": 0.6477, "step": 1208 }, { "epoch": 0.4575212866603595, "grad_norm": 0.28578068023508046, "learning_rate": 0.0005926545738324712, "loss": 0.6294, "step": 1209 }, { "epoch": 0.45789971617786185, "grad_norm": 0.2941189630037208, "learning_rate": 0.0005920520099589791, "loss": 0.6722, "step": 1210 }, { "epoch": 0.45827814569536424, "grad_norm": 0.4463312141662774, "learning_rate": 0.0005914493076731774, "loss": 0.6448, "step": 1211 }, { "epoch": 0.4586565752128666, "grad_norm": 10.86033290233369, "learning_rate": 0.000590846467881308, "loss": 0.6699, "step": 1212 }, { "epoch": 0.45903500473036896, "grad_norm": 0.34005934355157247, "learning_rate": 0.00059024349148982, "loss": 0.645, "step": 1213 }, { "epoch": 0.45941343424787134, "grad_norm": 0.297869947522688, "learning_rate": 0.0005896403794053679, "loss": 0.6646, "step": 1214 }, { "epoch": 0.4597918637653737, "grad_norm": 3.784783721953006, "learning_rate": 0.0005890371325348099, "loss": 0.6304, "step": 1215 }, { "epoch": 0.46017029328287606, "grad_norm": 0.4543662171649694, "learning_rate": 0.000588433751785207, "loss": 0.6737, "step": 1216 }, { "epoch": 0.46054872280037845, "grad_norm": 0.6742951474716232, "learning_rate": 0.0005878302380638216, "loss": 0.6269, "step": 1217 }, { "epoch": 0.4609271523178808, "grad_norm": 0.6347851559495284, "learning_rate": 0.0005872265922781161, "loss": 0.6505, "step": 1218 }, { "epoch": 0.46130558183538317, "grad_norm": 0.4778633031039711, "learning_rate": 0.0005866228153357514, "loss": 0.6372, "step": 1219 }, { "epoch": 0.4616840113528855, "grad_norm": 0.2865795644798127, "learning_rate": 0.0005860189081445854, "loss": 0.6581, "step": 1220 }, { "epoch": 0.4620624408703879, "grad_norm": 0.3845817397605693, "learning_rate": 0.0005854148716126721, "loss": 0.636, "step": 1221 }, { "epoch": 0.4624408703878903, "grad_norm": 0.3050118944709795, "learning_rate": 0.0005848107066482599, "loss": 0.6591, "step": 1222 }, { "epoch": 0.4628192999053926, "grad_norm": 2.5445300581557118, "learning_rate": 0.0005842064141597904, "loss": 0.6421, "step": 1223 }, { "epoch": 0.463197729422895, "grad_norm": 0.39305685280509783, "learning_rate": 0.0005836019950558969, "loss": 0.6217, "step": 1224 }, { "epoch": 0.46357615894039733, "grad_norm": 0.2972912630712438, "learning_rate": 0.000582997450245403, "loss": 0.6461, "step": 1225 }, { "epoch": 0.4639545884578997, "grad_norm": 2.480942664342258, "learning_rate": 0.0005823927806373211, "loss": 0.6804, "step": 1226 }, { "epoch": 0.4643330179754021, "grad_norm": 0.32588928224428076, "learning_rate": 0.0005817879871408519, "loss": 0.6714, "step": 1227 }, { "epoch": 0.46471144749290444, "grad_norm": 0.2965421316309275, "learning_rate": 0.000581183070665382, "loss": 0.6417, "step": 1228 }, { "epoch": 0.4650898770104068, "grad_norm": 0.3628581054045029, "learning_rate": 0.0005805780321204826, "loss": 0.6397, "step": 1229 }, { "epoch": 0.46546830652790916, "grad_norm": 0.3196232754135301, "learning_rate": 0.000579972872415909, "loss": 0.6351, "step": 1230 }, { "epoch": 0.46584673604541155, "grad_norm": 1.442473721851605, "learning_rate": 0.0005793675924615986, "loss": 0.6364, "step": 1231 }, { "epoch": 0.46622516556291393, "grad_norm": 0.3216554124169529, "learning_rate": 0.0005787621931676691, "loss": 0.6383, "step": 1232 }, { "epoch": 0.46660359508041627, "grad_norm": 0.36813953429255186, "learning_rate": 0.0005781566754444181, "loss": 0.6265, "step": 1233 }, { "epoch": 0.46698202459791865, "grad_norm": 0.4185239366706879, "learning_rate": 0.0005775510402023213, "loss": 0.6365, "step": 1234 }, { "epoch": 0.467360454115421, "grad_norm": 0.42766939828024586, "learning_rate": 0.000576945288352031, "loss": 0.6595, "step": 1235 }, { "epoch": 0.4677388836329234, "grad_norm": 0.33047946254240274, "learning_rate": 0.0005763394208043747, "loss": 0.6193, "step": 1236 }, { "epoch": 0.4681173131504257, "grad_norm": 0.2974345305074747, "learning_rate": 0.0005757334384703538, "loss": 0.6485, "step": 1237 }, { "epoch": 0.4684957426679281, "grad_norm": 0.5427705142178216, "learning_rate": 0.000575127342261143, "loss": 0.6389, "step": 1238 }, { "epoch": 0.4688741721854305, "grad_norm": 0.2235397828349514, "learning_rate": 0.0005745211330880871, "loss": 0.6583, "step": 1239 }, { "epoch": 0.4692526017029328, "grad_norm": 0.22774026022319327, "learning_rate": 0.000573914811862702, "loss": 0.668, "step": 1240 }, { "epoch": 0.4696310312204352, "grad_norm": 0.2542482859986032, "learning_rate": 0.0005733083794966709, "loss": 0.6507, "step": 1241 }, { "epoch": 0.47000946073793753, "grad_norm": 0.3243673282884747, "learning_rate": 0.0005727018369018449, "loss": 0.6125, "step": 1242 }, { "epoch": 0.4703878902554399, "grad_norm": 0.26434345196911085, "learning_rate": 0.0005720951849902407, "loss": 0.6725, "step": 1243 }, { "epoch": 0.4707663197729423, "grad_norm": 0.258168581500093, "learning_rate": 0.000571488424674039, "loss": 0.656, "step": 1244 }, { "epoch": 0.47114474929044464, "grad_norm": 0.27956289680498847, "learning_rate": 0.000570881556865584, "loss": 0.6502, "step": 1245 }, { "epoch": 0.47152317880794703, "grad_norm": 0.26155566358120663, "learning_rate": 0.0005702745824773811, "loss": 0.6142, "step": 1246 }, { "epoch": 0.47190160832544936, "grad_norm": 0.3294774973092657, "learning_rate": 0.0005696675024220963, "loss": 0.6401, "step": 1247 }, { "epoch": 0.47228003784295175, "grad_norm": 0.23256819791305675, "learning_rate": 0.0005690603176125543, "loss": 0.6408, "step": 1248 }, { "epoch": 0.47265846736045414, "grad_norm": 0.22810511927002627, "learning_rate": 0.0005684530289617376, "loss": 0.6108, "step": 1249 }, { "epoch": 0.47303689687795647, "grad_norm": 2.0420916627492076, "learning_rate": 0.0005678456373827842, "loss": 0.6399, "step": 1250 }, { "epoch": 0.47341532639545886, "grad_norm": 0.2605534845797768, "learning_rate": 0.0005672381437889876, "loss": 0.6573, "step": 1251 }, { "epoch": 0.4737937559129612, "grad_norm": 0.2591126919522417, "learning_rate": 0.0005666305490937943, "loss": 0.6534, "step": 1252 }, { "epoch": 0.4741721854304636, "grad_norm": 0.24110653583149627, "learning_rate": 0.0005660228542108026, "loss": 0.6269, "step": 1253 }, { "epoch": 0.47455061494796597, "grad_norm": 0.22387109580328438, "learning_rate": 0.000565415060053762, "loss": 0.6342, "step": 1254 }, { "epoch": 0.4749290444654683, "grad_norm": 0.28217621456488806, "learning_rate": 0.0005648071675365708, "loss": 0.635, "step": 1255 }, { "epoch": 0.4753074739829707, "grad_norm": 0.24880704266452244, "learning_rate": 0.0005641991775732756, "loss": 0.6444, "step": 1256 }, { "epoch": 0.475685903500473, "grad_norm": 0.2585159588117338, "learning_rate": 0.000563591091078069, "loss": 0.6372, "step": 1257 }, { "epoch": 0.4760643330179754, "grad_norm": 0.22935676637360583, "learning_rate": 0.0005629829089652894, "loss": 0.6459, "step": 1258 }, { "epoch": 0.4764427625354778, "grad_norm": 0.25516587831980464, "learning_rate": 0.0005623746321494186, "loss": 0.6497, "step": 1259 }, { "epoch": 0.4768211920529801, "grad_norm": 0.18007141026288842, "learning_rate": 0.0005617662615450805, "loss": 0.67, "step": 1260 }, { "epoch": 0.4771996215704825, "grad_norm": 0.2426093565144289, "learning_rate": 0.0005611577980670407, "loss": 0.6521, "step": 1261 }, { "epoch": 0.47757805108798485, "grad_norm": 0.19066351204487675, "learning_rate": 0.0005605492426302042, "loss": 0.6328, "step": 1262 }, { "epoch": 0.47795648060548723, "grad_norm": 0.24539975297207356, "learning_rate": 0.0005599405961496137, "loss": 0.6491, "step": 1263 }, { "epoch": 0.4783349101229896, "grad_norm": 0.2457147131584258, "learning_rate": 0.0005593318595404496, "loss": 0.6583, "step": 1264 }, { "epoch": 0.47871333964049195, "grad_norm": 0.2289576534582798, "learning_rate": 0.0005587230337180275, "loss": 0.6376, "step": 1265 }, { "epoch": 0.47909176915799434, "grad_norm": 0.24406932552625657, "learning_rate": 0.0005581141195977969, "loss": 0.6355, "step": 1266 }, { "epoch": 0.4794701986754967, "grad_norm": 0.2397530238126936, "learning_rate": 0.0005575051180953406, "loss": 0.6223, "step": 1267 }, { "epoch": 0.47984862819299906, "grad_norm": 0.20997301998415274, "learning_rate": 0.0005568960301263724, "loss": 0.6391, "step": 1268 }, { "epoch": 0.4802270577105014, "grad_norm": 0.20915017725690552, "learning_rate": 0.0005562868566067362, "loss": 0.6367, "step": 1269 }, { "epoch": 0.4806054872280038, "grad_norm": 0.23034642092675583, "learning_rate": 0.0005556775984524044, "loss": 0.6485, "step": 1270 }, { "epoch": 0.48098391674550617, "grad_norm": 0.25471090314560846, "learning_rate": 0.000555068256579477, "loss": 0.6528, "step": 1271 }, { "epoch": 0.4813623462630085, "grad_norm": 0.24655291697030418, "learning_rate": 0.0005544588319041796, "loss": 0.6384, "step": 1272 }, { "epoch": 0.4817407757805109, "grad_norm": 0.19328662197200028, "learning_rate": 0.0005538493253428625, "loss": 0.6472, "step": 1273 }, { "epoch": 0.4821192052980132, "grad_norm": 0.2308329759146291, "learning_rate": 0.0005532397378119989, "loss": 0.6389, "step": 1274 }, { "epoch": 0.4824976348155156, "grad_norm": 0.23095370042266092, "learning_rate": 0.0005526300702281838, "loss": 0.6494, "step": 1275 }, { "epoch": 0.482876064333018, "grad_norm": 0.18971523361830486, "learning_rate": 0.0005520203235081328, "loss": 0.6534, "step": 1276 }, { "epoch": 0.48325449385052033, "grad_norm": 0.18412079137419377, "learning_rate": 0.0005514104985686802, "loss": 0.6484, "step": 1277 }, { "epoch": 0.4836329233680227, "grad_norm": 0.21630872291578307, "learning_rate": 0.0005508005963267782, "loss": 0.6338, "step": 1278 }, { "epoch": 0.48401135288552505, "grad_norm": 0.28277097154752256, "learning_rate": 0.0005501906176994948, "loss": 0.6549, "step": 1279 }, { "epoch": 0.48438978240302744, "grad_norm": 0.216849758812047, "learning_rate": 0.0005495805636040134, "loss": 0.6382, "step": 1280 }, { "epoch": 0.4847682119205298, "grad_norm": 2.3901508334778585, "learning_rate": 0.0005489704349576305, "loss": 0.6414, "step": 1281 }, { "epoch": 0.48514664143803216, "grad_norm": 0.2259728387708823, "learning_rate": 0.0005483602326777548, "loss": 0.5978, "step": 1282 }, { "epoch": 0.48552507095553454, "grad_norm": 0.23638804913414116, "learning_rate": 0.0005477499576819059, "loss": 0.6435, "step": 1283 }, { "epoch": 0.4859035004730369, "grad_norm": 0.2653125159058044, "learning_rate": 0.0005471396108877122, "loss": 0.6574, "step": 1284 }, { "epoch": 0.48628192999053926, "grad_norm": 0.24097979045801585, "learning_rate": 0.0005465291932129108, "loss": 0.6409, "step": 1285 }, { "epoch": 0.48666035950804165, "grad_norm": 1.496135661282649, "learning_rate": 0.0005459187055753446, "loss": 0.6457, "step": 1286 }, { "epoch": 0.487038789025544, "grad_norm": 0.20838258677643323, "learning_rate": 0.0005453081488929624, "loss": 0.6415, "step": 1287 }, { "epoch": 0.48741721854304637, "grad_norm": 0.22447478366800597, "learning_rate": 0.0005446975240838164, "loss": 0.6226, "step": 1288 }, { "epoch": 0.4877956480605487, "grad_norm": 0.2529004496167667, "learning_rate": 0.0005440868320660614, "loss": 0.6349, "step": 1289 }, { "epoch": 0.4881740775780511, "grad_norm": 0.27923900831749165, "learning_rate": 0.0005434760737579532, "loss": 0.6649, "step": 1290 }, { "epoch": 0.4885525070955535, "grad_norm": 0.2693498111330117, "learning_rate": 0.0005428652500778471, "loss": 0.6197, "step": 1291 }, { "epoch": 0.4889309366130558, "grad_norm": 0.2830742115083551, "learning_rate": 0.0005422543619441972, "loss": 0.6326, "step": 1292 }, { "epoch": 0.4893093661305582, "grad_norm": 0.2075082035338918, "learning_rate": 0.000541643410275554, "loss": 0.6212, "step": 1293 }, { "epoch": 0.48968779564806053, "grad_norm": 0.2514201448582651, "learning_rate": 0.0005410323959905638, "loss": 0.6517, "step": 1294 }, { "epoch": 0.4900662251655629, "grad_norm": 0.26168870041696946, "learning_rate": 0.0005404213200079669, "loss": 0.6597, "step": 1295 }, { "epoch": 0.4904446546830653, "grad_norm": 0.25225155920826553, "learning_rate": 0.0005398101832465964, "loss": 0.6318, "step": 1296 }, { "epoch": 0.49082308420056764, "grad_norm": 0.2155569204026968, "learning_rate": 0.0005391989866253772, "loss": 0.6276, "step": 1297 }, { "epoch": 0.49120151371807, "grad_norm": 0.26062997982472796, "learning_rate": 0.0005385877310633233, "loss": 0.6667, "step": 1298 }, { "epoch": 0.49157994323557236, "grad_norm": 0.2059871779943209, "learning_rate": 0.0005379764174795381, "loss": 0.6154, "step": 1299 }, { "epoch": 0.49195837275307475, "grad_norm": 0.21221540270398268, "learning_rate": 0.0005373650467932121, "loss": 0.665, "step": 1300 }, { "epoch": 0.4923368022705771, "grad_norm": 0.1948165509619973, "learning_rate": 0.0005367536199236217, "loss": 0.6451, "step": 1301 }, { "epoch": 0.49271523178807947, "grad_norm": 0.1978790640663659, "learning_rate": 0.0005361421377901274, "loss": 0.6504, "step": 1302 }, { "epoch": 0.49309366130558185, "grad_norm": 0.2310009162614246, "learning_rate": 0.000535530601312173, "loss": 0.6196, "step": 1303 }, { "epoch": 0.4934720908230842, "grad_norm": 0.23893385773161976, "learning_rate": 0.0005349190114092842, "loss": 0.6262, "step": 1304 }, { "epoch": 0.4938505203405866, "grad_norm": 0.22299194127113234, "learning_rate": 0.0005343073690010671, "loss": 0.6395, "step": 1305 }, { "epoch": 0.4942289498580889, "grad_norm": 0.23492215186033943, "learning_rate": 0.0005336956750072063, "loss": 0.6139, "step": 1306 }, { "epoch": 0.4946073793755913, "grad_norm": 0.6954996949996664, "learning_rate": 0.0005330839303474641, "loss": 0.6504, "step": 1307 }, { "epoch": 0.4949858088930937, "grad_norm": 0.4602035562910017, "learning_rate": 0.0005324721359416793, "loss": 0.6735, "step": 1308 }, { "epoch": 0.495364238410596, "grad_norm": 0.26628183049233933, "learning_rate": 0.0005318602927097652, "loss": 0.6331, "step": 1309 }, { "epoch": 0.4957426679280984, "grad_norm": 0.2600639465904203, "learning_rate": 0.0005312484015717086, "loss": 0.6403, "step": 1310 }, { "epoch": 0.49612109744560073, "grad_norm": 0.20963805455666784, "learning_rate": 0.0005306364634475684, "loss": 0.6308, "step": 1311 }, { "epoch": 0.4964995269631031, "grad_norm": 0.27871216076674005, "learning_rate": 0.0005300244792574742, "loss": 0.6639, "step": 1312 }, { "epoch": 0.4968779564806055, "grad_norm": 0.26961279754986583, "learning_rate": 0.0005294124499216245, "loss": 0.6212, "step": 1313 }, { "epoch": 0.49725638599810784, "grad_norm": 0.2456967163463096, "learning_rate": 0.0005288003763602862, "loss": 0.6607, "step": 1314 }, { "epoch": 0.49763481551561023, "grad_norm": 0.2602176175753106, "learning_rate": 0.0005281882594937923, "loss": 0.6711, "step": 1315 }, { "epoch": 0.49801324503311256, "grad_norm": 0.2306174832827444, "learning_rate": 0.000527576100242541, "loss": 0.6253, "step": 1316 }, { "epoch": 0.49839167455061495, "grad_norm": 0.2089722977473, "learning_rate": 0.0005269638995269944, "loss": 0.6192, "step": 1317 }, { "epoch": 0.49877010406811734, "grad_norm": 0.2525013275158799, "learning_rate": 0.0005263516582676767, "loss": 0.6411, "step": 1318 }, { "epoch": 0.49914853358561967, "grad_norm": 0.21996126990885098, "learning_rate": 0.0005257393773851734, "loss": 0.6527, "step": 1319 }, { "epoch": 0.49952696310312206, "grad_norm": 0.21146267399459132, "learning_rate": 0.0005251270578001292, "loss": 0.6422, "step": 1320 }, { "epoch": 0.4999053926206244, "grad_norm": 0.26900973521488186, "learning_rate": 0.0005245147004332471, "loss": 0.6347, "step": 1321 }, { "epoch": 0.5002838221381267, "grad_norm": 0.28174466765518813, "learning_rate": 0.0005239023062052873, "loss": 0.6184, "step": 1322 }, { "epoch": 0.5006622516556292, "grad_norm": 3.3235711835294435, "learning_rate": 0.0005232898760370647, "loss": 0.6448, "step": 1323 }, { "epoch": 0.5010406811731315, "grad_norm": 0.25532230764575814, "learning_rate": 0.0005226774108494491, "loss": 0.5975, "step": 1324 }, { "epoch": 0.5014191106906338, "grad_norm": 0.9461981037779655, "learning_rate": 0.0005220649115633622, "loss": 0.6428, "step": 1325 }, { "epoch": 0.5017975402081363, "grad_norm": 0.25515132221852627, "learning_rate": 0.0005214523790997772, "loss": 0.6391, "step": 1326 }, { "epoch": 0.5021759697256386, "grad_norm": 0.40509349091243546, "learning_rate": 0.0005208398143797176, "loss": 0.6484, "step": 1327 }, { "epoch": 0.5025543992431409, "grad_norm": 0.34715468412110184, "learning_rate": 0.0005202272183242547, "loss": 0.6118, "step": 1328 }, { "epoch": 0.5029328287606434, "grad_norm": 0.41691767364056037, "learning_rate": 0.0005196145918545074, "loss": 0.6703, "step": 1329 }, { "epoch": 0.5033112582781457, "grad_norm": 0.30505725237258813, "learning_rate": 0.0005190019358916404, "loss": 0.6377, "step": 1330 }, { "epoch": 0.503689687795648, "grad_norm": 0.29282716508779977, "learning_rate": 0.0005183892513568623, "loss": 0.629, "step": 1331 }, { "epoch": 0.5040681173131504, "grad_norm": 0.30396179436919696, "learning_rate": 0.0005177765391714249, "loss": 0.6537, "step": 1332 }, { "epoch": 0.5044465468306528, "grad_norm": 0.26009434143827825, "learning_rate": 0.0005171638002566218, "loss": 0.6274, "step": 1333 }, { "epoch": 0.5048249763481552, "grad_norm": 0.28726841939480485, "learning_rate": 0.0005165510355337865, "loss": 0.6444, "step": 1334 }, { "epoch": 0.5052034058656575, "grad_norm": 0.2569438432064984, "learning_rate": 0.0005159382459242914, "loss": 0.6407, "step": 1335 }, { "epoch": 0.5055818353831599, "grad_norm": 0.2743009558156543, "learning_rate": 0.0005153254323495464, "loss": 0.6572, "step": 1336 }, { "epoch": 0.5059602649006623, "grad_norm": 0.2645404778463481, "learning_rate": 0.0005147125957309972, "loss": 0.6496, "step": 1337 }, { "epoch": 0.5063386944181646, "grad_norm": 0.19893962341898733, "learning_rate": 0.0005140997369901243, "loss": 0.6241, "step": 1338 }, { "epoch": 0.506717123935667, "grad_norm": 0.22228062047806216, "learning_rate": 0.0005134868570484414, "loss": 0.6352, "step": 1339 }, { "epoch": 0.5070955534531694, "grad_norm": 0.21323262313578206, "learning_rate": 0.0005128739568274944, "loss": 0.6356, "step": 1340 }, { "epoch": 0.5074739829706717, "grad_norm": 0.21473561079625494, "learning_rate": 0.0005122610372488592, "loss": 0.6456, "step": 1341 }, { "epoch": 0.507852412488174, "grad_norm": 0.2357655441736071, "learning_rate": 0.0005116480992341412, "loss": 0.6444, "step": 1342 }, { "epoch": 0.5082308420056765, "grad_norm": 0.18796364759082307, "learning_rate": 0.000511035143704973, "loss": 0.6455, "step": 1343 }, { "epoch": 0.5086092715231788, "grad_norm": 0.44237376829085273, "learning_rate": 0.0005104221715830145, "loss": 0.6394, "step": 1344 }, { "epoch": 0.5089877010406811, "grad_norm": 0.20095484795145788, "learning_rate": 0.0005098091837899495, "loss": 0.6016, "step": 1345 }, { "epoch": 0.5093661305581836, "grad_norm": 0.19094555745706687, "learning_rate": 0.000509196181247486, "loss": 0.6375, "step": 1346 }, { "epoch": 0.5097445600756859, "grad_norm": 0.22210523117767372, "learning_rate": 0.0005085831648773538, "loss": 0.6419, "step": 1347 }, { "epoch": 0.5101229895931882, "grad_norm": 0.2179946972842945, "learning_rate": 0.0005079701356013038, "loss": 0.6329, "step": 1348 }, { "epoch": 0.5105014191106906, "grad_norm": 0.19394927160057934, "learning_rate": 0.0005073570943411061, "loss": 0.6111, "step": 1349 }, { "epoch": 0.510879848628193, "grad_norm": 0.21254516710580768, "learning_rate": 0.0005067440420185491, "loss": 0.6466, "step": 1350 }, { "epoch": 0.5112582781456954, "grad_norm": 0.23129639482335584, "learning_rate": 0.0005061309795554374, "loss": 0.626, "step": 1351 }, { "epoch": 0.5116367076631977, "grad_norm": 0.2123509487366237, "learning_rate": 0.0005055179078735912, "loss": 0.6456, "step": 1352 }, { "epoch": 0.5120151371807001, "grad_norm": 0.20452479236453755, "learning_rate": 0.0005049048278948445, "loss": 0.6393, "step": 1353 }, { "epoch": 0.5123935666982025, "grad_norm": 0.202875736323376, "learning_rate": 0.0005042917405410436, "loss": 0.6611, "step": 1354 }, { "epoch": 0.5127719962157048, "grad_norm": 0.20914793298868575, "learning_rate": 0.0005036786467340459, "loss": 0.6524, "step": 1355 }, { "epoch": 0.5131504257332072, "grad_norm": 0.19361440620722797, "learning_rate": 0.0005030655473957192, "loss": 0.637, "step": 1356 }, { "epoch": 0.5135288552507096, "grad_norm": 0.20138573621240677, "learning_rate": 0.0005024524434479383, "loss": 0.6395, "step": 1357 }, { "epoch": 0.5139072847682119, "grad_norm": 0.23362283015327812, "learning_rate": 0.000501839335812586, "loss": 0.5974, "step": 1358 }, { "epoch": 0.5142857142857142, "grad_norm": 0.22063240673719062, "learning_rate": 0.0005012262254115502, "loss": 0.6664, "step": 1359 }, { "epoch": 0.5146641438032167, "grad_norm": 0.1874538305182987, "learning_rate": 0.0005006131131667233, "loss": 0.6293, "step": 1360 }, { "epoch": 0.515042573320719, "grad_norm": 0.20870387781316857, "learning_rate": 0.0005, "loss": 0.642, "step": 1361 }, { "epoch": 0.5154210028382213, "grad_norm": 0.20663202228198171, "learning_rate": 0.0004993868868332767, "loss": 0.6383, "step": 1362 }, { "epoch": 0.5157994323557238, "grad_norm": 0.2160579788186291, "learning_rate": 0.0004987737745884498, "loss": 0.6275, "step": 1363 }, { "epoch": 0.5161778618732261, "grad_norm": 0.21628191505707986, "learning_rate": 0.000498160664187414, "loss": 0.5967, "step": 1364 }, { "epoch": 0.5165562913907285, "grad_norm": 0.21075391727927067, "learning_rate": 0.0004975475565520618, "loss": 0.632, "step": 1365 }, { "epoch": 0.5169347209082309, "grad_norm": 0.22054797601644588, "learning_rate": 0.000496934452604281, "loss": 0.6478, "step": 1366 }, { "epoch": 0.5173131504257332, "grad_norm": 0.238762276194057, "learning_rate": 0.0004963213532659541, "loss": 0.6421, "step": 1367 }, { "epoch": 0.5176915799432356, "grad_norm": 0.27724147307713204, "learning_rate": 0.0004957082594589566, "loss": 0.6537, "step": 1368 }, { "epoch": 0.5180700094607379, "grad_norm": 0.21758727597181107, "learning_rate": 0.0004950951721051556, "loss": 0.6343, "step": 1369 }, { "epoch": 0.5184484389782403, "grad_norm": 0.23155604904342358, "learning_rate": 0.0004944820921264089, "loss": 0.6142, "step": 1370 }, { "epoch": 0.5188268684957427, "grad_norm": 0.25270067252623263, "learning_rate": 0.0004938690204445627, "loss": 0.6336, "step": 1371 }, { "epoch": 0.519205298013245, "grad_norm": 0.23368380670110708, "learning_rate": 0.0004932559579814511, "loss": 0.6654, "step": 1372 }, { "epoch": 0.5195837275307474, "grad_norm": 0.2619493664097282, "learning_rate": 0.000492642905658894, "loss": 0.6293, "step": 1373 }, { "epoch": 0.5199621570482498, "grad_norm": 0.2456941766731742, "learning_rate": 0.0004920298643986963, "loss": 0.6309, "step": 1374 }, { "epoch": 0.5203405865657521, "grad_norm": 0.2293378919993449, "learning_rate": 0.0004914168351226463, "loss": 0.6195, "step": 1375 }, { "epoch": 0.5207190160832545, "grad_norm": 0.2644182308960059, "learning_rate": 0.0004908038187525141, "loss": 0.6436, "step": 1376 }, { "epoch": 0.5210974456007569, "grad_norm": 0.19487306752081723, "learning_rate": 0.0004901908162100507, "loss": 0.6596, "step": 1377 }, { "epoch": 0.5214758751182592, "grad_norm": 0.2063431731647411, "learning_rate": 0.0004895778284169857, "loss": 0.626, "step": 1378 }, { "epoch": 0.5218543046357615, "grad_norm": 0.2016350312548953, "learning_rate": 0.000488964856295027, "loss": 0.6357, "step": 1379 }, { "epoch": 0.522232734153264, "grad_norm": 0.23237291990542405, "learning_rate": 0.000488351900765859, "loss": 0.6653, "step": 1380 }, { "epoch": 0.5226111636707663, "grad_norm": 0.20707061681201552, "learning_rate": 0.00048773896275114095, "loss": 0.6129, "step": 1381 }, { "epoch": 0.5229895931882687, "grad_norm": 0.24010246035589491, "learning_rate": 0.00048712604317250577, "loss": 0.629, "step": 1382 }, { "epoch": 0.5233680227057711, "grad_norm": 0.23982130374312766, "learning_rate": 0.0004865131429515587, "loss": 0.6143, "step": 1383 }, { "epoch": 0.5237464522232734, "grad_norm": 0.24753127991365884, "learning_rate": 0.00048590026300987586, "loss": 0.6329, "step": 1384 }, { "epoch": 0.5241248817407758, "grad_norm": 0.205799233701473, "learning_rate": 0.000485287404269003, "loss": 0.6489, "step": 1385 }, { "epoch": 0.5245033112582781, "grad_norm": 0.24323865213760873, "learning_rate": 0.0004846745676504537, "loss": 0.6468, "step": 1386 }, { "epoch": 0.5248817407757805, "grad_norm": 0.21896197062543415, "learning_rate": 0.00048406175407570876, "loss": 0.6386, "step": 1387 }, { "epoch": 0.5252601702932829, "grad_norm": 0.21115015048704355, "learning_rate": 0.00048344896446621364, "loss": 0.644, "step": 1388 }, { "epoch": 0.5256385998107852, "grad_norm": 0.2817161326569784, "learning_rate": 0.0004828361997433783, "loss": 0.6301, "step": 1389 }, { "epoch": 0.5260170293282876, "grad_norm": 0.21806282009758365, "learning_rate": 0.0004822234608285752, "loss": 0.6506, "step": 1390 }, { "epoch": 0.52639545884579, "grad_norm": 0.2113318994717386, "learning_rate": 0.0004816107486431379, "loss": 0.6139, "step": 1391 }, { "epoch": 0.5267738883632923, "grad_norm": 0.2289227014299795, "learning_rate": 0.0004809980641083598, "loss": 0.6313, "step": 1392 }, { "epoch": 0.5271523178807948, "grad_norm": 0.23608478912879094, "learning_rate": 0.00048038540814549267, "loss": 0.6233, "step": 1393 }, { "epoch": 0.5275307473982971, "grad_norm": 0.23884908836834406, "learning_rate": 0.0004797727816757454, "loss": 0.6434, "step": 1394 }, { "epoch": 0.5279091769157994, "grad_norm": 0.22972500942453133, "learning_rate": 0.0004791601856202825, "loss": 0.6149, "step": 1395 }, { "epoch": 0.5282876064333017, "grad_norm": 0.21757486296422274, "learning_rate": 0.00047854762090022274, "loss": 0.6245, "step": 1396 }, { "epoch": 0.5286660359508042, "grad_norm": 0.20524838963502973, "learning_rate": 0.000477935088436638, "loss": 0.6499, "step": 1397 }, { "epoch": 0.5290444654683065, "grad_norm": 0.19284878372375555, "learning_rate": 0.000477322589150551, "loss": 0.63, "step": 1398 }, { "epoch": 0.5294228949858089, "grad_norm": 0.221675847562226, "learning_rate": 0.0004767101239629353, "loss": 0.6434, "step": 1399 }, { "epoch": 0.5298013245033113, "grad_norm": 0.2124903617103595, "learning_rate": 0.0004760976937947128, "loss": 0.6406, "step": 1400 }, { "epoch": 0.5301797540208136, "grad_norm": 0.20990331879386923, "learning_rate": 0.0004754852995667529, "loss": 0.634, "step": 1401 }, { "epoch": 0.530558183538316, "grad_norm": 0.22521315981174433, "learning_rate": 0.0004748729421998709, "loss": 0.6288, "step": 1402 }, { "epoch": 0.5309366130558184, "grad_norm": 0.22715518150912709, "learning_rate": 0.0004742606226148267, "loss": 0.6263, "step": 1403 }, { "epoch": 0.5313150425733207, "grad_norm": 0.21875719501379196, "learning_rate": 0.00047364834173232334, "loss": 0.6424, "step": 1404 }, { "epoch": 0.5316934720908231, "grad_norm": 0.23775171630973935, "learning_rate": 0.0004730361004730057, "loss": 0.6346, "step": 1405 }, { "epoch": 0.5320719016083254, "grad_norm": 0.24549062178581998, "learning_rate": 0.0004724238997574591, "loss": 0.6437, "step": 1406 }, { "epoch": 0.5324503311258278, "grad_norm": 0.23037976632281976, "learning_rate": 0.00047181174050620777, "loss": 0.6375, "step": 1407 }, { "epoch": 0.5328287606433302, "grad_norm": 0.242881442142125, "learning_rate": 0.0004711996236397139, "loss": 0.6266, "step": 1408 }, { "epoch": 0.5332071901608325, "grad_norm": 0.19963760848168693, "learning_rate": 0.00047058755007837555, "loss": 0.6212, "step": 1409 }, { "epoch": 0.533585619678335, "grad_norm": 0.20080938364775508, "learning_rate": 0.00046997552074252584, "loss": 0.6245, "step": 1410 }, { "epoch": 0.5339640491958373, "grad_norm": 0.21590327916411406, "learning_rate": 0.0004693635365524316, "loss": 0.6116, "step": 1411 }, { "epoch": 0.5343424787133396, "grad_norm": 0.2610127334868444, "learning_rate": 0.00046875159842829146, "loss": 0.6459, "step": 1412 }, { "epoch": 0.534720908230842, "grad_norm": 0.28961572031458965, "learning_rate": 0.0004681397072902349, "loss": 0.6271, "step": 1413 }, { "epoch": 0.5350993377483444, "grad_norm": 0.2306304784449729, "learning_rate": 0.0004675278640583208, "loss": 0.6394, "step": 1414 }, { "epoch": 0.5354777672658467, "grad_norm": 0.20118623515605769, "learning_rate": 0.00046691606965253596, "loss": 0.6323, "step": 1415 }, { "epoch": 0.5358561967833491, "grad_norm": 0.23454556616510144, "learning_rate": 0.00046630432499279387, "loss": 0.629, "step": 1416 }, { "epoch": 0.5362346263008515, "grad_norm": 0.23537602435996738, "learning_rate": 0.0004656926309989329, "loss": 0.6327, "step": 1417 }, { "epoch": 0.5366130558183538, "grad_norm": 0.2069610620678066, "learning_rate": 0.0004650809885907158, "loss": 0.6057, "step": 1418 }, { "epoch": 0.5369914853358562, "grad_norm": 0.2333822683695073, "learning_rate": 0.00046446939868782707, "loss": 0.6097, "step": 1419 }, { "epoch": 0.5373699148533586, "grad_norm": 0.22304502643413765, "learning_rate": 0.00046385786220987275, "loss": 0.6184, "step": 1420 }, { "epoch": 0.5377483443708609, "grad_norm": 0.20575538701552323, "learning_rate": 0.0004632463800763784, "loss": 0.6499, "step": 1421 }, { "epoch": 0.5381267738883633, "grad_norm": 0.2206906002845674, "learning_rate": 0.0004626349532067879, "loss": 0.6531, "step": 1422 }, { "epoch": 0.5385052034058656, "grad_norm": 0.23380774357221865, "learning_rate": 0.0004620235825204619, "loss": 0.6339, "step": 1423 }, { "epoch": 0.538883632923368, "grad_norm": 0.2035766220939076, "learning_rate": 0.0004614122689366768, "loss": 0.6387, "step": 1424 }, { "epoch": 0.5392620624408704, "grad_norm": 0.22046749229532014, "learning_rate": 0.0004608010133746229, "loss": 0.6317, "step": 1425 }, { "epoch": 0.5396404919583727, "grad_norm": 0.22191070081370062, "learning_rate": 0.0004601898167534035, "loss": 0.6252, "step": 1426 }, { "epoch": 0.5400189214758752, "grad_norm": 0.2321498304540626, "learning_rate": 0.00045957867999203304, "loss": 0.6488, "step": 1427 }, { "epoch": 0.5403973509933775, "grad_norm": 0.25993057979389195, "learning_rate": 0.0004589676040094363, "loss": 0.6332, "step": 1428 }, { "epoch": 0.5407757805108798, "grad_norm": 0.2450420370822912, "learning_rate": 0.00045835658972444597, "loss": 0.614, "step": 1429 }, { "epoch": 0.5411542100283823, "grad_norm": 0.25404182629149846, "learning_rate": 0.00045774563805580276, "loss": 0.6236, "step": 1430 }, { "epoch": 0.5415326395458846, "grad_norm": 0.207232701159418, "learning_rate": 0.0004571347499221528, "loss": 0.6135, "step": 1431 }, { "epoch": 0.5419110690633869, "grad_norm": 0.8561441202901159, "learning_rate": 0.00045652392624204684, "loss": 0.6239, "step": 1432 }, { "epoch": 0.5422894985808893, "grad_norm": 0.21983582217326225, "learning_rate": 0.00045591316793393863, "loss": 0.6212, "step": 1433 }, { "epoch": 0.5426679280983917, "grad_norm": 0.21940521477749947, "learning_rate": 0.00045530247591618356, "loss": 0.6027, "step": 1434 }, { "epoch": 0.543046357615894, "grad_norm": 0.2207733143961657, "learning_rate": 0.0004546918511070376, "loss": 0.6495, "step": 1435 }, { "epoch": 0.5434247871333964, "grad_norm": 0.19699039710307462, "learning_rate": 0.00045408129442465534, "loss": 0.6127, "step": 1436 }, { "epoch": 0.5438032166508988, "grad_norm": 0.24980755939279342, "learning_rate": 0.00045347080678708926, "loss": 0.6181, "step": 1437 }, { "epoch": 0.5441816461684011, "grad_norm": 0.23067621010133946, "learning_rate": 0.0004528603891122878, "loss": 0.621, "step": 1438 }, { "epoch": 0.5445600756859035, "grad_norm": 0.22794540697561202, "learning_rate": 0.0004522500423180942, "loss": 0.6592, "step": 1439 }, { "epoch": 0.5449385052034059, "grad_norm": 0.2173914959896321, "learning_rate": 0.0004516397673222451, "loss": 0.6291, "step": 1440 }, { "epoch": 0.5453169347209083, "grad_norm": 0.26551004334489536, "learning_rate": 0.0004510295650423695, "loss": 0.6262, "step": 1441 }, { "epoch": 0.5456953642384106, "grad_norm": 0.21001003489757553, "learning_rate": 0.00045041943639598654, "loss": 0.6266, "step": 1442 }, { "epoch": 0.5460737937559129, "grad_norm": 0.2566205996708868, "learning_rate": 0.0004498093823005052, "loss": 0.6229, "step": 1443 }, { "epoch": 0.5464522232734154, "grad_norm": 0.21415617765600478, "learning_rate": 0.0004491994036732218, "loss": 0.6245, "step": 1444 }, { "epoch": 0.5468306527909177, "grad_norm": 0.22339891087147878, "learning_rate": 0.00044858950143131975, "loss": 0.5981, "step": 1445 }, { "epoch": 0.54720908230842, "grad_norm": 0.2782890569972234, "learning_rate": 0.00044797967649186723, "loss": 0.6245, "step": 1446 }, { "epoch": 0.5475875118259225, "grad_norm": 0.28944151748261143, "learning_rate": 0.00044736992977181616, "loss": 0.6486, "step": 1447 }, { "epoch": 0.5479659413434248, "grad_norm": 0.21268206188167227, "learning_rate": 0.0004467602621880011, "loss": 0.6407, "step": 1448 }, { "epoch": 0.5483443708609271, "grad_norm": 0.21560669593244086, "learning_rate": 0.00044615067465713753, "loss": 0.6395, "step": 1449 }, { "epoch": 0.5487228003784295, "grad_norm": 0.21560419922164722, "learning_rate": 0.00044554116809582034, "loss": 0.6413, "step": 1450 }, { "epoch": 0.5491012298959319, "grad_norm": 0.18400707793227838, "learning_rate": 0.00044493174342052303, "loss": 0.6477, "step": 1451 }, { "epoch": 0.5494796594134342, "grad_norm": 0.22783850747725637, "learning_rate": 0.00044432240154759555, "loss": 0.6494, "step": 1452 }, { "epoch": 0.5498580889309366, "grad_norm": 0.221258309122085, "learning_rate": 0.00044371314339326383, "loss": 0.6279, "step": 1453 }, { "epoch": 0.550236518448439, "grad_norm": 0.24806533204141856, "learning_rate": 0.0004431039698736276, "loss": 0.6183, "step": 1454 }, { "epoch": 0.5506149479659413, "grad_norm": 0.1852344090442795, "learning_rate": 0.00044249488190465936, "loss": 0.6302, "step": 1455 }, { "epoch": 0.5509933774834437, "grad_norm": 0.23309501972918972, "learning_rate": 0.000441885880402203, "loss": 0.611, "step": 1456 }, { "epoch": 0.5513718070009461, "grad_norm": 0.2240445207753737, "learning_rate": 0.00044127696628197256, "loss": 0.6289, "step": 1457 }, { "epoch": 0.5517502365184485, "grad_norm": 0.1875134800992585, "learning_rate": 0.00044066814045955037, "loss": 0.6317, "step": 1458 }, { "epoch": 0.5521286660359508, "grad_norm": 0.2029405453285743, "learning_rate": 0.0004400594038503864, "loss": 0.6375, "step": 1459 }, { "epoch": 0.5525070955534531, "grad_norm": 0.19268218821078417, "learning_rate": 0.000439450757369796, "loss": 0.6399, "step": 1460 }, { "epoch": 0.5528855250709556, "grad_norm": 0.2899869621568835, "learning_rate": 0.0004388422019329593, "loss": 0.6488, "step": 1461 }, { "epoch": 0.5532639545884579, "grad_norm": 0.22059893891835022, "learning_rate": 0.0004382337384549195, "loss": 0.5993, "step": 1462 }, { "epoch": 0.5536423841059602, "grad_norm": 0.20503635889452215, "learning_rate": 0.0004376253678505815, "loss": 0.6195, "step": 1463 }, { "epoch": 0.5540208136234627, "grad_norm": 0.2603762913240157, "learning_rate": 0.0004370170910347105, "loss": 0.6103, "step": 1464 }, { "epoch": 0.554399243140965, "grad_norm": 0.21100555311145378, "learning_rate": 0.000436408908921931, "loss": 0.635, "step": 1465 }, { "epoch": 0.5547776726584673, "grad_norm": 0.20668900289470715, "learning_rate": 0.00043580082242672446, "loss": 0.6479, "step": 1466 }, { "epoch": 0.5551561021759698, "grad_norm": 0.20971223162299477, "learning_rate": 0.00043519283246342923, "loss": 0.6731, "step": 1467 }, { "epoch": 0.5555345316934721, "grad_norm": 0.2448976833314735, "learning_rate": 0.00043458493994623804, "loss": 0.6329, "step": 1468 }, { "epoch": 0.5559129612109744, "grad_norm": 0.22689461208203165, "learning_rate": 0.00043397714578919744, "loss": 0.6088, "step": 1469 }, { "epoch": 0.5562913907284768, "grad_norm": 0.19063394791286184, "learning_rate": 0.0004333694509062058, "loss": 0.6348, "step": 1470 }, { "epoch": 0.5566698202459792, "grad_norm": 0.2214225925075639, "learning_rate": 0.00043276185621101234, "loss": 0.6105, "step": 1471 }, { "epoch": 0.5570482497634816, "grad_norm": 0.21002144939575818, "learning_rate": 0.00043215436261721563, "loss": 0.6269, "step": 1472 }, { "epoch": 0.5574266792809839, "grad_norm": 0.24616624884439736, "learning_rate": 0.0004315469710382623, "loss": 0.6248, "step": 1473 }, { "epoch": 0.5578051087984863, "grad_norm": 0.21007804266267285, "learning_rate": 0.0004309396823874455, "loss": 0.6423, "step": 1474 }, { "epoch": 0.5581835383159887, "grad_norm": 0.19382879323971008, "learning_rate": 0.0004303324975779036, "loss": 0.6121, "step": 1475 }, { "epoch": 0.558561967833491, "grad_norm": 0.21592501003839085, "learning_rate": 0.00042972541752261885, "loss": 0.5927, "step": 1476 }, { "epoch": 0.5589403973509933, "grad_norm": 0.1911675171068953, "learning_rate": 0.0004291184431344161, "loss": 0.627, "step": 1477 }, { "epoch": 0.5593188268684958, "grad_norm": 0.23073673831465347, "learning_rate": 0.000428511575325961, "loss": 0.6586, "step": 1478 }, { "epoch": 0.5596972563859981, "grad_norm": 0.20141322278485121, "learning_rate": 0.00042790481500975944, "loss": 0.6099, "step": 1479 }, { "epoch": 0.5600756859035004, "grad_norm": 0.19178349434663042, "learning_rate": 0.00042729816309815505, "loss": 0.6265, "step": 1480 }, { "epoch": 0.5604541154210029, "grad_norm": 0.2228824236377465, "learning_rate": 0.0004266916205033291, "loss": 0.6418, "step": 1481 }, { "epoch": 0.5608325449385052, "grad_norm": 0.2661055257524558, "learning_rate": 0.00042608518813729804, "loss": 0.6126, "step": 1482 }, { "epoch": 0.5612109744560075, "grad_norm": 0.20724271375182562, "learning_rate": 0.0004254788669119127, "loss": 0.6182, "step": 1483 }, { "epoch": 0.56158940397351, "grad_norm": 0.20848970604431621, "learning_rate": 0.00042487265773885705, "loss": 0.6035, "step": 1484 }, { "epoch": 0.5619678334910123, "grad_norm": 0.2256265248698857, "learning_rate": 0.0004242665615296461, "loss": 0.6277, "step": 1485 }, { "epoch": 0.5623462630085146, "grad_norm": 0.23611907540396393, "learning_rate": 0.0004236605791956254, "loss": 0.6166, "step": 1486 }, { "epoch": 0.562724692526017, "grad_norm": 0.20731383058214703, "learning_rate": 0.00042305471164796903, "loss": 0.6326, "step": 1487 }, { "epoch": 0.5631031220435194, "grad_norm": 0.20337624783593247, "learning_rate": 0.0004224489597976787, "loss": 0.5952, "step": 1488 }, { "epoch": 0.5634815515610218, "grad_norm": 0.2091744708097267, "learning_rate": 0.00042184332455558193, "loss": 0.6323, "step": 1489 }, { "epoch": 0.5638599810785241, "grad_norm": 0.21168051652452086, "learning_rate": 0.0004212378068323312, "loss": 0.6419, "step": 1490 }, { "epoch": 0.5642384105960265, "grad_norm": 0.20733291313063898, "learning_rate": 0.0004206324075384017, "loss": 0.6294, "step": 1491 }, { "epoch": 0.5646168401135289, "grad_norm": 0.20010354334528793, "learning_rate": 0.0004200271275840911, "loss": 0.6564, "step": 1492 }, { "epoch": 0.5649952696310312, "grad_norm": 0.22324711217646448, "learning_rate": 0.00041942196787951753, "loss": 0.6165, "step": 1493 }, { "epoch": 0.5653736991485336, "grad_norm": 0.22912587088093178, "learning_rate": 0.0004188169293346183, "loss": 0.643, "step": 1494 }, { "epoch": 0.565752128666036, "grad_norm": 0.19827882775765332, "learning_rate": 0.0004182120128591482, "loss": 0.6273, "step": 1495 }, { "epoch": 0.5661305581835383, "grad_norm": 0.20616233870122336, "learning_rate": 0.0004176072193626791, "loss": 0.6289, "step": 1496 }, { "epoch": 0.5665089877010406, "grad_norm": 0.26775930354840405, "learning_rate": 0.0004170025497545973, "loss": 0.6099, "step": 1497 }, { "epoch": 0.5668874172185431, "grad_norm": 0.25093952603312036, "learning_rate": 0.0004163980049441033, "loss": 0.6217, "step": 1498 }, { "epoch": 0.5672658467360454, "grad_norm": 0.21075067370477876, "learning_rate": 0.00041579358584020964, "loss": 0.6385, "step": 1499 }, { "epoch": 0.5676442762535477, "grad_norm": 0.2412889331325081, "learning_rate": 0.00041518929335174014, "loss": 0.6359, "step": 1500 }, { "epoch": 0.5680227057710502, "grad_norm": 0.21114418711974853, "learning_rate": 0.000414585128387328, "loss": 0.6507, "step": 1501 }, { "epoch": 0.5684011352885525, "grad_norm": 0.21594545352492733, "learning_rate": 0.0004139810918554147, "loss": 0.633, "step": 1502 }, { "epoch": 0.5687795648060548, "grad_norm": 0.2409804242734121, "learning_rate": 0.0004133771846642487, "loss": 0.6511, "step": 1503 }, { "epoch": 0.5691579943235573, "grad_norm": 0.22625664092692127, "learning_rate": 0.00041277340772188405, "loss": 0.6187, "step": 1504 }, { "epoch": 0.5695364238410596, "grad_norm": 0.22838322969358804, "learning_rate": 0.00041216976193617853, "loss": 0.6306, "step": 1505 }, { "epoch": 0.569914853358562, "grad_norm": 0.23089380185090252, "learning_rate": 0.0004115662482147932, "loss": 0.6118, "step": 1506 }, { "epoch": 0.5702932828760643, "grad_norm": 0.2225679088482689, "learning_rate": 0.00041096286746519035, "loss": 0.6044, "step": 1507 }, { "epoch": 0.5706717123935667, "grad_norm": 0.22867972847576948, "learning_rate": 0.0004103596205946323, "loss": 0.6213, "step": 1508 }, { "epoch": 0.5710501419110691, "grad_norm": 0.2244874374447337, "learning_rate": 0.0004097565085101801, "loss": 0.6079, "step": 1509 }, { "epoch": 0.5714285714285714, "grad_norm": 0.21929574905666366, "learning_rate": 0.0004091535321186921, "loss": 0.634, "step": 1510 }, { "epoch": 0.5718070009460738, "grad_norm": 0.21299761962008262, "learning_rate": 0.00040855069232682275, "loss": 0.6202, "step": 1511 }, { "epoch": 0.5721854304635762, "grad_norm": 0.22624197023590015, "learning_rate": 0.00040794799004102093, "loss": 0.6248, "step": 1512 }, { "epoch": 0.5725638599810785, "grad_norm": 0.230198499669477, "learning_rate": 0.0004073454261675288, "loss": 0.6299, "step": 1513 }, { "epoch": 0.5729422894985808, "grad_norm": 0.1960010788325864, "learning_rate": 0.00040674300161238063, "loss": 0.6031, "step": 1514 }, { "epoch": 0.5733207190160833, "grad_norm": 0.20251816188346722, "learning_rate": 0.00040614071728140076, "loss": 0.6268, "step": 1515 }, { "epoch": 0.5736991485335856, "grad_norm": 0.21847084659961266, "learning_rate": 0.00040553857408020307, "loss": 0.6342, "step": 1516 }, { "epoch": 0.5740775780510879, "grad_norm": 0.22484304463777585, "learning_rate": 0.00040493657291418896, "loss": 0.5956, "step": 1517 }, { "epoch": 0.5744560075685904, "grad_norm": 0.2259712561509403, "learning_rate": 0.0004043347146885465, "loss": 0.6401, "step": 1518 }, { "epoch": 0.5748344370860927, "grad_norm": 0.18505815088224137, "learning_rate": 0.00040373300030824885, "loss": 0.6072, "step": 1519 }, { "epoch": 0.575212866603595, "grad_norm": 0.22069087036679216, "learning_rate": 0.0004031314306780525, "loss": 0.6272, "step": 1520 }, { "epoch": 0.5755912961210975, "grad_norm": 0.22184557316631828, "learning_rate": 0.00040253000670249675, "loss": 0.6156, "step": 1521 }, { "epoch": 0.5759697256385998, "grad_norm": 0.21416920626714145, "learning_rate": 0.0004019287292859016, "loss": 0.6101, "step": 1522 }, { "epoch": 0.5763481551561022, "grad_norm": 0.20943322510826043, "learning_rate": 0.0004013275993323667, "loss": 0.6418, "step": 1523 }, { "epoch": 0.5767265846736045, "grad_norm": 0.220874314884058, "learning_rate": 0.00040072661774577024, "loss": 0.6137, "step": 1524 }, { "epoch": 0.5771050141911069, "grad_norm": 0.20698957203661256, "learning_rate": 0.00040012578542976705, "loss": 0.6165, "step": 1525 }, { "epoch": 0.5774834437086093, "grad_norm": 0.19013568973899575, "learning_rate": 0.0003995251032877874, "loss": 0.6216, "step": 1526 }, { "epoch": 0.5778618732261116, "grad_norm": 0.2061922057413535, "learning_rate": 0.0003989245722230361, "loss": 0.6215, "step": 1527 }, { "epoch": 0.578240302743614, "grad_norm": 0.26762952102127946, "learning_rate": 0.0003983241931384905, "loss": 0.6261, "step": 1528 }, { "epoch": 0.5786187322611164, "grad_norm": 0.22473503412450943, "learning_rate": 0.00039772396693689974, "loss": 0.6102, "step": 1529 }, { "epoch": 0.5789971617786187, "grad_norm": 0.27410069829057876, "learning_rate": 0.0003971238945207827, "loss": 0.6349, "step": 1530 }, { "epoch": 0.5793755912961212, "grad_norm": 0.24040275193533955, "learning_rate": 0.00039652397679242735, "loss": 0.6317, "step": 1531 }, { "epoch": 0.5797540208136235, "grad_norm": 0.2701395606141836, "learning_rate": 0.0003959242146538888, "loss": 0.6528, "step": 1532 }, { "epoch": 0.5801324503311258, "grad_norm": 0.20483767627758062, "learning_rate": 0.00039532460900698857, "loss": 0.637, "step": 1533 }, { "epoch": 0.5805108798486281, "grad_norm": 0.25800437722817715, "learning_rate": 0.0003947251607533125, "loss": 0.6321, "step": 1534 }, { "epoch": 0.5808893093661306, "grad_norm": 0.2075491132408025, "learning_rate": 0.00039412587079421, "loss": 0.6316, "step": 1535 }, { "epoch": 0.5812677388836329, "grad_norm": 0.22008197438452307, "learning_rate": 0.00039352674003079223, "loss": 0.621, "step": 1536 }, { "epoch": 0.5816461684011353, "grad_norm": 0.20227052622986677, "learning_rate": 0.0003929277693639313, "loss": 0.6217, "step": 1537 }, { "epoch": 0.5820245979186377, "grad_norm": 0.20423353756031098, "learning_rate": 0.00039232895969425825, "loss": 0.6394, "step": 1538 }, { "epoch": 0.58240302743614, "grad_norm": 0.23015177475721293, "learning_rate": 0.0003917303119221623, "loss": 0.6205, "step": 1539 }, { "epoch": 0.5827814569536424, "grad_norm": 0.208054855113413, "learning_rate": 0.0003911318269477892, "loss": 0.6181, "step": 1540 }, { "epoch": 0.5831598864711447, "grad_norm": 0.22856157607052713, "learning_rate": 0.00039053350567103985, "loss": 0.6225, "step": 1541 }, { "epoch": 0.5835383159886471, "grad_norm": 0.23347813350920563, "learning_rate": 0.0003899353489915689, "loss": 0.6174, "step": 1542 }, { "epoch": 0.5839167455061495, "grad_norm": 0.20847718914724758, "learning_rate": 0.00038933735780878376, "loss": 0.6145, "step": 1543 }, { "epoch": 0.5842951750236518, "grad_norm": 0.23387999714090493, "learning_rate": 0.00038873953302184284, "loss": 0.6168, "step": 1544 }, { "epoch": 0.5846736045411542, "grad_norm": 0.2537658009518815, "learning_rate": 0.0003881418755296544, "loss": 0.6105, "step": 1545 }, { "epoch": 0.5850520340586566, "grad_norm": 0.24689925533362375, "learning_rate": 0.00038754438623087506, "loss": 0.6317, "step": 1546 }, { "epoch": 0.5854304635761589, "grad_norm": 0.20801738484166224, "learning_rate": 0.0003869470660239085, "loss": 0.6328, "step": 1547 }, { "epoch": 0.5858088930936614, "grad_norm": 0.2257210133066878, "learning_rate": 0.0003863499158069044, "loss": 0.6151, "step": 1548 }, { "epoch": 0.5861873226111637, "grad_norm": 0.19950170617879792, "learning_rate": 0.0003857529364777567, "loss": 0.6072, "step": 1549 }, { "epoch": 0.586565752128666, "grad_norm": 0.21493779638258323, "learning_rate": 0.00038515612893410227, "loss": 0.6366, "step": 1550 }, { "epoch": 0.5869441816461684, "grad_norm": 0.19912534281718502, "learning_rate": 0.00038455949407331994, "loss": 0.6539, "step": 1551 }, { "epoch": 0.5873226111636708, "grad_norm": 0.22109226084770436, "learning_rate": 0.00038396303279252854, "loss": 0.62, "step": 1552 }, { "epoch": 0.5877010406811731, "grad_norm": 0.18864178703921483, "learning_rate": 0.00038336674598858617, "loss": 0.6068, "step": 1553 }, { "epoch": 0.5880794701986755, "grad_norm": 0.23526083911072138, "learning_rate": 0.0003827706345580886, "loss": 0.6052, "step": 1554 }, { "epoch": 0.5884578997161779, "grad_norm": 0.2071290145219073, "learning_rate": 0.00038217469939736776, "loss": 0.6169, "step": 1555 }, { "epoch": 0.5888363292336802, "grad_norm": 0.19508126876186158, "learning_rate": 0.00038157894140249045, "loss": 0.6649, "step": 1556 }, { "epoch": 0.5892147587511826, "grad_norm": 0.22725930431673252, "learning_rate": 0.0003809833614692573, "loss": 0.6242, "step": 1557 }, { "epoch": 0.589593188268685, "grad_norm": 0.24230121434791438, "learning_rate": 0.00038038796049320113, "loss": 0.6126, "step": 1558 }, { "epoch": 0.5899716177861873, "grad_norm": 0.23195574202945093, "learning_rate": 0.00037979273936958563, "loss": 0.6289, "step": 1559 }, { "epoch": 0.5903500473036897, "grad_norm": 0.25193641666339367, "learning_rate": 0.0003791976989934041, "loss": 0.6278, "step": 1560 }, { "epoch": 0.590728476821192, "grad_norm": 0.2557857786137724, "learning_rate": 0.000378602840259378, "loss": 0.6175, "step": 1561 }, { "epoch": 0.5911069063386944, "grad_norm": 0.1899979612592437, "learning_rate": 0.00037800816406195574, "loss": 0.6134, "step": 1562 }, { "epoch": 0.5914853358561968, "grad_norm": 0.2010194515808461, "learning_rate": 0.0003774136712953113, "loss": 0.6313, "step": 1563 }, { "epoch": 0.5918637653736991, "grad_norm": 0.23237361727534514, "learning_rate": 0.00037681936285334267, "loss": 0.6338, "step": 1564 }, { "epoch": 0.5922421948912016, "grad_norm": 0.20643730256304768, "learning_rate": 0.0003762252396296709, "loss": 0.619, "step": 1565 }, { "epoch": 0.5926206244087039, "grad_norm": 0.2380375632257292, "learning_rate": 0.0003756313025176384, "loss": 0.6264, "step": 1566 }, { "epoch": 0.5929990539262062, "grad_norm": 0.22515084058311147, "learning_rate": 0.0003750375524103077, "loss": 0.6407, "step": 1567 }, { "epoch": 0.5933774834437087, "grad_norm": 0.23222454894711256, "learning_rate": 0.0003744439902004603, "loss": 0.6336, "step": 1568 }, { "epoch": 0.593755912961211, "grad_norm": 0.22059978327489774, "learning_rate": 0.000373850616780595, "loss": 0.6363, "step": 1569 }, { "epoch": 0.5941343424787133, "grad_norm": 0.27166223531650535, "learning_rate": 0.0003732574330429268, "loss": 0.6409, "step": 1570 }, { "epoch": 0.5945127719962157, "grad_norm": 0.19139042613735202, "learning_rate": 0.0003726644398793857, "loss": 0.6207, "step": 1571 }, { "epoch": 0.5948912015137181, "grad_norm": 0.2106010261214944, "learning_rate": 0.0003720716381816148, "loss": 0.6249, "step": 1572 }, { "epoch": 0.5952696310312204, "grad_norm": 0.2077119899709748, "learning_rate": 0.00037147902884096946, "loss": 0.6334, "step": 1573 }, { "epoch": 0.5956480605487228, "grad_norm": 0.23123955735264126, "learning_rate": 0.00037088661274851584, "loss": 0.6117, "step": 1574 }, { "epoch": 0.5960264900662252, "grad_norm": 0.2277584410002432, "learning_rate": 0.0003702943907950295, "loss": 0.6595, "step": 1575 }, { "epoch": 0.5964049195837275, "grad_norm": 0.20741310532129295, "learning_rate": 0.0003697023638709941, "loss": 0.6387, "step": 1576 }, { "epoch": 0.5967833491012299, "grad_norm": 0.2231337797216679, "learning_rate": 0.0003691105328665999, "loss": 0.6346, "step": 1577 }, { "epoch": 0.5971617786187322, "grad_norm": 0.205437517874647, "learning_rate": 0.00036851889867174273, "loss": 0.6527, "step": 1578 }, { "epoch": 0.5975402081362347, "grad_norm": 0.2079829348879038, "learning_rate": 0.00036792746217602254, "loss": 0.6168, "step": 1579 }, { "epoch": 0.597918637653737, "grad_norm": 0.20561694251161516, "learning_rate": 0.0003673362242687418, "loss": 0.602, "step": 1580 }, { "epoch": 0.5982970671712393, "grad_norm": 0.24579937297552507, "learning_rate": 0.0003667451858389047, "loss": 0.6186, "step": 1581 }, { "epoch": 0.5986754966887418, "grad_norm": 0.2139089709012041, "learning_rate": 0.0003661543477752151, "loss": 0.5884, "step": 1582 }, { "epoch": 0.5990539262062441, "grad_norm": 0.2683697011368554, "learning_rate": 0.0003655637109660758, "loss": 0.6288, "step": 1583 }, { "epoch": 0.5994323557237464, "grad_norm": 0.21332479528450607, "learning_rate": 0.00036497327629958704, "loss": 0.6035, "step": 1584 }, { "epoch": 0.5998107852412489, "grad_norm": 0.21583424173689256, "learning_rate": 0.00036438304466354497, "loss": 0.5939, "step": 1585 }, { "epoch": 0.6001892147587512, "grad_norm": 0.207489675730353, "learning_rate": 0.0003637930169454407, "loss": 0.6287, "step": 1586 }, { "epoch": 0.6005676442762535, "grad_norm": 0.19789788926711135, "learning_rate": 0.00036320319403245827, "loss": 0.6297, "step": 1587 }, { "epoch": 0.6009460737937559, "grad_norm": 0.2256586857532055, "learning_rate": 0.0003626135768114742, "loss": 0.6008, "step": 1588 }, { "epoch": 0.6013245033112583, "grad_norm": 0.21023077370214593, "learning_rate": 0.0003620241661690555, "loss": 0.6394, "step": 1589 }, { "epoch": 0.6017029328287606, "grad_norm": 0.20737692027901575, "learning_rate": 0.0003614349629914586, "loss": 0.6141, "step": 1590 }, { "epoch": 0.602081362346263, "grad_norm": 0.2170469457032919, "learning_rate": 0.0003608459681646282, "loss": 0.6229, "step": 1591 }, { "epoch": 0.6024597918637654, "grad_norm": 0.232835681772426, "learning_rate": 0.0003602571825741953, "loss": 0.6232, "step": 1592 }, { "epoch": 0.6028382213812677, "grad_norm": 0.24175716437658012, "learning_rate": 0.0003596686071054767, "loss": 0.6344, "step": 1593 }, { "epoch": 0.6032166508987701, "grad_norm": 0.2159617206719502, "learning_rate": 0.0003590802426434728, "loss": 0.62, "step": 1594 }, { "epoch": 0.6035950804162725, "grad_norm": 0.19315975172827513, "learning_rate": 0.0003584920900728673, "loss": 0.6137, "step": 1595 }, { "epoch": 0.6039735099337749, "grad_norm": 0.20184133302074536, "learning_rate": 0.0003579041502780249, "loss": 0.6094, "step": 1596 }, { "epoch": 0.6043519394512772, "grad_norm": 0.19321305104081032, "learning_rate": 0.00035731642414299036, "loss": 0.5985, "step": 1597 }, { "epoch": 0.6047303689687795, "grad_norm": 0.18527092742345594, "learning_rate": 0.0003567289125514872, "loss": 0.6161, "step": 1598 }, { "epoch": 0.605108798486282, "grad_norm": 0.24833409508619242, "learning_rate": 0.0003561416163869166, "loss": 0.6187, "step": 1599 }, { "epoch": 0.6054872280037843, "grad_norm": 0.20104860955110682, "learning_rate": 0.0003555545365323555, "loss": 0.641, "step": 1600 }, { "epoch": 0.6058656575212866, "grad_norm": 0.22001532335994187, "learning_rate": 0.00035496767387055563, "loss": 0.6205, "step": 1601 }, { "epoch": 0.6062440870387891, "grad_norm": 0.24426159722626115, "learning_rate": 0.00035438102928394237, "loss": 0.6279, "step": 1602 }, { "epoch": 0.6066225165562914, "grad_norm": 0.20260397405214806, "learning_rate": 0.000353794603654613, "loss": 0.6164, "step": 1603 }, { "epoch": 0.6070009460737937, "grad_norm": 0.23896117213413506, "learning_rate": 0.0003532083978643357, "loss": 0.6439, "step": 1604 }, { "epoch": 0.6073793755912961, "grad_norm": 0.22115547107216002, "learning_rate": 0.00035262241279454787, "loss": 0.6523, "step": 1605 }, { "epoch": 0.6077578051087985, "grad_norm": 0.2713422883717193, "learning_rate": 0.0003520366493263554, "loss": 0.6225, "step": 1606 }, { "epoch": 0.6081362346263008, "grad_norm": 0.22236487916180453, "learning_rate": 0.00035145110834053047, "loss": 0.6335, "step": 1607 }, { "epoch": 0.6085146641438032, "grad_norm": 0.2451800991344555, "learning_rate": 0.00035086579071751106, "loss": 0.6192, "step": 1608 }, { "epoch": 0.6088930936613056, "grad_norm": 0.2511563677472461, "learning_rate": 0.0003502806973373993, "loss": 0.6234, "step": 1609 }, { "epoch": 0.609271523178808, "grad_norm": 0.19865033457843326, "learning_rate": 0.00034969582907996014, "loss": 0.6146, "step": 1610 }, { "epoch": 0.6096499526963103, "grad_norm": 0.20049475628281532, "learning_rate": 0.00034911118682461983, "loss": 0.662, "step": 1611 }, { "epoch": 0.6100283822138127, "grad_norm": 0.2232323605167567, "learning_rate": 0.000348526771450465, "loss": 0.6342, "step": 1612 }, { "epoch": 0.6104068117313151, "grad_norm": 0.27286693445047, "learning_rate": 0.00034794258383624114, "loss": 0.6163, "step": 1613 }, { "epoch": 0.6107852412488174, "grad_norm": 0.5385439493314178, "learning_rate": 0.000347358624860351, "loss": 0.6169, "step": 1614 }, { "epoch": 0.6111636707663197, "grad_norm": 0.21540922826902065, "learning_rate": 0.0003467748954008539, "loss": 0.5867, "step": 1615 }, { "epoch": 0.6115421002838222, "grad_norm": 0.2646682844426832, "learning_rate": 0.00034619139633546383, "loss": 0.6165, "step": 1616 }, { "epoch": 0.6119205298013245, "grad_norm": 0.20009347139583686, "learning_rate": 0.0003456081285415486, "loss": 0.6319, "step": 1617 }, { "epoch": 0.6122989593188268, "grad_norm": 0.20725135978441311, "learning_rate": 0.0003450250928961278, "loss": 0.639, "step": 1618 }, { "epoch": 0.6126773888363293, "grad_norm": 0.2089218152965985, "learning_rate": 0.0003444422902758724, "loss": 0.6286, "step": 1619 }, { "epoch": 0.6130558183538316, "grad_norm": 0.21999184594703083, "learning_rate": 0.0003438597215571027, "loss": 0.6115, "step": 1620 }, { "epoch": 0.6134342478713339, "grad_norm": 0.24113160346102105, "learning_rate": 0.0003432773876157876, "loss": 0.6269, "step": 1621 }, { "epoch": 0.6138126773888364, "grad_norm": 0.18696241998899676, "learning_rate": 0.00034269528932754273, "loss": 0.5995, "step": 1622 }, { "epoch": 0.6141911069063387, "grad_norm": 0.21175127764894, "learning_rate": 0.0003421134275676294, "loss": 0.6318, "step": 1623 }, { "epoch": 0.614569536423841, "grad_norm": 0.22588395584340082, "learning_rate": 0.00034153180321095343, "loss": 0.6435, "step": 1624 }, { "epoch": 0.6149479659413434, "grad_norm": 0.24519942526462152, "learning_rate": 0.00034095041713206367, "loss": 0.6028, "step": 1625 }, { "epoch": 0.6153263954588458, "grad_norm": 0.20894126804022528, "learning_rate": 0.0003403692702051503, "loss": 0.6166, "step": 1626 }, { "epoch": 0.6157048249763482, "grad_norm": 0.19943677901732723, "learning_rate": 0.0003397883633040445, "loss": 0.6228, "step": 1627 }, { "epoch": 0.6160832544938505, "grad_norm": 0.22914208836678357, "learning_rate": 0.00033920769730221603, "loss": 0.5744, "step": 1628 }, { "epoch": 0.6164616840113529, "grad_norm": 0.2542528867398264, "learning_rate": 0.00033862727307277265, "loss": 0.6366, "step": 1629 }, { "epoch": 0.6168401135288553, "grad_norm": 0.2709274892486551, "learning_rate": 0.0003380470914884586, "loss": 0.6207, "step": 1630 }, { "epoch": 0.6172185430463576, "grad_norm": 0.1993797250265513, "learning_rate": 0.00033746715342165324, "loss": 0.5882, "step": 1631 }, { "epoch": 0.61759697256386, "grad_norm": 0.26583211902014203, "learning_rate": 0.00033688745974436976, "loss": 0.6097, "step": 1632 }, { "epoch": 0.6179754020813624, "grad_norm": 0.22115005544536293, "learning_rate": 0.00033630801132825393, "loss": 0.6182, "step": 1633 }, { "epoch": 0.6183538315988647, "grad_norm": 0.24708868418303784, "learning_rate": 0.00033572880904458267, "loss": 0.5891, "step": 1634 }, { "epoch": 0.618732261116367, "grad_norm": 0.21573579436497184, "learning_rate": 0.00033514985376426276, "loss": 0.64, "step": 1635 }, { "epoch": 0.6191106906338695, "grad_norm": 0.22702104927559852, "learning_rate": 0.0003345711463578297, "loss": 0.6389, "step": 1636 }, { "epoch": 0.6194891201513718, "grad_norm": 0.2257705692550083, "learning_rate": 0.00033399268769544635, "loss": 0.6447, "step": 1637 }, { "epoch": 0.6198675496688741, "grad_norm": 0.24143212713661769, "learning_rate": 0.0003334144786469012, "loss": 0.6008, "step": 1638 }, { "epoch": 0.6202459791863766, "grad_norm": 0.20725677909911228, "learning_rate": 0.0003328365200816077, "loss": 0.6177, "step": 1639 }, { "epoch": 0.6206244087038789, "grad_norm": 0.23509938829428415, "learning_rate": 0.0003322588128686027, "loss": 0.5842, "step": 1640 }, { "epoch": 0.6210028382213812, "grad_norm": 0.20195030785962387, "learning_rate": 0.0003316813578765449, "loss": 0.6079, "step": 1641 }, { "epoch": 0.6213812677388836, "grad_norm": 0.25457332719929254, "learning_rate": 0.0003311041559737139, "loss": 0.6017, "step": 1642 }, { "epoch": 0.621759697256386, "grad_norm": 0.2052979154356468, "learning_rate": 0.00033052720802800883, "loss": 0.6097, "step": 1643 }, { "epoch": 0.6221381267738884, "grad_norm": 0.21495308261545656, "learning_rate": 0.0003299505149069467, "loss": 0.6402, "step": 1644 }, { "epoch": 0.6225165562913907, "grad_norm": 0.20394725533411034, "learning_rate": 0.0003293740774776615, "loss": 0.6143, "step": 1645 }, { "epoch": 0.6228949858088931, "grad_norm": 0.2227545738628305, "learning_rate": 0.00032879789660690285, "loss": 0.6375, "step": 1646 }, { "epoch": 0.6232734153263955, "grad_norm": 0.19053300842279589, "learning_rate": 0.00032822197316103457, "loss": 0.6205, "step": 1647 }, { "epoch": 0.6236518448438978, "grad_norm": 0.21424459661794285, "learning_rate": 0.0003276463080060331, "loss": 0.6099, "step": 1648 }, { "epoch": 0.6240302743614002, "grad_norm": 0.21002707216057706, "learning_rate": 0.000327070902007487, "loss": 0.6308, "step": 1649 }, { "epoch": 0.6244087038789026, "grad_norm": 0.21463935839791717, "learning_rate": 0.0003264957560305947, "loss": 0.6121, "step": 1650 }, { "epoch": 0.6247871333964049, "grad_norm": 0.23794704200483563, "learning_rate": 0.00032592087094016395, "loss": 0.6222, "step": 1651 }, { "epoch": 0.6251655629139072, "grad_norm": 0.23632924418539067, "learning_rate": 0.0003253462476006101, "loss": 0.6087, "step": 1652 }, { "epoch": 0.6255439924314097, "grad_norm": 0.2197891146314327, "learning_rate": 0.00032477188687595507, "loss": 0.6313, "step": 1653 }, { "epoch": 0.625922421948912, "grad_norm": 0.22267127404638107, "learning_rate": 0.0003241977896298256, "loss": 0.6372, "step": 1654 }, { "epoch": 0.6263008514664143, "grad_norm": 0.21556387636193786, "learning_rate": 0.0003236239567254526, "loss": 0.6258, "step": 1655 }, { "epoch": 0.6266792809839168, "grad_norm": 0.22108165384369546, "learning_rate": 0.0003230503890256693, "loss": 0.6419, "step": 1656 }, { "epoch": 0.6270577105014191, "grad_norm": 0.22219785552163862, "learning_rate": 0.0003224770873929101, "loss": 0.6141, "step": 1657 }, { "epoch": 0.6274361400189215, "grad_norm": 0.2237857478903925, "learning_rate": 0.0003219040526892098, "loss": 0.6506, "step": 1658 }, { "epoch": 0.6278145695364239, "grad_norm": 0.2342800901887234, "learning_rate": 0.0003213312857762011, "loss": 0.613, "step": 1659 }, { "epoch": 0.6281929990539262, "grad_norm": 0.24707430056040944, "learning_rate": 0.00032075878751511446, "loss": 0.6055, "step": 1660 }, { "epoch": 0.6285714285714286, "grad_norm": 0.24304540259106244, "learning_rate": 0.0003201865587667765, "loss": 0.6196, "step": 1661 }, { "epoch": 0.6289498580889309, "grad_norm": 0.23886550347521365, "learning_rate": 0.0003196146003916084, "loss": 0.6089, "step": 1662 }, { "epoch": 0.6293282876064333, "grad_norm": 0.24886827309664625, "learning_rate": 0.00031904291324962475, "loss": 0.6134, "step": 1663 }, { "epoch": 0.6297067171239357, "grad_norm": 0.21391278481933448, "learning_rate": 0.0003184714982004325, "loss": 0.6142, "step": 1664 }, { "epoch": 0.630085146641438, "grad_norm": 0.2412882846883348, "learning_rate": 0.0003179003561032293, "loss": 0.6431, "step": 1665 }, { "epoch": 0.6304635761589404, "grad_norm": 0.25288605773968553, "learning_rate": 0.0003173294878168025, "loss": 0.6374, "step": 1666 }, { "epoch": 0.6308420056764428, "grad_norm": 0.21207537861603282, "learning_rate": 0.0003167588941995276, "loss": 0.6267, "step": 1667 }, { "epoch": 0.6312204351939451, "grad_norm": 0.21940468842657435, "learning_rate": 0.0003161885761093674, "loss": 0.6156, "step": 1668 }, { "epoch": 0.6315988647114474, "grad_norm": 0.23060104646317092, "learning_rate": 0.00031561853440386994, "loss": 0.6082, "step": 1669 }, { "epoch": 0.6319772942289499, "grad_norm": 0.23023640510381171, "learning_rate": 0.00031504876994016806, "loss": 0.6224, "step": 1670 }, { "epoch": 0.6323557237464522, "grad_norm": 0.2074458990175312, "learning_rate": 0.00031447928357497757, "loss": 0.6265, "step": 1671 }, { "epoch": 0.6327341532639545, "grad_norm": 0.20004534803309934, "learning_rate": 0.0003139100761645961, "loss": 0.6224, "step": 1672 }, { "epoch": 0.633112582781457, "grad_norm": 0.19985923469985287, "learning_rate": 0.00031334114856490205, "loss": 0.6297, "step": 1673 }, { "epoch": 0.6334910122989593, "grad_norm": 0.21381677181388895, "learning_rate": 0.00031277250163135273, "loss": 0.6211, "step": 1674 }, { "epoch": 0.6338694418164617, "grad_norm": 0.229452608632022, "learning_rate": 0.0003122041362189838, "loss": 0.6226, "step": 1675 }, { "epoch": 0.6342478713339641, "grad_norm": 0.2300249206606058, "learning_rate": 0.0003116360531824074, "loss": 0.6011, "step": 1676 }, { "epoch": 0.6346263008514664, "grad_norm": 0.23394517426893552, "learning_rate": 0.0003110682533758111, "loss": 0.6111, "step": 1677 }, { "epoch": 0.6350047303689688, "grad_norm": 0.23635202082106377, "learning_rate": 0.00031050073765295674, "loss": 0.6092, "step": 1678 }, { "epoch": 0.6353831598864711, "grad_norm": 0.18930971479145953, "learning_rate": 0.0003099335068671787, "loss": 0.5991, "step": 1679 }, { "epoch": 0.6357615894039735, "grad_norm": 0.21678634136831526, "learning_rate": 0.0003093665618713831, "loss": 0.621, "step": 1680 }, { "epoch": 0.6361400189214759, "grad_norm": 0.2479380721914287, "learning_rate": 0.0003087999035180465, "loss": 0.6246, "step": 1681 }, { "epoch": 0.6365184484389782, "grad_norm": 0.21559289022833536, "learning_rate": 0.0003082335326592142, "loss": 0.6012, "step": 1682 }, { "epoch": 0.6368968779564806, "grad_norm": 0.2013471080932793, "learning_rate": 0.00030766745014649936, "loss": 0.6162, "step": 1683 }, { "epoch": 0.637275307473983, "grad_norm": 0.2600324593969625, "learning_rate": 0.00030710165683108155, "loss": 0.6377, "step": 1684 }, { "epoch": 0.6376537369914853, "grad_norm": 0.19188264999298896, "learning_rate": 0.00030653615356370537, "loss": 0.644, "step": 1685 }, { "epoch": 0.6380321665089878, "grad_norm": 0.23710584747857866, "learning_rate": 0.0003059709411946795, "loss": 0.6141, "step": 1686 }, { "epoch": 0.6384105960264901, "grad_norm": 0.2255833943541545, "learning_rate": 0.000305406020573875, "loss": 0.6018, "step": 1687 }, { "epoch": 0.6387890255439924, "grad_norm": 0.24146610925236828, "learning_rate": 0.00030484139255072454, "loss": 0.6131, "step": 1688 }, { "epoch": 0.6391674550614947, "grad_norm": 0.25252479931882205, "learning_rate": 0.00030427705797422046, "loss": 0.603, "step": 1689 }, { "epoch": 0.6395458845789972, "grad_norm": 0.24444481019618267, "learning_rate": 0.00030371301769291413, "loss": 0.6524, "step": 1690 }, { "epoch": 0.6399243140964995, "grad_norm": 0.19809549287931077, "learning_rate": 0.0003031492725549143, "loss": 0.6374, "step": 1691 }, { "epoch": 0.6403027436140019, "grad_norm": 0.22173062550125372, "learning_rate": 0.000302585823407886, "loss": 0.6144, "step": 1692 }, { "epoch": 0.6406811731315043, "grad_norm": 0.2461319469902248, "learning_rate": 0.0003020226710990492, "loss": 0.6233, "step": 1693 }, { "epoch": 0.6410596026490066, "grad_norm": 0.22048178688991987, "learning_rate": 0.0003014598164751774, "loss": 0.6211, "step": 1694 }, { "epoch": 0.641438032166509, "grad_norm": 0.2207671469269376, "learning_rate": 0.00030089726038259667, "loss": 0.5861, "step": 1695 }, { "epoch": 0.6418164616840114, "grad_norm": 0.22623323487767055, "learning_rate": 0.0003003350036671841, "loss": 0.6328, "step": 1696 }, { "epoch": 0.6421948912015137, "grad_norm": 0.23758177416080328, "learning_rate": 0.0002997730471743667, "loss": 0.6398, "step": 1697 }, { "epoch": 0.6425733207190161, "grad_norm": 0.20432804628425955, "learning_rate": 0.0002992113917491199, "loss": 0.5961, "step": 1698 }, { "epoch": 0.6429517502365184, "grad_norm": 0.21117793614660907, "learning_rate": 0.0002986500382359667, "loss": 0.6375, "step": 1699 }, { "epoch": 0.6433301797540208, "grad_norm": 0.22887864581110154, "learning_rate": 0.00029808898747897577, "loss": 0.6127, "step": 1700 }, { "epoch": 0.6437086092715232, "grad_norm": 0.17976633705883424, "learning_rate": 0.00029752824032176084, "loss": 0.6315, "step": 1701 }, { "epoch": 0.6440870387890255, "grad_norm": 0.20129764662208827, "learning_rate": 0.00029696779760747906, "loss": 0.6033, "step": 1702 }, { "epoch": 0.644465468306528, "grad_norm": 0.2127372644279346, "learning_rate": 0.00029640766017882973, "loss": 0.6343, "step": 1703 }, { "epoch": 0.6448438978240303, "grad_norm": 0.20958591947009075, "learning_rate": 0.0002958478288780533, "loss": 0.6113, "step": 1704 }, { "epoch": 0.6452223273415326, "grad_norm": 0.24993819724536814, "learning_rate": 0.00029528830454692966, "loss": 0.6301, "step": 1705 }, { "epoch": 0.645600756859035, "grad_norm": 0.20065455911182645, "learning_rate": 0.00029472908802677747, "loss": 0.6215, "step": 1706 }, { "epoch": 0.6459791863765374, "grad_norm": 0.22210212821825337, "learning_rate": 0.0002941701801584521, "loss": 0.6166, "step": 1707 }, { "epoch": 0.6463576158940397, "grad_norm": 0.26953259066925545, "learning_rate": 0.00029361158178234527, "loss": 0.6082, "step": 1708 }, { "epoch": 0.6467360454115421, "grad_norm": 0.2516518647349359, "learning_rate": 0.00029305329373838305, "loss": 0.614, "step": 1709 }, { "epoch": 0.6471144749290445, "grad_norm": 0.2954156039657812, "learning_rate": 0.00029249531686602505, "loss": 0.6311, "step": 1710 }, { "epoch": 0.6474929044465468, "grad_norm": 0.2940999888280343, "learning_rate": 0.0002919376520042628, "loss": 0.5964, "step": 1711 }, { "epoch": 0.6478713339640492, "grad_norm": 0.20077182287505588, "learning_rate": 0.0002913802999916187, "loss": 0.6095, "step": 1712 }, { "epoch": 0.6482497634815516, "grad_norm": 0.22764031772722396, "learning_rate": 0.0002908232616661453, "loss": 0.5699, "step": 1713 }, { "epoch": 0.6486281929990539, "grad_norm": 0.23382717322281066, "learning_rate": 0.00029026653786542274, "loss": 0.6145, "step": 1714 }, { "epoch": 0.6490066225165563, "grad_norm": 0.21339275044588477, "learning_rate": 0.0002897101294265583, "loss": 0.612, "step": 1715 }, { "epoch": 0.6493850520340586, "grad_norm": 0.20025706385537026, "learning_rate": 0.0002891540371861856, "loss": 0.6306, "step": 1716 }, { "epoch": 0.649763481551561, "grad_norm": 0.20764819476173968, "learning_rate": 0.0002885982619804622, "loss": 0.638, "step": 1717 }, { "epoch": 0.6501419110690634, "grad_norm": 0.22340836568437042, "learning_rate": 0.0002880428046450697, "loss": 0.6277, "step": 1718 }, { "epoch": 0.6505203405865657, "grad_norm": 0.2129074230041397, "learning_rate": 0.0002874876660152108, "loss": 0.6191, "step": 1719 }, { "epoch": 0.6508987701040682, "grad_norm": 0.21345237671709993, "learning_rate": 0.0002869328469256099, "loss": 0.5966, "step": 1720 }, { "epoch": 0.6512771996215705, "grad_norm": 0.24095231836915057, "learning_rate": 0.0002863783482105101, "loss": 0.5979, "step": 1721 }, { "epoch": 0.6516556291390728, "grad_norm": 0.185854856974145, "learning_rate": 0.0002858241707036736, "loss": 0.6232, "step": 1722 }, { "epoch": 0.6520340586565753, "grad_norm": 0.21755900752730867, "learning_rate": 0.00028527031523837887, "loss": 0.6095, "step": 1723 }, { "epoch": 0.6524124881740776, "grad_norm": 0.22346821187098187, "learning_rate": 0.00028471678264742074, "loss": 0.6069, "step": 1724 }, { "epoch": 0.6527909176915799, "grad_norm": 0.21261676182056868, "learning_rate": 0.0002841635737631082, "loss": 0.6158, "step": 1725 }, { "epoch": 0.6531693472090823, "grad_norm": 0.21370443483087778, "learning_rate": 0.0002836106894172633, "loss": 0.6151, "step": 1726 }, { "epoch": 0.6535477767265847, "grad_norm": 0.21139029189593983, "learning_rate": 0.00028305813044122096, "loss": 0.5905, "step": 1727 }, { "epoch": 0.653926206244087, "grad_norm": 0.18774894241918977, "learning_rate": 0.0002825058976658258, "loss": 0.6072, "step": 1728 }, { "epoch": 0.6543046357615894, "grad_norm": 0.25249945380841093, "learning_rate": 0.0002819539919214329, "loss": 0.6151, "step": 1729 }, { "epoch": 0.6546830652790918, "grad_norm": 0.2408514239409098, "learning_rate": 0.0002814024140379048, "loss": 0.6347, "step": 1730 }, { "epoch": 0.6550614947965941, "grad_norm": 0.22345991822464956, "learning_rate": 0.00028085116484461174, "loss": 0.6595, "step": 1731 }, { "epoch": 0.6554399243140965, "grad_norm": 0.22617952271281852, "learning_rate": 0.0002803002451704291, "loss": 0.6208, "step": 1732 }, { "epoch": 0.6558183538315988, "grad_norm": 0.1783689551635747, "learning_rate": 0.0002797496558437375, "loss": 0.6136, "step": 1733 }, { "epoch": 0.6561967833491013, "grad_norm": 0.23299408046422998, "learning_rate": 0.00027919939769242017, "loss": 0.59, "step": 1734 }, { "epoch": 0.6565752128666036, "grad_norm": 0.20557755102541508, "learning_rate": 0.00027864947154386245, "loss": 0.6402, "step": 1735 }, { "epoch": 0.6569536423841059, "grad_norm": 0.2194979856274833, "learning_rate": 0.00027809987822495117, "loss": 0.6247, "step": 1736 }, { "epoch": 0.6573320719016084, "grad_norm": 0.21494769269597117, "learning_rate": 0.00027755061856207173, "loss": 0.6573, "step": 1737 }, { "epoch": 0.6577105014191107, "grad_norm": 0.20055394497911858, "learning_rate": 0.0002770016933811087, "loss": 0.6147, "step": 1738 }, { "epoch": 0.658088930936613, "grad_norm": 0.21176215849404653, "learning_rate": 0.000276453103507443, "loss": 0.6066, "step": 1739 }, { "epoch": 0.6584673604541155, "grad_norm": 0.22323519745009907, "learning_rate": 0.000275904849765952, "loss": 0.6309, "step": 1740 }, { "epoch": 0.6588457899716178, "grad_norm": 0.17431710080133975, "learning_rate": 0.00027535693298100694, "loss": 0.6082, "step": 1741 }, { "epoch": 0.6592242194891201, "grad_norm": 0.21499663895097912, "learning_rate": 0.00027480935397647323, "loss": 0.6243, "step": 1742 }, { "epoch": 0.6596026490066225, "grad_norm": 0.25549367438938375, "learning_rate": 0.00027426211357570765, "loss": 0.6337, "step": 1743 }, { "epoch": 0.6599810785241249, "grad_norm": 0.1997376397513699, "learning_rate": 0.0002737152126015584, "loss": 0.5999, "step": 1744 }, { "epoch": 0.6603595080416272, "grad_norm": 0.2480122249160886, "learning_rate": 0.00027316865187636296, "loss": 0.6331, "step": 1745 }, { "epoch": 0.6607379375591296, "grad_norm": 0.2193646141321645, "learning_rate": 0.0002726224322219473, "loss": 0.6235, "step": 1746 }, { "epoch": 0.661116367076632, "grad_norm": 0.21850043774916666, "learning_rate": 0.0002720765544596242, "loss": 0.6355, "step": 1747 }, { "epoch": 0.6614947965941343, "grad_norm": 0.20281836399835, "learning_rate": 0.00027153101941019333, "loss": 0.6204, "step": 1748 }, { "epoch": 0.6618732261116367, "grad_norm": 0.21239980060326472, "learning_rate": 0.00027098582789393804, "loss": 0.6092, "step": 1749 }, { "epoch": 0.6622516556291391, "grad_norm": 0.2556219965378237, "learning_rate": 0.0002704409807306254, "loss": 0.6018, "step": 1750 }, { "epoch": 0.6626300851466415, "grad_norm": 0.22759679954064438, "learning_rate": 0.0002698964787395052, "loss": 0.6003, "step": 1751 }, { "epoch": 0.6630085146641438, "grad_norm": 0.2346939936068412, "learning_rate": 0.0002693523227393075, "loss": 0.6363, "step": 1752 }, { "epoch": 0.6633869441816461, "grad_norm": 0.21489733981296183, "learning_rate": 0.00026880851354824277, "loss": 0.6105, "step": 1753 }, { "epoch": 0.6637653736991486, "grad_norm": 1.2510394542885221, "learning_rate": 0.0002682650519839992, "loss": 0.6038, "step": 1754 }, { "epoch": 0.6641438032166509, "grad_norm": 0.19579533976083197, "learning_rate": 0.0002677219388637434, "loss": 0.6191, "step": 1755 }, { "epoch": 0.6645222327341532, "grad_norm": 0.2174152113633183, "learning_rate": 0.0002671791750041167, "loss": 0.5967, "step": 1756 }, { "epoch": 0.6649006622516557, "grad_norm": 0.24747449059469126, "learning_rate": 0.0002666367612212367, "loss": 0.6213, "step": 1757 }, { "epoch": 0.665279091769158, "grad_norm": 0.19967228556894595, "learning_rate": 0.0002660946983306933, "loss": 0.6142, "step": 1758 }, { "epoch": 0.6656575212866603, "grad_norm": 0.2285558551034163, "learning_rate": 0.00026555298714754985, "loss": 0.6442, "step": 1759 }, { "epoch": 0.6660359508041628, "grad_norm": 0.217149139007901, "learning_rate": 0.0002650116284863402, "loss": 0.6189, "step": 1760 }, { "epoch": 0.6664143803216651, "grad_norm": 0.23169254023562386, "learning_rate": 0.00026447062316106804, "loss": 0.6284, "step": 1761 }, { "epoch": 0.6667928098391674, "grad_norm": 0.2153087698553833, "learning_rate": 0.0002639299719852066, "loss": 0.6019, "step": 1762 }, { "epoch": 0.6671712393566698, "grad_norm": 0.335781049892531, "learning_rate": 0.0002633896757716956, "loss": 0.6151, "step": 1763 }, { "epoch": 0.6675496688741722, "grad_norm": 0.2483556432644637, "learning_rate": 0.00026284973533294195, "loss": 0.6006, "step": 1764 }, { "epoch": 0.6679280983916746, "grad_norm": 0.23158964654357297, "learning_rate": 0.0002623101514808166, "loss": 0.5972, "step": 1765 }, { "epoch": 0.6683065279091769, "grad_norm": 0.20381822037821185, "learning_rate": 0.00026177092502665546, "loss": 0.613, "step": 1766 }, { "epoch": 0.6686849574266793, "grad_norm": 0.21313740487228847, "learning_rate": 0.00026123205678125593, "loss": 0.6083, "step": 1767 }, { "epoch": 0.6690633869441817, "grad_norm": 0.23722346607375241, "learning_rate": 0.00026069354755487773, "loss": 0.6139, "step": 1768 }, { "epoch": 0.669441816461684, "grad_norm": 0.2746182561010754, "learning_rate": 0.00026015539815724023, "loss": 0.6135, "step": 1769 }, { "epoch": 0.6698202459791863, "grad_norm": 0.22726657416131385, "learning_rate": 0.00025961760939752156, "loss": 0.6391, "step": 1770 }, { "epoch": 0.6701986754966888, "grad_norm": 0.2169770102425044, "learning_rate": 0.00025908018208435835, "loss": 0.6318, "step": 1771 }, { "epoch": 0.6705771050141911, "grad_norm": 0.22795441990993856, "learning_rate": 0.00025854311702584287, "loss": 0.6106, "step": 1772 }, { "epoch": 0.6709555345316934, "grad_norm": 0.21888937937539824, "learning_rate": 0.0002580064150295235, "loss": 0.6053, "step": 1773 }, { "epoch": 0.6713339640491959, "grad_norm": 0.2757407998956497, "learning_rate": 0.00025747007690240197, "loss": 0.6151, "step": 1774 }, { "epoch": 0.6717123935666982, "grad_norm": 0.2276468799210102, "learning_rate": 0.0002569341034509335, "loss": 0.6263, "step": 1775 }, { "epoch": 0.6720908230842005, "grad_norm": 0.21102120363501323, "learning_rate": 0.00025639849548102445, "loss": 0.605, "step": 1776 }, { "epoch": 0.672469252601703, "grad_norm": 0.2121542610882109, "learning_rate": 0.0002558632537980321, "loss": 0.5939, "step": 1777 }, { "epoch": 0.6728476821192053, "grad_norm": 0.25508754005990736, "learning_rate": 0.00025532837920676254, "loss": 0.5948, "step": 1778 }, { "epoch": 0.6732261116367076, "grad_norm": 0.19653614113074733, "learning_rate": 0.0002547938725114705, "loss": 0.6255, "step": 1779 }, { "epoch": 0.67360454115421, "grad_norm": 0.22110103082968663, "learning_rate": 0.00025425973451585667, "loss": 0.6274, "step": 1780 }, { "epoch": 0.6739829706717124, "grad_norm": 0.18577654828602105, "learning_rate": 0.00025372596602306785, "loss": 0.5893, "step": 1781 }, { "epoch": 0.6743614001892148, "grad_norm": 0.23595005721976492, "learning_rate": 0.0002531925678356956, "loss": 0.5935, "step": 1782 }, { "epoch": 0.6747398297067171, "grad_norm": 0.26699136874806595, "learning_rate": 0.0002526595407557738, "loss": 0.6086, "step": 1783 }, { "epoch": 0.6751182592242195, "grad_norm": 0.2449182029461018, "learning_rate": 0.0002521268855847792, "loss": 0.6249, "step": 1784 }, { "epoch": 0.6754966887417219, "grad_norm": 0.2129498932957298, "learning_rate": 0.00025159460312362857, "loss": 0.5883, "step": 1785 }, { "epoch": 0.6758751182592242, "grad_norm": 0.23880118567832925, "learning_rate": 0.00025106269417267906, "loss": 0.6238, "step": 1786 }, { "epoch": 0.6762535477767266, "grad_norm": 0.22236706542429308, "learning_rate": 0.00025053115953172536, "loss": 0.6123, "step": 1787 }, { "epoch": 0.676631977294229, "grad_norm": 0.20797139954974966, "learning_rate": 0.0002500000000000001, "loss": 0.61, "step": 1788 }, { "epoch": 0.6770104068117313, "grad_norm": 0.21142134100147608, "learning_rate": 0.00024946921637617115, "loss": 0.6074, "step": 1789 }, { "epoch": 0.6773888363292336, "grad_norm": 0.21959631556354042, "learning_rate": 0.00024893880945834197, "loss": 0.6089, "step": 1790 }, { "epoch": 0.6777672658467361, "grad_norm": 0.1827943727684127, "learning_rate": 0.00024840878004404887, "loss": 0.623, "step": 1791 }, { "epoch": 0.6781456953642384, "grad_norm": 0.22491414498405132, "learning_rate": 0.00024787912893026073, "loss": 0.6027, "step": 1792 }, { "epoch": 0.6785241248817407, "grad_norm": 0.18955383376936397, "learning_rate": 0.00024734985691337804, "loss": 0.6123, "step": 1793 }, { "epoch": 0.6789025543992432, "grad_norm": 0.20147203071115932, "learning_rate": 0.0002468209647892305, "loss": 0.5806, "step": 1794 }, { "epoch": 0.6792809839167455, "grad_norm": 0.2092659025495898, "learning_rate": 0.00024629245335307736, "loss": 0.6036, "step": 1795 }, { "epoch": 0.6796594134342478, "grad_norm": 0.20804210612969257, "learning_rate": 0.0002457643233996049, "loss": 0.629, "step": 1796 }, { "epoch": 0.6800378429517502, "grad_norm": 0.21702342181990322, "learning_rate": 0.00024523657572292617, "loss": 0.6273, "step": 1797 }, { "epoch": 0.6804162724692526, "grad_norm": 0.22609120710902997, "learning_rate": 0.00024470921111657904, "loss": 0.6045, "step": 1798 }, { "epoch": 0.680794701986755, "grad_norm": 0.22001402806131276, "learning_rate": 0.0002441822303735259, "loss": 0.6314, "step": 1799 }, { "epoch": 0.6811731315042573, "grad_norm": 0.21376660564123226, "learning_rate": 0.0002436556342861514, "loss": 0.616, "step": 1800 }, { "epoch": 0.6815515610217597, "grad_norm": 0.21585578972085653, "learning_rate": 0.00024312942364626195, "loss": 0.6353, "step": 1801 }, { "epoch": 0.6819299905392621, "grad_norm": 0.22774118442726057, "learning_rate": 0.0002426035992450848, "loss": 0.5924, "step": 1802 }, { "epoch": 0.6823084200567644, "grad_norm": 0.2464888746089688, "learning_rate": 0.00024207816187326575, "loss": 0.6124, "step": 1803 }, { "epoch": 0.6826868495742668, "grad_norm": 0.21351879954987787, "learning_rate": 0.00024155311232086947, "loss": 0.5989, "step": 1804 }, { "epoch": 0.6830652790917692, "grad_norm": 0.2351296831249681, "learning_rate": 0.00024102845137737668, "loss": 0.6378, "step": 1805 }, { "epoch": 0.6834437086092715, "grad_norm": 0.21983727950583248, "learning_rate": 0.00024050417983168437, "loss": 0.592, "step": 1806 }, { "epoch": 0.6838221381267738, "grad_norm": 0.20285856673035493, "learning_rate": 0.00023998029847210363, "loss": 0.6083, "step": 1807 }, { "epoch": 0.6842005676442763, "grad_norm": 0.22483284655411775, "learning_rate": 0.00023945680808635923, "loss": 0.6024, "step": 1808 }, { "epoch": 0.6845789971617786, "grad_norm": 0.2533918892849513, "learning_rate": 0.00023893370946158755, "loss": 0.6404, "step": 1809 }, { "epoch": 0.684957426679281, "grad_norm": 0.28078199230471707, "learning_rate": 0.00023841100338433657, "loss": 0.6012, "step": 1810 }, { "epoch": 0.6853358561967834, "grad_norm": 0.22175490997817857, "learning_rate": 0.00023788869064056352, "loss": 0.6277, "step": 1811 }, { "epoch": 0.6857142857142857, "grad_norm": 0.21208431945963732, "learning_rate": 0.00023736677201563401, "loss": 0.5785, "step": 1812 }, { "epoch": 0.686092715231788, "grad_norm": 0.2262721503200429, "learning_rate": 0.0002368452482943219, "loss": 0.6181, "step": 1813 }, { "epoch": 0.6864711447492905, "grad_norm": 0.22076115647039934, "learning_rate": 0.00023632412026080625, "loss": 0.6412, "step": 1814 }, { "epoch": 0.6868495742667928, "grad_norm": 0.2074005683544891, "learning_rate": 0.00023580338869867208, "loss": 0.5749, "step": 1815 }, { "epoch": 0.6872280037842952, "grad_norm": 0.21993178214868161, "learning_rate": 0.00023528305439090742, "loss": 0.6324, "step": 1816 }, { "epoch": 0.6876064333017975, "grad_norm": 0.20296223244543402, "learning_rate": 0.0002347631181199037, "loss": 0.5984, "step": 1817 }, { "epoch": 0.6879848628192999, "grad_norm": 0.24744374999082722, "learning_rate": 0.00023424358066745317, "loss": 0.6098, "step": 1818 }, { "epoch": 0.6883632923368023, "grad_norm": 0.20838247951340377, "learning_rate": 0.00023372444281474913, "loss": 0.5908, "step": 1819 }, { "epoch": 0.6887417218543046, "grad_norm": 0.20367236846641634, "learning_rate": 0.00023320570534238332, "loss": 0.6049, "step": 1820 }, { "epoch": 0.689120151371807, "grad_norm": 0.2389248146044435, "learning_rate": 0.0002326873690303461, "loss": 0.6237, "step": 1821 }, { "epoch": 0.6894985808893094, "grad_norm": 0.20188069695739777, "learning_rate": 0.00023216943465802415, "loss": 0.609, "step": 1822 }, { "epoch": 0.6898770104068117, "grad_norm": 0.2459526565806118, "learning_rate": 0.0002316519030041998, "loss": 0.6361, "step": 1823 }, { "epoch": 0.6902554399243142, "grad_norm": 0.2051889755749057, "learning_rate": 0.00023113477484705032, "loss": 0.5893, "step": 1824 }, { "epoch": 0.6906338694418165, "grad_norm": 0.23179505075238938, "learning_rate": 0.00023061805096414562, "loss": 0.6176, "step": 1825 }, { "epoch": 0.6910122989593188, "grad_norm": 0.2211648449307692, "learning_rate": 0.0002301017321324484, "loss": 0.6131, "step": 1826 }, { "epoch": 0.6913907284768211, "grad_norm": 0.2310635762508175, "learning_rate": 0.0002295858191283115, "loss": 0.605, "step": 1827 }, { "epoch": 0.6917691579943236, "grad_norm": 0.22890927569892636, "learning_rate": 0.00022907031272747852, "loss": 0.599, "step": 1828 }, { "epoch": 0.6921475875118259, "grad_norm": 0.20895996129405542, "learning_rate": 0.00022855521370508075, "loss": 0.5957, "step": 1829 }, { "epoch": 0.6925260170293283, "grad_norm": 0.21038261474817668, "learning_rate": 0.0002280405228356377, "loss": 0.5886, "step": 1830 }, { "epoch": 0.6929044465468307, "grad_norm": 0.218331609540485, "learning_rate": 0.0002275262408930547, "loss": 0.588, "step": 1831 }, { "epoch": 0.693282876064333, "grad_norm": 0.24610752491016138, "learning_rate": 0.00022701236865062198, "loss": 0.6249, "step": 1832 }, { "epoch": 0.6936613055818354, "grad_norm": 0.21253940062731738, "learning_rate": 0.0002264989068810146, "loss": 0.6169, "step": 1833 }, { "epoch": 0.6940397350993377, "grad_norm": 0.20886390993091125, "learning_rate": 0.00022598585635628948, "loss": 0.6076, "step": 1834 }, { "epoch": 0.6944181646168401, "grad_norm": 0.21073741606931226, "learning_rate": 0.00022547321784788593, "loss": 0.5982, "step": 1835 }, { "epoch": 0.6947965941343425, "grad_norm": 0.207527178455146, "learning_rate": 0.00022496099212662309, "loss": 0.6232, "step": 1836 }, { "epoch": 0.6951750236518448, "grad_norm": 0.2498952974173516, "learning_rate": 0.00022444917996270003, "loss": 0.6259, "step": 1837 }, { "epoch": 0.6955534531693472, "grad_norm": 0.5794330439457429, "learning_rate": 0.00022393778212569337, "loss": 0.5994, "step": 1838 }, { "epoch": 0.6959318826868496, "grad_norm": 0.20584707060493032, "learning_rate": 0.00022342679938455745, "loss": 0.6183, "step": 1839 }, { "epoch": 0.6963103122043519, "grad_norm": 0.2245653109243164, "learning_rate": 0.0002229162325076216, "loss": 0.5791, "step": 1840 }, { "epoch": 0.6966887417218544, "grad_norm": 0.2152893819979972, "learning_rate": 0.00022240608226259078, "loss": 0.5881, "step": 1841 }, { "epoch": 0.6970671712393567, "grad_norm": 0.23558724343721113, "learning_rate": 0.00022189634941654273, "loss": 0.6005, "step": 1842 }, { "epoch": 0.697445600756859, "grad_norm": 0.2340368533630761, "learning_rate": 0.0002213870347359277, "loss": 0.5955, "step": 1843 }, { "epoch": 0.6978240302743614, "grad_norm": 0.24196274388143296, "learning_rate": 0.00022087813898656773, "loss": 0.6211, "step": 1844 }, { "epoch": 0.6982024597918638, "grad_norm": 0.23351971017449047, "learning_rate": 0.00022036966293365417, "loss": 0.596, "step": 1845 }, { "epoch": 0.6985808893093661, "grad_norm": 0.2276475017649733, "learning_rate": 0.00021986160734174803, "loss": 0.6268, "step": 1846 }, { "epoch": 0.6989593188268685, "grad_norm": 0.24385939612980936, "learning_rate": 0.00021935397297477734, "loss": 0.6335, "step": 1847 }, { "epoch": 0.6993377483443709, "grad_norm": 0.23520659101148925, "learning_rate": 0.0002188467605960376, "loss": 0.636, "step": 1848 }, { "epoch": 0.6997161778618732, "grad_norm": 0.20023492560755604, "learning_rate": 0.00021833997096818897, "loss": 0.6213, "step": 1849 }, { "epoch": 0.7000946073793756, "grad_norm": 0.21851544918561946, "learning_rate": 0.0002178336048532567, "loss": 0.6363, "step": 1850 }, { "epoch": 0.700473036896878, "grad_norm": 0.24438234951109208, "learning_rate": 0.00021732766301262867, "loss": 0.6105, "step": 1851 }, { "epoch": 0.7008514664143803, "grad_norm": 0.21240497455940788, "learning_rate": 0.00021682214620705493, "loss": 0.611, "step": 1852 }, { "epoch": 0.7012298959318827, "grad_norm": 0.22670093779258454, "learning_rate": 0.00021631705519664673, "loss": 0.6078, "step": 1853 }, { "epoch": 0.701608325449385, "grad_norm": 0.22118544412623817, "learning_rate": 0.00021581239074087467, "loss": 0.6009, "step": 1854 }, { "epoch": 0.7019867549668874, "grad_norm": 0.22358224955158781, "learning_rate": 0.00021530815359856837, "loss": 0.6407, "step": 1855 }, { "epoch": 0.7023651844843898, "grad_norm": 0.25491326979558, "learning_rate": 0.00021480434452791447, "loss": 0.6065, "step": 1856 }, { "epoch": 0.7027436140018921, "grad_norm": 0.26000211462664813, "learning_rate": 0.00021430096428645652, "loss": 0.5971, "step": 1857 }, { "epoch": 0.7031220435193946, "grad_norm": 0.2865516799498107, "learning_rate": 0.00021379801363109258, "loss": 0.6148, "step": 1858 }, { "epoch": 0.7035004730368969, "grad_norm": 0.2250045948089222, "learning_rate": 0.00021329549331807553, "loss": 0.6122, "step": 1859 }, { "epoch": 0.7038789025543992, "grad_norm": 0.19715472210412838, "learning_rate": 0.0002127934041030104, "loss": 0.5991, "step": 1860 }, { "epoch": 0.7042573320719016, "grad_norm": 0.2053616970087423, "learning_rate": 0.00021229174674085478, "loss": 0.6301, "step": 1861 }, { "epoch": 0.704635761589404, "grad_norm": 0.27301597802420924, "learning_rate": 0.0002117905219859163, "loss": 0.6194, "step": 1862 }, { "epoch": 0.7050141911069063, "grad_norm": 0.21847992641915492, "learning_rate": 0.00021128973059185224, "loss": 0.6161, "step": 1863 }, { "epoch": 0.7053926206244087, "grad_norm": 0.21702489803787717, "learning_rate": 0.00021078937331166865, "loss": 0.5836, "step": 1864 }, { "epoch": 0.7057710501419111, "grad_norm": 0.19675102720513543, "learning_rate": 0.00021028945089771818, "loss": 0.6166, "step": 1865 }, { "epoch": 0.7061494796594134, "grad_norm": 0.21149420567053676, "learning_rate": 0.00020978996410170032, "loss": 0.6031, "step": 1866 }, { "epoch": 0.7065279091769158, "grad_norm": 0.22059751194843874, "learning_rate": 0.00020929091367465885, "loss": 0.6162, "step": 1867 }, { "epoch": 0.7069063386944182, "grad_norm": 0.27821819372806, "learning_rate": 0.00020879230036698217, "loss": 0.5878, "step": 1868 }, { "epoch": 0.7072847682119205, "grad_norm": 0.23478399195365338, "learning_rate": 0.00020829412492840056, "loss": 0.5979, "step": 1869 }, { "epoch": 0.7076631977294229, "grad_norm": 0.2083071314706213, "learning_rate": 0.00020779638810798667, "loss": 0.6075, "step": 1870 }, { "epoch": 0.7080416272469252, "grad_norm": 0.23946065284134857, "learning_rate": 0.00020729909065415308, "loss": 0.6372, "step": 1871 }, { "epoch": 0.7084200567644277, "grad_norm": 0.24583773693219355, "learning_rate": 0.0002068022333146522, "loss": 0.6098, "step": 1872 }, { "epoch": 0.70879848628193, "grad_norm": 0.22269086401403523, "learning_rate": 0.0002063058168365743, "loss": 0.6237, "step": 1873 }, { "epoch": 0.7091769157994323, "grad_norm": 0.19460127079711453, "learning_rate": 0.0002058098419663466, "loss": 0.5952, "step": 1874 }, { "epoch": 0.7095553453169348, "grad_norm": 0.20088143354058394, "learning_rate": 0.00020531430944973307, "loss": 0.6098, "step": 1875 }, { "epoch": 0.7099337748344371, "grad_norm": 0.2102815708186865, "learning_rate": 0.00020481922003183161, "loss": 0.6267, "step": 1876 }, { "epoch": 0.7103122043519394, "grad_norm": 0.2495875532613986, "learning_rate": 0.0002043245744570747, "loss": 0.6083, "step": 1877 }, { "epoch": 0.7106906338694419, "grad_norm": 0.21453542259364947, "learning_rate": 0.00020383037346922666, "loss": 0.6147, "step": 1878 }, { "epoch": 0.7110690633869442, "grad_norm": 0.24503043717963593, "learning_rate": 0.00020333661781138406, "loss": 0.6076, "step": 1879 }, { "epoch": 0.7114474929044465, "grad_norm": 0.23766920562282354, "learning_rate": 0.00020284330822597325, "loss": 0.619, "step": 1880 }, { "epoch": 0.7118259224219489, "grad_norm": 0.20942845800809237, "learning_rate": 0.00020235044545475036, "loss": 0.5848, "step": 1881 }, { "epoch": 0.7122043519394513, "grad_norm": 0.2087332205681581, "learning_rate": 0.00020185803023879917, "loss": 0.6149, "step": 1882 }, { "epoch": 0.7125827814569536, "grad_norm": 0.2264471582610914, "learning_rate": 0.00020136606331853075, "loss": 0.6223, "step": 1883 }, { "epoch": 0.712961210974456, "grad_norm": 0.2125596351952046, "learning_rate": 0.00020087454543368238, "loss": 0.5973, "step": 1884 }, { "epoch": 0.7133396404919584, "grad_norm": 0.1961572498686818, "learning_rate": 0.00020038347732331547, "loss": 0.6129, "step": 1885 }, { "epoch": 0.7137180700094607, "grad_norm": 0.20508586323988767, "learning_rate": 0.000199892859725816, "loss": 0.6118, "step": 1886 }, { "epoch": 0.7140964995269631, "grad_norm": 0.19540315431539937, "learning_rate": 0.0001994026933788916, "loss": 0.6131, "step": 1887 }, { "epoch": 0.7144749290444655, "grad_norm": 0.2137635559111273, "learning_rate": 0.00019891297901957234, "loss": 0.6079, "step": 1888 }, { "epoch": 0.7148533585619679, "grad_norm": 0.19502380970028205, "learning_rate": 0.0001984237173842078, "loss": 0.5956, "step": 1889 }, { "epoch": 0.7152317880794702, "grad_norm": 0.21386898161949908, "learning_rate": 0.00019793490920846758, "loss": 0.6149, "step": 1890 }, { "epoch": 0.7156102175969725, "grad_norm": 0.2126602361110692, "learning_rate": 0.00019744655522733874, "loss": 0.6267, "step": 1891 }, { "epoch": 0.715988647114475, "grad_norm": 0.2606269067241711, "learning_rate": 0.00019695865617512615, "loss": 0.6062, "step": 1892 }, { "epoch": 0.7163670766319773, "grad_norm": 0.22727008196255774, "learning_rate": 0.00019647121278544994, "loss": 0.6155, "step": 1893 }, { "epoch": 0.7167455061494796, "grad_norm": 0.22803130544615013, "learning_rate": 0.00019598422579124536, "loss": 0.6193, "step": 1894 }, { "epoch": 0.7171239356669821, "grad_norm": 0.2301636470530538, "learning_rate": 0.00019549769592476168, "loss": 0.606, "step": 1895 }, { "epoch": 0.7175023651844844, "grad_norm": 0.19679348219202927, "learning_rate": 0.00019501162391756028, "loss": 0.6316, "step": 1896 }, { "epoch": 0.7178807947019867, "grad_norm": 0.2436555468514994, "learning_rate": 0.00019452601050051472, "loss": 0.6151, "step": 1897 }, { "epoch": 0.7182592242194891, "grad_norm": 0.19243878552390592, "learning_rate": 0.0001940408564038083, "loss": 0.6211, "step": 1898 }, { "epoch": 0.7186376537369915, "grad_norm": 0.21316760532258483, "learning_rate": 0.0001935561623569344, "loss": 0.594, "step": 1899 }, { "epoch": 0.7190160832544938, "grad_norm": 0.19546522844772585, "learning_rate": 0.00019307192908869398, "loss": 0.6161, "step": 1900 }, { "epoch": 0.7193945127719962, "grad_norm": 0.2391119531830121, "learning_rate": 0.0001925881573271958, "loss": 0.5823, "step": 1901 }, { "epoch": 0.7197729422894986, "grad_norm": 0.23318905396558443, "learning_rate": 0.00019210484779985404, "loss": 0.6035, "step": 1902 }, { "epoch": 0.720151371807001, "grad_norm": 0.2307442457095354, "learning_rate": 0.00019162200123338852, "loss": 0.6142, "step": 1903 }, { "epoch": 0.7205298013245033, "grad_norm": 0.21086047673653097, "learning_rate": 0.00019113961835382233, "loss": 0.5985, "step": 1904 }, { "epoch": 0.7209082308420057, "grad_norm": 0.18997345092649898, "learning_rate": 0.0001906576998864815, "loss": 0.6068, "step": 1905 }, { "epoch": 0.7212866603595081, "grad_norm": 0.2048330386552587, "learning_rate": 0.00019017624655599425, "loss": 0.6176, "step": 1906 }, { "epoch": 0.7216650898770104, "grad_norm": 0.25918869255882526, "learning_rate": 0.0001896952590862886, "loss": 0.5957, "step": 1907 }, { "epoch": 0.7220435193945127, "grad_norm": 0.21775306298437105, "learning_rate": 0.00018921473820059282, "loss": 0.6085, "step": 1908 }, { "epoch": 0.7224219489120152, "grad_norm": 0.19059630709595096, "learning_rate": 0.00018873468462143306, "loss": 0.6357, "step": 1909 }, { "epoch": 0.7228003784295175, "grad_norm": 0.25454048758195336, "learning_rate": 0.00018825509907063325, "loss": 0.6037, "step": 1910 }, { "epoch": 0.7231788079470198, "grad_norm": 0.21697082929454128, "learning_rate": 0.00018777598226931314, "loss": 0.6033, "step": 1911 }, { "epoch": 0.7235572374645223, "grad_norm": 0.8523609023287047, "learning_rate": 0.0001872973349378882, "loss": 0.6325, "step": 1912 }, { "epoch": 0.7239356669820246, "grad_norm": 0.22625943994060282, "learning_rate": 0.00018681915779606751, "loss": 0.6004, "step": 1913 }, { "epoch": 0.7243140964995269, "grad_norm": 0.25080049030972423, "learning_rate": 0.0001863414515628531, "loss": 0.5928, "step": 1914 }, { "epoch": 0.7246925260170294, "grad_norm": 0.21428448103368192, "learning_rate": 0.00018586421695653953, "loss": 0.6179, "step": 1915 }, { "epoch": 0.7250709555345317, "grad_norm": 0.2195415820205283, "learning_rate": 0.00018538745469471153, "loss": 0.6137, "step": 1916 }, { "epoch": 0.725449385052034, "grad_norm": 0.21496122789649266, "learning_rate": 0.00018491116549424413, "loss": 0.6378, "step": 1917 }, { "epoch": 0.7258278145695364, "grad_norm": 0.21431297280723058, "learning_rate": 0.00018443535007130058, "loss": 0.6184, "step": 1918 }, { "epoch": 0.7262062440870388, "grad_norm": 0.21411369435515468, "learning_rate": 0.00018396000914133227, "loss": 0.6125, "step": 1919 }, { "epoch": 0.7265846736045412, "grad_norm": 0.2110875796722878, "learning_rate": 0.00018348514341907652, "loss": 0.6008, "step": 1920 }, { "epoch": 0.7269631031220435, "grad_norm": 0.2165077274520421, "learning_rate": 0.00018301075361855674, "loss": 0.609, "step": 1921 }, { "epoch": 0.7273415326395459, "grad_norm": 0.2411119126520138, "learning_rate": 0.00018253684045308006, "loss": 0.6179, "step": 1922 }, { "epoch": 0.7277199621570483, "grad_norm": 0.2593950482169366, "learning_rate": 0.0001820634046352377, "loss": 0.6244, "step": 1923 }, { "epoch": 0.7280983916745506, "grad_norm": 0.2498189374782854, "learning_rate": 0.00018159044687690245, "loss": 0.6087, "step": 1924 }, { "epoch": 0.7284768211920529, "grad_norm": 0.23295375327971565, "learning_rate": 0.00018111796788922846, "loss": 0.5785, "step": 1925 }, { "epoch": 0.7288552507095554, "grad_norm": 0.24489195608988043, "learning_rate": 0.00018064596838265036, "loss": 0.6083, "step": 1926 }, { "epoch": 0.7292336802270577, "grad_norm": 0.25093695777083896, "learning_rate": 0.00018017444906688113, "loss": 0.6136, "step": 1927 }, { "epoch": 0.72961210974456, "grad_norm": 0.2486359817051185, "learning_rate": 0.00017970341065091244, "loss": 0.5741, "step": 1928 }, { "epoch": 0.7299905392620625, "grad_norm": 0.24059651379596367, "learning_rate": 0.0001792328538430123, "loss": 0.5807, "step": 1929 }, { "epoch": 0.7303689687795648, "grad_norm": 0.20567285495377668, "learning_rate": 0.00017876277935072505, "loss": 0.6166, "step": 1930 }, { "epoch": 0.7307473982970671, "grad_norm": 0.2134740167938921, "learning_rate": 0.00017829318788086922, "loss": 0.6233, "step": 1931 }, { "epoch": 0.7311258278145696, "grad_norm": 0.26856747683502746, "learning_rate": 0.0001778240801395377, "loss": 0.6298, "step": 1932 }, { "epoch": 0.7315042573320719, "grad_norm": 0.2206978079906495, "learning_rate": 0.0001773554568320956, "loss": 0.6194, "step": 1933 }, { "epoch": 0.7318826868495742, "grad_norm": 0.22863745091104964, "learning_rate": 0.00017688731866317948, "loss": 0.626, "step": 1934 }, { "epoch": 0.7322611163670766, "grad_norm": 0.2547167825539799, "learning_rate": 0.00017641966633669703, "loss": 0.6132, "step": 1935 }, { "epoch": 0.732639545884579, "grad_norm": 0.2193445742393667, "learning_rate": 0.00017595250055582473, "loss": 0.6352, "step": 1936 }, { "epoch": 0.7330179754020814, "grad_norm": 0.22053212318177273, "learning_rate": 0.00017548582202300817, "loss": 0.617, "step": 1937 }, { "epoch": 0.7333964049195837, "grad_norm": 0.23930814563937228, "learning_rate": 0.0001750196314399596, "loss": 0.5922, "step": 1938 }, { "epoch": 0.7337748344370861, "grad_norm": 0.2126589748423817, "learning_rate": 0.00017455392950765813, "loss": 0.6231, "step": 1939 }, { "epoch": 0.7341532639545885, "grad_norm": 0.20541885771043922, "learning_rate": 0.0001740887169263477, "loss": 0.5994, "step": 1940 }, { "epoch": 0.7345316934720908, "grad_norm": 0.21141314744094053, "learning_rate": 0.00017362399439553694, "loss": 0.6308, "step": 1941 }, { "epoch": 0.7349101229895932, "grad_norm": 0.27163770174346646, "learning_rate": 0.00017315976261399698, "loss": 0.6166, "step": 1942 }, { "epoch": 0.7352885525070956, "grad_norm": 0.2477426916583272, "learning_rate": 0.00017269602227976166, "loss": 0.5954, "step": 1943 }, { "epoch": 0.7356669820245979, "grad_norm": 0.23500164817678484, "learning_rate": 0.00017223277409012555, "loss": 0.6219, "step": 1944 }, { "epoch": 0.7360454115421002, "grad_norm": 0.22526585383091705, "learning_rate": 0.000171770018741643, "loss": 0.5994, "step": 1945 }, { "epoch": 0.7364238410596027, "grad_norm": 0.2242112985894698, "learning_rate": 0.0001713077569301279, "loss": 0.6047, "step": 1946 }, { "epoch": 0.736802270577105, "grad_norm": 0.2787567330945674, "learning_rate": 0.00017084598935065143, "loss": 0.587, "step": 1947 }, { "epoch": 0.7371807000946073, "grad_norm": 0.2390213401117646, "learning_rate": 0.00017038471669754223, "loss": 0.5996, "step": 1948 }, { "epoch": 0.7375591296121098, "grad_norm": 0.22445519172521586, "learning_rate": 0.00016992393966438408, "loss": 0.6363, "step": 1949 }, { "epoch": 0.7379375591296121, "grad_norm": 0.22658604466662283, "learning_rate": 0.00016946365894401623, "loss": 0.5725, "step": 1950 }, { "epoch": 0.7383159886471145, "grad_norm": 0.2534260390214938, "learning_rate": 0.00016900387522853093, "loss": 0.627, "step": 1951 }, { "epoch": 0.7386944181646169, "grad_norm": 0.2336917932987794, "learning_rate": 0.0001685445892092739, "loss": 0.6419, "step": 1952 }, { "epoch": 0.7390728476821192, "grad_norm": 0.20980742409241887, "learning_rate": 0.00016808580157684171, "loss": 0.5871, "step": 1953 }, { "epoch": 0.7394512771996216, "grad_norm": 0.25479445990598454, "learning_rate": 0.00016762751302108238, "loss": 0.6112, "step": 1954 }, { "epoch": 0.7398297067171239, "grad_norm": 0.2534516887244299, "learning_rate": 0.0001671697242310927, "loss": 0.6031, "step": 1955 }, { "epoch": 0.7402081362346263, "grad_norm": 0.22876583974747539, "learning_rate": 0.0001667124358952184, "loss": 0.6361, "step": 1956 }, { "epoch": 0.7405865657521287, "grad_norm": 0.219522474663357, "learning_rate": 0.0001662556487010528, "loss": 0.6246, "step": 1957 }, { "epoch": 0.740964995269631, "grad_norm": 0.27376090406883913, "learning_rate": 0.0001657993633354354, "loss": 0.6481, "step": 1958 }, { "epoch": 0.7413434247871334, "grad_norm": 0.21054439649379322, "learning_rate": 0.00016534358048445153, "loss": 0.6112, "step": 1959 }, { "epoch": 0.7417218543046358, "grad_norm": 0.22454629095106543, "learning_rate": 0.00016488830083343032, "loss": 0.6177, "step": 1960 }, { "epoch": 0.7421002838221381, "grad_norm": 0.2292143093061229, "learning_rate": 0.000164433525066945, "loss": 0.577, "step": 1961 }, { "epoch": 0.7424787133396404, "grad_norm": 0.21670526854505412, "learning_rate": 0.00016397925386881046, "loss": 0.6014, "step": 1962 }, { "epoch": 0.7428571428571429, "grad_norm": 0.2218307199081189, "learning_rate": 0.00016352548792208355, "loss": 0.6031, "step": 1963 }, { "epoch": 0.7432355723746452, "grad_norm": 0.2110925740195836, "learning_rate": 0.0001630722279090609, "loss": 0.6291, "step": 1964 }, { "epoch": 0.7436140018921475, "grad_norm": 0.2660858140121537, "learning_rate": 0.0001626194745112784, "loss": 0.5994, "step": 1965 }, { "epoch": 0.74399243140965, "grad_norm": 0.2347028823007129, "learning_rate": 0.00016216722840951077, "loss": 0.5772, "step": 1966 }, { "epoch": 0.7443708609271523, "grad_norm": 0.20039698049717491, "learning_rate": 0.0001617154902837691, "loss": 0.6333, "step": 1967 }, { "epoch": 0.7447492904446547, "grad_norm": 0.23343885542643306, "learning_rate": 0.00016126426081330148, "loss": 0.5957, "step": 1968 }, { "epoch": 0.7451277199621571, "grad_norm": 0.2147320630467388, "learning_rate": 0.0001608135406765905, "loss": 0.6095, "step": 1969 }, { "epoch": 0.7455061494796594, "grad_norm": 0.22393290611940198, "learning_rate": 0.00016036333055135344, "loss": 0.6109, "step": 1970 }, { "epoch": 0.7458845789971618, "grad_norm": 0.19775771149164922, "learning_rate": 0.0001599136311145402, "loss": 0.6071, "step": 1971 }, { "epoch": 0.7462630085146641, "grad_norm": 0.21220881974387798, "learning_rate": 0.00015946444304233343, "loss": 0.6187, "step": 1972 }, { "epoch": 0.7466414380321665, "grad_norm": 0.21701284768596524, "learning_rate": 0.00015901576701014602, "loss": 0.6151, "step": 1973 }, { "epoch": 0.7470198675496689, "grad_norm": 0.24719536673546236, "learning_rate": 0.0001585676036926219, "loss": 0.6025, "step": 1974 }, { "epoch": 0.7473982970671712, "grad_norm": 0.2133397605481641, "learning_rate": 0.00015811995376363341, "loss": 0.6091, "step": 1975 }, { "epoch": 0.7477767265846736, "grad_norm": 0.22705713078634276, "learning_rate": 0.00015767281789628084, "loss": 0.6002, "step": 1976 }, { "epoch": 0.748155156102176, "grad_norm": 0.1906798944228714, "learning_rate": 0.0001572261967628923, "loss": 0.5951, "step": 1977 }, { "epoch": 0.7485335856196783, "grad_norm": 0.26684413954825537, "learning_rate": 0.00015678009103502094, "loss": 0.6161, "step": 1978 }, { "epoch": 0.7489120151371808, "grad_norm": 0.20278020168097546, "learning_rate": 0.0001563345013834459, "loss": 0.6158, "step": 1979 }, { "epoch": 0.7492904446546831, "grad_norm": 0.22489699317176018, "learning_rate": 0.0001558894284781694, "loss": 0.5661, "step": 1980 }, { "epoch": 0.7496688741721854, "grad_norm": 0.20830049653353505, "learning_rate": 0.00015544487298841753, "loss": 0.6329, "step": 1981 }, { "epoch": 0.7500473036896877, "grad_norm": 0.21113095419433808, "learning_rate": 0.00015500083558263761, "loss": 0.592, "step": 1982 }, { "epoch": 0.7504257332071902, "grad_norm": 0.2416706723396425, "learning_rate": 0.00015455731692849863, "loss": 0.6193, "step": 1983 }, { "epoch": 0.7508041627246925, "grad_norm": 0.20560559150733446, "learning_rate": 0.0001541143176928891, "loss": 0.589, "step": 1984 }, { "epoch": 0.7511825922421949, "grad_norm": 0.1957174299726415, "learning_rate": 0.00015367183854191646, "loss": 0.613, "step": 1985 }, { "epoch": 0.7515610217596973, "grad_norm": 0.22903583686285986, "learning_rate": 0.00015322988014090672, "loss": 0.5836, "step": 1986 }, { "epoch": 0.7519394512771996, "grad_norm": 0.20467658923822665, "learning_rate": 0.00015278844315440215, "loss": 0.5829, "step": 1987 }, { "epoch": 0.752317880794702, "grad_norm": 0.2271557882994104, "learning_rate": 0.00015234752824616165, "loss": 0.6056, "step": 1988 }, { "epoch": 0.7526963103122043, "grad_norm": 0.25020025101015597, "learning_rate": 0.0001519071360791585, "loss": 0.5851, "step": 1989 }, { "epoch": 0.7530747398297067, "grad_norm": 0.23014257269967572, "learning_rate": 0.00015146726731558058, "loss": 0.6128, "step": 1990 }, { "epoch": 0.7534531693472091, "grad_norm": 0.2302597439797881, "learning_rate": 0.00015102792261682813, "loss": 0.607, "step": 1991 }, { "epoch": 0.7538315988647114, "grad_norm": 0.20777719097329414, "learning_rate": 0.00015058910264351404, "loss": 0.606, "step": 1992 }, { "epoch": 0.7542100283822138, "grad_norm": 0.2098750360107164, "learning_rate": 0.00015015080805546162, "loss": 0.6114, "step": 1993 }, { "epoch": 0.7545884578997162, "grad_norm": 0.26958177618159734, "learning_rate": 0.00014971303951170472, "loss": 0.607, "step": 1994 }, { "epoch": 0.7549668874172185, "grad_norm": 0.26307435325261025, "learning_rate": 0.00014927579767048577, "loss": 0.5953, "step": 1995 }, { "epoch": 0.755345316934721, "grad_norm": 0.2391697935792715, "learning_rate": 0.00014883908318925526, "loss": 0.6384, "step": 1996 }, { "epoch": 0.7557237464522233, "grad_norm": 0.24096713698961242, "learning_rate": 0.00014840289672467128, "loss": 0.5895, "step": 1997 }, { "epoch": 0.7561021759697256, "grad_norm": 0.21646307257774647, "learning_rate": 0.0001479672389325971, "loss": 0.6287, "step": 1998 }, { "epoch": 0.756480605487228, "grad_norm": 0.22285643161867166, "learning_rate": 0.00014753211046810206, "loss": 0.6139, "step": 1999 }, { "epoch": 0.7568590350047304, "grad_norm": 0.2273118026152122, "learning_rate": 0.00014709751198545856, "loss": 0.5884, "step": 2000 }, { "epoch": 0.7572374645222327, "grad_norm": 0.23261617504959714, "learning_rate": 0.00014666344413814304, "loss": 0.6015, "step": 2001 }, { "epoch": 0.7576158940397351, "grad_norm": 0.2551174910401463, "learning_rate": 0.00014622990757883337, "loss": 0.5931, "step": 2002 }, { "epoch": 0.7579943235572375, "grad_norm": 0.21592576915603534, "learning_rate": 0.0001457969029594089, "loss": 0.5913, "step": 2003 }, { "epoch": 0.7583727530747398, "grad_norm": 0.21578696152892185, "learning_rate": 0.00014536443093094887, "loss": 0.6392, "step": 2004 }, { "epoch": 0.7587511825922422, "grad_norm": 0.23865122292131555, "learning_rate": 0.00014493249214373222, "loss": 0.6369, "step": 2005 }, { "epoch": 0.7591296121097446, "grad_norm": 0.23965012441723826, "learning_rate": 0.00014450108724723537, "loss": 0.6225, "step": 2006 }, { "epoch": 0.7595080416272469, "grad_norm": 0.1961460066693459, "learning_rate": 0.00014407021689013272, "loss": 0.5888, "step": 2007 }, { "epoch": 0.7598864711447493, "grad_norm": 0.22200762653245718, "learning_rate": 0.00014363988172029417, "loss": 0.5941, "step": 2008 }, { "epoch": 0.7602649006622516, "grad_norm": 0.22112261310796483, "learning_rate": 0.0001432100823847857, "loss": 0.5961, "step": 2009 }, { "epoch": 0.760643330179754, "grad_norm": 0.23220478465870265, "learning_rate": 0.00014278081952986694, "loss": 0.6111, "step": 2010 }, { "epoch": 0.7610217596972564, "grad_norm": 0.23695496900213875, "learning_rate": 0.0001423520938009909, "loss": 0.5873, "step": 2011 }, { "epoch": 0.7614001892147587, "grad_norm": 0.21463802197831489, "learning_rate": 0.00014192390584280345, "loss": 0.5949, "step": 2012 }, { "epoch": 0.7617786187322612, "grad_norm": 0.23042229814672285, "learning_rate": 0.00014149625629914132, "loss": 0.6019, "step": 2013 }, { "epoch": 0.7621570482497635, "grad_norm": 0.2183501642567231, "learning_rate": 0.000141069145813032, "loss": 0.598, "step": 2014 }, { "epoch": 0.7625354777672658, "grad_norm": 0.22451740538110018, "learning_rate": 0.00014064257502669216, "loss": 0.6313, "step": 2015 }, { "epoch": 0.7629139072847683, "grad_norm": 0.23028202898274502, "learning_rate": 0.00014021654458152733, "loss": 0.6301, "step": 2016 }, { "epoch": 0.7632923368022706, "grad_norm": 0.24406547915780996, "learning_rate": 0.00013979105511813, "loss": 0.5942, "step": 2017 }, { "epoch": 0.7636707663197729, "grad_norm": 0.19945657783146425, "learning_rate": 0.00013936610727628002, "loss": 0.6184, "step": 2018 }, { "epoch": 0.7640491958372753, "grad_norm": 0.2137786472933314, "learning_rate": 0.0001389417016949419, "loss": 0.6113, "step": 2019 }, { "epoch": 0.7644276253547777, "grad_norm": 0.20572793734863237, "learning_rate": 0.00013851783901226578, "loss": 0.5863, "step": 2020 }, { "epoch": 0.76480605487228, "grad_norm": 0.19606950921616362, "learning_rate": 0.00013809451986558465, "loss": 0.6143, "step": 2021 }, { "epoch": 0.7651844843897824, "grad_norm": 0.19637201938030444, "learning_rate": 0.0001376717448914145, "loss": 0.6367, "step": 2022 }, { "epoch": 0.7655629139072848, "grad_norm": 0.21809134778461065, "learning_rate": 0.00013724951472545338, "loss": 0.5828, "step": 2023 }, { "epoch": 0.7659413434247871, "grad_norm": 0.24554458530922949, "learning_rate": 0.00013682783000257964, "loss": 0.5901, "step": 2024 }, { "epoch": 0.7663197729422895, "grad_norm": 0.2473517688191186, "learning_rate": 0.00013640669135685202, "loss": 0.609, "step": 2025 }, { "epoch": 0.7666982024597918, "grad_norm": 0.23707578945951668, "learning_rate": 0.00013598609942150764, "loss": 0.5755, "step": 2026 }, { "epoch": 0.7670766319772943, "grad_norm": 0.1961369660619001, "learning_rate": 0.00013556605482896206, "loss": 0.6152, "step": 2027 }, { "epoch": 0.7674550614947966, "grad_norm": 0.2250048928258628, "learning_rate": 0.00013514655821080735, "loss": 0.5948, "step": 2028 }, { "epoch": 0.7678334910122989, "grad_norm": 0.20388183447575514, "learning_rate": 0.00013472761019781216, "loss": 0.5854, "step": 2029 }, { "epoch": 0.7682119205298014, "grad_norm": 0.19982714410288024, "learning_rate": 0.00013430921141991976, "loss": 0.6312, "step": 2030 }, { "epoch": 0.7685903500473037, "grad_norm": 0.20102378801953716, "learning_rate": 0.00013389136250624767, "loss": 0.5813, "step": 2031 }, { "epoch": 0.768968779564806, "grad_norm": 0.19632322099589586, "learning_rate": 0.00013347406408508694, "loss": 0.5754, "step": 2032 }, { "epoch": 0.7693472090823085, "grad_norm": 0.2345695641701837, "learning_rate": 0.00013305731678390047, "loss": 0.6098, "step": 2033 }, { "epoch": 0.7697256385998108, "grad_norm": 0.21760663884975612, "learning_rate": 0.00013264112122932293, "loss": 0.5948, "step": 2034 }, { "epoch": 0.7701040681173131, "grad_norm": 0.20473472363502637, "learning_rate": 0.00013222547804715872, "loss": 0.5748, "step": 2035 }, { "epoch": 0.7704824976348155, "grad_norm": 0.2713734516800718, "learning_rate": 0.00013181038786238248, "loss": 0.5833, "step": 2036 }, { "epoch": 0.7708609271523179, "grad_norm": 0.2189240869855906, "learning_rate": 0.00013139585129913652, "loss": 0.5873, "step": 2037 }, { "epoch": 0.7712393566698202, "grad_norm": 0.25738375588465756, "learning_rate": 0.00013098186898073143, "loss": 0.6295, "step": 2038 }, { "epoch": 0.7716177861873226, "grad_norm": 0.21932247728605578, "learning_rate": 0.00013056844152964386, "loss": 0.5877, "step": 2039 }, { "epoch": 0.771996215704825, "grad_norm": 0.22437189235500968, "learning_rate": 0.00013015556956751668, "loss": 0.5948, "step": 2040 }, { "epoch": 0.7723746452223273, "grad_norm": 0.24450863377123921, "learning_rate": 0.000129743253715157, "loss": 0.5934, "step": 2041 }, { "epoch": 0.7727530747398297, "grad_norm": 0.2155494198125235, "learning_rate": 0.0001293314945925358, "loss": 0.5834, "step": 2042 }, { "epoch": 0.7731315042573321, "grad_norm": 0.24219049944096982, "learning_rate": 0.00012892029281878748, "loss": 0.6661, "step": 2043 }, { "epoch": 0.7735099337748345, "grad_norm": 0.21469041380924694, "learning_rate": 0.00012850964901220762, "loss": 0.6148, "step": 2044 }, { "epoch": 0.7738883632923368, "grad_norm": 0.20713798003080952, "learning_rate": 0.0001280995637902536, "loss": 0.6089, "step": 2045 }, { "epoch": 0.7742667928098391, "grad_norm": 0.21119344160149953, "learning_rate": 0.00012769003776954223, "loss": 0.6089, "step": 2046 }, { "epoch": 0.7746452223273416, "grad_norm": 0.21374536118855647, "learning_rate": 0.00012728107156585, "loss": 0.6065, "step": 2047 }, { "epoch": 0.7750236518448439, "grad_norm": 0.22385999642389254, "learning_rate": 0.0001268726657941111, "loss": 0.6112, "step": 2048 }, { "epoch": 0.7754020813623462, "grad_norm": 0.20702489807173385, "learning_rate": 0.00012646482106841772, "loss": 0.6339, "step": 2049 }, { "epoch": 0.7757805108798487, "grad_norm": 0.20688570052532818, "learning_rate": 0.0001260575380020179, "loss": 0.587, "step": 2050 }, { "epoch": 0.776158940397351, "grad_norm": 0.2033415337150565, "learning_rate": 0.00012565081720731513, "loss": 0.6201, "step": 2051 }, { "epoch": 0.7765373699148533, "grad_norm": 0.19684507846520258, "learning_rate": 0.00012524465929586803, "loss": 0.6115, "step": 2052 }, { "epoch": 0.7769157994323557, "grad_norm": 0.2441442565647873, "learning_rate": 0.00012483906487838808, "loss": 0.5962, "step": 2053 }, { "epoch": 0.7772942289498581, "grad_norm": 0.23389351980240172, "learning_rate": 0.00012443403456474018, "loss": 0.5895, "step": 2054 }, { "epoch": 0.7776726584673604, "grad_norm": 0.22826910170673279, "learning_rate": 0.00012402956896394042, "loss": 0.5914, "step": 2055 }, { "epoch": 0.7780510879848628, "grad_norm": 0.21996803578443588, "learning_rate": 0.00012362566868415631, "loss": 0.6049, "step": 2056 }, { "epoch": 0.7784295175023652, "grad_norm": 0.19837113119101388, "learning_rate": 0.0001232223343327048, "loss": 0.6211, "step": 2057 }, { "epoch": 0.7788079470198676, "grad_norm": 0.22253901758339154, "learning_rate": 0.00012281956651605247, "loss": 0.6101, "step": 2058 }, { "epoch": 0.7791863765373699, "grad_norm": 0.24174535291785287, "learning_rate": 0.00012241736583981346, "loss": 0.5877, "step": 2059 }, { "epoch": 0.7795648060548723, "grad_norm": 0.22014835140443592, "learning_rate": 0.00012201573290874962, "loss": 0.5956, "step": 2060 }, { "epoch": 0.7799432355723747, "grad_norm": 0.2048535161487321, "learning_rate": 0.00012161466832676888, "loss": 0.5923, "step": 2061 }, { "epoch": 0.780321665089877, "grad_norm": 0.21976391792651434, "learning_rate": 0.00012121417269692437, "loss": 0.637, "step": 2062 }, { "epoch": 0.7807000946073793, "grad_norm": 0.21753578732929926, "learning_rate": 0.00012081424662141428, "loss": 0.6216, "step": 2063 }, { "epoch": 0.7810785241248818, "grad_norm": 0.24164591500097035, "learning_rate": 0.0001204148907015798, "loss": 0.5873, "step": 2064 }, { "epoch": 0.7814569536423841, "grad_norm": 0.26044284053240085, "learning_rate": 0.00012001610553790543, "loss": 0.6042, "step": 2065 }, { "epoch": 0.7818353831598864, "grad_norm": 0.22369614583040978, "learning_rate": 0.0001196178917300168, "loss": 0.5989, "step": 2066 }, { "epoch": 0.7822138126773889, "grad_norm": 0.23080674941633256, "learning_rate": 0.00011922024987668106, "loss": 0.6094, "step": 2067 }, { "epoch": 0.7825922421948912, "grad_norm": 0.21828986206214074, "learning_rate": 0.00011882318057580488, "loss": 0.5994, "step": 2068 }, { "epoch": 0.7829706717123935, "grad_norm": 0.2716077620227915, "learning_rate": 0.00011842668442443433, "loss": 0.5785, "step": 2069 }, { "epoch": 0.783349101229896, "grad_norm": 0.24588014021010907, "learning_rate": 0.00011803076201875335, "loss": 0.6226, "step": 2070 }, { "epoch": 0.7837275307473983, "grad_norm": 0.22369262361907177, "learning_rate": 0.00011763541395408367, "loss": 0.5977, "step": 2071 }, { "epoch": 0.7841059602649006, "grad_norm": 0.2310329578594011, "learning_rate": 0.00011724064082488295, "loss": 0.6229, "step": 2072 }, { "epoch": 0.784484389782403, "grad_norm": 0.2041756709863972, "learning_rate": 0.00011684644322474441, "loss": 0.5963, "step": 2073 }, { "epoch": 0.7848628192999054, "grad_norm": 0.230745198243252, "learning_rate": 0.00011645282174639631, "loss": 0.5859, "step": 2074 }, { "epoch": 0.7852412488174078, "grad_norm": 0.22257035254916252, "learning_rate": 0.0001160597769817, "loss": 0.6269, "step": 2075 }, { "epoch": 0.7856196783349101, "grad_norm": 0.20453657672260825, "learning_rate": 0.00011566730952165034, "loss": 0.5875, "step": 2076 }, { "epoch": 0.7859981078524125, "grad_norm": 0.22834885861759083, "learning_rate": 0.00011527541995637348, "loss": 0.62, "step": 2077 }, { "epoch": 0.7863765373699149, "grad_norm": 0.21574968589171953, "learning_rate": 0.00011488410887512729, "loss": 0.5998, "step": 2078 }, { "epoch": 0.7867549668874172, "grad_norm": 0.22072764910262818, "learning_rate": 0.00011449337686629913, "loss": 0.5919, "step": 2079 }, { "epoch": 0.7871333964049196, "grad_norm": 0.2373112179206895, "learning_rate": 0.0001141032245174063, "loss": 0.6127, "step": 2080 }, { "epoch": 0.787511825922422, "grad_norm": 0.214480431004732, "learning_rate": 0.00011371365241509401, "loss": 0.5794, "step": 2081 }, { "epoch": 0.7878902554399243, "grad_norm": 0.19298786539697563, "learning_rate": 0.00011332466114513513, "loss": 0.609, "step": 2082 }, { "epoch": 0.7882686849574266, "grad_norm": 0.23192142655431358, "learning_rate": 0.0001129362512924294, "loss": 0.5917, "step": 2083 }, { "epoch": 0.7886471144749291, "grad_norm": 0.22311437891409908, "learning_rate": 0.00011254842344100191, "loss": 0.6369, "step": 2084 }, { "epoch": 0.7890255439924314, "grad_norm": 0.22909018837092407, "learning_rate": 0.00011216117817400318, "loss": 0.6072, "step": 2085 }, { "epoch": 0.7894039735099337, "grad_norm": 0.234390750991526, "learning_rate": 0.00011177451607370703, "loss": 0.597, "step": 2086 }, { "epoch": 0.7897824030274362, "grad_norm": 0.19657596099319766, "learning_rate": 0.00011138843772151124, "loss": 0.6034, "step": 2087 }, { "epoch": 0.7901608325449385, "grad_norm": 0.23316963504849572, "learning_rate": 0.00011100294369793507, "loss": 0.6133, "step": 2088 }, { "epoch": 0.7905392620624409, "grad_norm": 0.1936036695597591, "learning_rate": 0.00011061803458261976, "loss": 0.6132, "step": 2089 }, { "epoch": 0.7909176915799432, "grad_norm": 0.22826491565739052, "learning_rate": 0.00011023371095432655, "loss": 0.6099, "step": 2090 }, { "epoch": 0.7912961210974456, "grad_norm": 0.2605667238238542, "learning_rate": 0.00010984997339093684, "loss": 0.6027, "step": 2091 }, { "epoch": 0.791674550614948, "grad_norm": 0.22965585270015595, "learning_rate": 0.0001094668224694505, "loss": 0.5971, "step": 2092 }, { "epoch": 0.7920529801324503, "grad_norm": 0.2235387912653008, "learning_rate": 0.0001090842587659851, "loss": 0.6561, "step": 2093 }, { "epoch": 0.7924314096499527, "grad_norm": 0.2153064773171242, "learning_rate": 0.00010870228285577594, "loss": 0.5888, "step": 2094 }, { "epoch": 0.7928098391674551, "grad_norm": 0.2251609415390158, "learning_rate": 0.00010832089531317364, "loss": 0.6024, "step": 2095 }, { "epoch": 0.7931882686849574, "grad_norm": 0.20566073720187314, "learning_rate": 0.00010794009671164484, "loss": 0.5892, "step": 2096 }, { "epoch": 0.7935666982024598, "grad_norm": 0.21762967175383358, "learning_rate": 0.00010755988762377017, "loss": 0.615, "step": 2097 }, { "epoch": 0.7939451277199622, "grad_norm": 0.20775565682179592, "learning_rate": 0.00010718026862124425, "loss": 0.6167, "step": 2098 }, { "epoch": 0.7943235572374645, "grad_norm": 0.1941487642573813, "learning_rate": 0.00010680124027487393, "loss": 0.6055, "step": 2099 }, { "epoch": 0.7947019867549668, "grad_norm": 0.23665138079133427, "learning_rate": 0.00010642280315457848, "loss": 0.6125, "step": 2100 }, { "epoch": 0.7950804162724693, "grad_norm": 0.2106374354752382, "learning_rate": 0.00010604495782938772, "loss": 0.5959, "step": 2101 }, { "epoch": 0.7954588457899716, "grad_norm": 0.2651779114704443, "learning_rate": 0.00010566770486744171, "loss": 0.5829, "step": 2102 }, { "epoch": 0.795837275307474, "grad_norm": 0.22015957868400154, "learning_rate": 0.00010529104483599022, "loss": 0.6182, "step": 2103 }, { "epoch": 0.7962157048249764, "grad_norm": 0.21690214024387422, "learning_rate": 0.00010491497830139091, "loss": 0.6111, "step": 2104 }, { "epoch": 0.7965941343424787, "grad_norm": 0.21930658682480111, "learning_rate": 0.00010453950582910954, "loss": 0.6094, "step": 2105 }, { "epoch": 0.796972563859981, "grad_norm": 0.220714526599313, "learning_rate": 0.00010416462798371806, "loss": 0.6006, "step": 2106 }, { "epoch": 0.7973509933774835, "grad_norm": 0.21612365256485733, "learning_rate": 0.00010379034532889503, "loss": 0.6125, "step": 2107 }, { "epoch": 0.7977294228949858, "grad_norm": 0.19130608798961765, "learning_rate": 0.00010341665842742326, "loss": 0.5906, "step": 2108 }, { "epoch": 0.7981078524124882, "grad_norm": 0.25623141605165156, "learning_rate": 0.00010304356784119057, "loss": 0.5787, "step": 2109 }, { "epoch": 0.7984862819299905, "grad_norm": 0.20484517809139455, "learning_rate": 0.00010267107413118743, "loss": 0.584, "step": 2110 }, { "epoch": 0.7988647114474929, "grad_norm": 0.21202556471767994, "learning_rate": 0.00010229917785750743, "loss": 0.6033, "step": 2111 }, { "epoch": 0.7992431409649953, "grad_norm": 0.19835541221263114, "learning_rate": 0.00010192787957934535, "loss": 0.5864, "step": 2112 }, { "epoch": 0.7996215704824976, "grad_norm": 0.21579481638829048, "learning_rate": 0.00010155717985499696, "loss": 0.6053, "step": 2113 }, { "epoch": 0.8, "grad_norm": 0.20621123852773765, "learning_rate": 0.00010118707924185832, "loss": 0.6398, "step": 2114 }, { "epoch": 0.8003784295175024, "grad_norm": 0.2450864248235344, "learning_rate": 0.00010081757829642413, "loss": 0.5897, "step": 2115 }, { "epoch": 0.8007568590350047, "grad_norm": 0.19989443242939048, "learning_rate": 0.00010044867757428793, "loss": 0.6065, "step": 2116 }, { "epoch": 0.801135288552507, "grad_norm": 0.21658584702823586, "learning_rate": 0.00010008037763014032, "loss": 0.6371, "step": 2117 }, { "epoch": 0.8015137180700095, "grad_norm": 0.23440918673094702, "learning_rate": 9.971267901776888e-05, "loss": 0.6039, "step": 2118 }, { "epoch": 0.8018921475875118, "grad_norm": 0.238542616660435, "learning_rate": 9.934558229005663e-05, "loss": 0.6067, "step": 2119 }, { "epoch": 0.8022705771050141, "grad_norm": 0.21922312388879428, "learning_rate": 9.897908799898209e-05, "loss": 0.6109, "step": 2120 }, { "epoch": 0.8026490066225166, "grad_norm": 0.19736606571398302, "learning_rate": 9.861319669561735e-05, "loss": 0.5827, "step": 2121 }, { "epoch": 0.8030274361400189, "grad_norm": 0.23416727715601593, "learning_rate": 9.824790893012841e-05, "loss": 0.5965, "step": 2122 }, { "epoch": 0.8034058656575213, "grad_norm": 0.20859071607919347, "learning_rate": 9.78832252517734e-05, "loss": 0.6035, "step": 2123 }, { "epoch": 0.8037842951750237, "grad_norm": 0.22249729773669952, "learning_rate": 9.751914620890207e-05, "loss": 0.5915, "step": 2124 }, { "epoch": 0.804162724692526, "grad_norm": 0.2175115884236853, "learning_rate": 9.715567234895539e-05, "loss": 0.6052, "step": 2125 }, { "epoch": 0.8045411542100284, "grad_norm": 0.22820449907045237, "learning_rate": 9.679280421846392e-05, "loss": 0.6091, "step": 2126 }, { "epoch": 0.8049195837275307, "grad_norm": 0.2264716854271054, "learning_rate": 9.643054236304788e-05, "loss": 0.622, "step": 2127 }, { "epoch": 0.8052980132450331, "grad_norm": 0.23118340686836758, "learning_rate": 9.606888732741536e-05, "loss": 0.5911, "step": 2128 }, { "epoch": 0.8056764427625355, "grad_norm": 0.2045007388741639, "learning_rate": 9.570783965536261e-05, "loss": 0.5875, "step": 2129 }, { "epoch": 0.8060548722800378, "grad_norm": 0.19610275612625103, "learning_rate": 9.534739988977198e-05, "loss": 0.601, "step": 2130 }, { "epoch": 0.8064333017975402, "grad_norm": 0.22383844508268047, "learning_rate": 9.498756857261243e-05, "loss": 0.6239, "step": 2131 }, { "epoch": 0.8068117313150426, "grad_norm": 0.23759523508866373, "learning_rate": 9.462834624493755e-05, "loss": 0.6095, "step": 2132 }, { "epoch": 0.8071901608325449, "grad_norm": 0.22695605855546114, "learning_rate": 9.426973344688516e-05, "loss": 0.5794, "step": 2133 }, { "epoch": 0.8075685903500474, "grad_norm": 0.1918730784453557, "learning_rate": 9.391173071767716e-05, "loss": 0.6134, "step": 2134 }, { "epoch": 0.8079470198675497, "grad_norm": 0.24796582196819913, "learning_rate": 9.355433859561741e-05, "loss": 0.6048, "step": 2135 }, { "epoch": 0.808325449385052, "grad_norm": 0.23136061716226086, "learning_rate": 9.319755761809235e-05, "loss": 0.5894, "step": 2136 }, { "epoch": 0.8087038789025544, "grad_norm": 0.268845861548805, "learning_rate": 9.284138832156875e-05, "loss": 0.5991, "step": 2137 }, { "epoch": 0.8090823084200568, "grad_norm": 0.2051941498019604, "learning_rate": 9.248583124159438e-05, "loss": 0.6094, "step": 2138 }, { "epoch": 0.8094607379375591, "grad_norm": 0.19687076745706072, "learning_rate": 9.213088691279576e-05, "loss": 0.5933, "step": 2139 }, { "epoch": 0.8098391674550615, "grad_norm": 0.21409662559425596, "learning_rate": 9.177655586887873e-05, "loss": 0.5962, "step": 2140 }, { "epoch": 0.8102175969725639, "grad_norm": 0.24132579894965525, "learning_rate": 9.14228386426263e-05, "loss": 0.606, "step": 2141 }, { "epoch": 0.8105960264900662, "grad_norm": 0.20613304564676263, "learning_rate": 9.106973576589922e-05, "loss": 0.5958, "step": 2142 }, { "epoch": 0.8109744560075686, "grad_norm": 0.21197363899359556, "learning_rate": 9.071724776963386e-05, "loss": 0.5925, "step": 2143 }, { "epoch": 0.811352885525071, "grad_norm": 0.197050241406643, "learning_rate": 9.03653751838423e-05, "loss": 0.5916, "step": 2144 }, { "epoch": 0.8117313150425733, "grad_norm": 0.2360728911167388, "learning_rate": 9.001411853761148e-05, "loss": 0.6016, "step": 2145 }, { "epoch": 0.8121097445600757, "grad_norm": 0.3063965816421658, "learning_rate": 8.966347835910177e-05, "loss": 0.5791, "step": 2146 }, { "epoch": 0.812488174077578, "grad_norm": 0.21226695338659454, "learning_rate": 8.931345517554701e-05, "loss": 0.6045, "step": 2147 }, { "epoch": 0.8128666035950805, "grad_norm": 0.2156930608726167, "learning_rate": 8.896404951325294e-05, "loss": 0.5926, "step": 2148 }, { "epoch": 0.8132450331125828, "grad_norm": 0.18908270775171857, "learning_rate": 8.861526189759705e-05, "loss": 0.6022, "step": 2149 }, { "epoch": 0.8136234626300851, "grad_norm": 0.2107390070296056, "learning_rate": 8.826709285302736e-05, "loss": 0.5743, "step": 2150 }, { "epoch": 0.8140018921475876, "grad_norm": 0.21716082155227545, "learning_rate": 8.791954290306198e-05, "loss": 0.6118, "step": 2151 }, { "epoch": 0.8143803216650899, "grad_norm": 0.2074816680474225, "learning_rate": 8.757261257028776e-05, "loss": 0.6119, "step": 2152 }, { "epoch": 0.8147587511825922, "grad_norm": 0.22732306940198033, "learning_rate": 8.722630237636037e-05, "loss": 0.5906, "step": 2153 }, { "epoch": 0.8151371807000946, "grad_norm": 0.23714727510466801, "learning_rate": 8.688061284200266e-05, "loss": 0.5727, "step": 2154 }, { "epoch": 0.815515610217597, "grad_norm": 0.22167923376902388, "learning_rate": 8.653554448700412e-05, "loss": 0.6185, "step": 2155 }, { "epoch": 0.8158940397350993, "grad_norm": 0.22110027263172513, "learning_rate": 8.619109783022072e-05, "loss": 0.566, "step": 2156 }, { "epoch": 0.8162724692526017, "grad_norm": 0.22616893193202856, "learning_rate": 8.584727338957315e-05, "loss": 0.601, "step": 2157 }, { "epoch": 0.8166508987701041, "grad_norm": 0.22098197032184114, "learning_rate": 8.550407168204683e-05, "loss": 0.6229, "step": 2158 }, { "epoch": 0.8170293282876064, "grad_norm": 0.19905900160142237, "learning_rate": 8.516149322369055e-05, "loss": 0.557, "step": 2159 }, { "epoch": 0.8174077578051088, "grad_norm": 0.19585095568935768, "learning_rate": 8.481953852961628e-05, "loss": 0.5921, "step": 2160 }, { "epoch": 0.8177861873226112, "grad_norm": 0.22611280734001082, "learning_rate": 8.447820811399765e-05, "loss": 0.6057, "step": 2161 }, { "epoch": 0.8181646168401135, "grad_norm": 0.21327453085687112, "learning_rate": 8.413750249007013e-05, "loss": 0.5981, "step": 2162 }, { "epoch": 0.8185430463576159, "grad_norm": 0.21079263304336615, "learning_rate": 8.379742217012931e-05, "loss": 0.6196, "step": 2163 }, { "epoch": 0.8189214758751182, "grad_norm": 0.2266932554127013, "learning_rate": 8.345796766553066e-05, "loss": 0.611, "step": 2164 }, { "epoch": 0.8192999053926207, "grad_norm": 0.2262401095752353, "learning_rate": 8.311913948668882e-05, "loss": 0.6168, "step": 2165 }, { "epoch": 0.819678334910123, "grad_norm": 0.20714657648334278, "learning_rate": 8.278093814307636e-05, "loss": 0.588, "step": 2166 }, { "epoch": 0.8200567644276253, "grad_norm": 0.21090547883738767, "learning_rate": 8.244336414322374e-05, "loss": 0.5917, "step": 2167 }, { "epoch": 0.8204351939451278, "grad_norm": 0.1879769854434017, "learning_rate": 8.210641799471763e-05, "loss": 0.6015, "step": 2168 }, { "epoch": 0.8208136234626301, "grad_norm": 0.23029087767429335, "learning_rate": 8.177010020420118e-05, "loss": 0.6098, "step": 2169 }, { "epoch": 0.8211920529801324, "grad_norm": 0.20933784468334238, "learning_rate": 8.143441127737216e-05, "loss": 0.6193, "step": 2170 }, { "epoch": 0.8215704824976349, "grad_norm": 0.2509346121672925, "learning_rate": 8.10993517189833e-05, "loss": 0.582, "step": 2171 }, { "epoch": 0.8219489120151372, "grad_norm": 0.20785047284582484, "learning_rate": 8.076492203284053e-05, "loss": 0.6166, "step": 2172 }, { "epoch": 0.8223273415326395, "grad_norm": 0.21039461989776365, "learning_rate": 8.043112272180309e-05, "loss": 0.5812, "step": 2173 }, { "epoch": 0.8227057710501419, "grad_norm": 0.21180076910735693, "learning_rate": 8.009795428778204e-05, "loss": 0.5911, "step": 2174 }, { "epoch": 0.8230842005676443, "grad_norm": 0.24333198907613685, "learning_rate": 7.976541723173986e-05, "loss": 0.5882, "step": 2175 }, { "epoch": 0.8234626300851466, "grad_norm": 0.3414923482962023, "learning_rate": 7.943351205369004e-05, "loss": 0.5985, "step": 2176 }, { "epoch": 0.823841059602649, "grad_norm": 0.20430752524045873, "learning_rate": 7.910223925269539e-05, "loss": 0.5993, "step": 2177 }, { "epoch": 0.8242194891201514, "grad_norm": 0.2274561636867794, "learning_rate": 7.877159932686839e-05, "loss": 0.5994, "step": 2178 }, { "epoch": 0.8245979186376537, "grad_norm": 0.22441854248355994, "learning_rate": 7.844159277336948e-05, "loss": 0.5951, "step": 2179 }, { "epoch": 0.8249763481551561, "grad_norm": 0.21549069361329146, "learning_rate": 7.811222008840718e-05, "loss": 0.5639, "step": 2180 }, { "epoch": 0.8253547776726584, "grad_norm": 0.20676284161232683, "learning_rate": 7.778348176723643e-05, "loss": 0.6012, "step": 2181 }, { "epoch": 0.8257332071901609, "grad_norm": 0.21410520364316535, "learning_rate": 7.745537830415877e-05, "loss": 0.6067, "step": 2182 }, { "epoch": 0.8261116367076632, "grad_norm": 0.22433139350722214, "learning_rate": 7.71279101925208e-05, "loss": 0.5971, "step": 2183 }, { "epoch": 0.8264900662251655, "grad_norm": 0.22512178436515973, "learning_rate": 7.68010779247138e-05, "loss": 0.6061, "step": 2184 }, { "epoch": 0.826868495742668, "grad_norm": 0.24165844814258303, "learning_rate": 7.647488199217333e-05, "loss": 0.6251, "step": 2185 }, { "epoch": 0.8272469252601703, "grad_norm": 0.22582003184474153, "learning_rate": 7.614932288537774e-05, "loss": 0.6142, "step": 2186 }, { "epoch": 0.8276253547776726, "grad_norm": 0.22343290964493134, "learning_rate": 7.582440109384809e-05, "loss": 0.6073, "step": 2187 }, { "epoch": 0.8280037842951751, "grad_norm": 0.1929256989645472, "learning_rate": 7.55001171061469e-05, "loss": 0.6013, "step": 2188 }, { "epoch": 0.8283822138126774, "grad_norm": 0.2013418997603049, "learning_rate": 7.517647140987798e-05, "loss": 0.5918, "step": 2189 }, { "epoch": 0.8287606433301797, "grad_norm": 0.1925811519733108, "learning_rate": 7.485346449168512e-05, "loss": 0.5725, "step": 2190 }, { "epoch": 0.8291390728476821, "grad_norm": 0.21041034242045828, "learning_rate": 7.45310968372518e-05, "loss": 0.604, "step": 2191 }, { "epoch": 0.8295175023651845, "grad_norm": 0.21987001610551352, "learning_rate": 7.42093689313001e-05, "loss": 0.6232, "step": 2192 }, { "epoch": 0.8298959318826868, "grad_norm": 0.20646961974914466, "learning_rate": 7.38882812575904e-05, "loss": 0.5977, "step": 2193 }, { "epoch": 0.8302743614001892, "grad_norm": 0.23020748912144406, "learning_rate": 7.356783429892023e-05, "loss": 0.5856, "step": 2194 }, { "epoch": 0.8306527909176916, "grad_norm": 0.22121323231512655, "learning_rate": 7.324802853712354e-05, "loss": 0.6202, "step": 2195 }, { "epoch": 0.831031220435194, "grad_norm": 2.25885239251363, "learning_rate": 7.292886445307073e-05, "loss": 0.6164, "step": 2196 }, { "epoch": 0.8314096499526963, "grad_norm": 0.25515183322912477, "learning_rate": 7.261034252666671e-05, "loss": 0.5983, "step": 2197 }, { "epoch": 0.8317880794701987, "grad_norm": 0.240985935305313, "learning_rate": 7.22924632368513e-05, "loss": 0.5826, "step": 2198 }, { "epoch": 0.8321665089877011, "grad_norm": 0.24569928631717694, "learning_rate": 7.19752270615977e-05, "loss": 0.5845, "step": 2199 }, { "epoch": 0.8325449385052034, "grad_norm": 0.21540864027172807, "learning_rate": 7.165863447791237e-05, "loss": 0.586, "step": 2200 }, { "epoch": 0.8329233680227057, "grad_norm": 0.24186657592731606, "learning_rate": 7.13426859618338e-05, "loss": 0.6058, "step": 2201 }, { "epoch": 0.8333017975402082, "grad_norm": 0.21329024194619267, "learning_rate": 7.102738198843228e-05, "loss": 0.6194, "step": 2202 }, { "epoch": 0.8336802270577105, "grad_norm": 0.2241957629446156, "learning_rate": 7.07127230318087e-05, "loss": 0.6003, "step": 2203 }, { "epoch": 0.8340586565752128, "grad_norm": 0.2269546141058546, "learning_rate": 7.039870956509432e-05, "loss": 0.5866, "step": 2204 }, { "epoch": 0.8344370860927153, "grad_norm": 0.19707221757102195, "learning_rate": 7.00853420604497e-05, "loss": 0.5842, "step": 2205 }, { "epoch": 0.8348155156102176, "grad_norm": 0.22243929214270075, "learning_rate": 6.977262098906389e-05, "loss": 0.6191, "step": 2206 }, { "epoch": 0.8351939451277199, "grad_norm": 0.2427885227319505, "learning_rate": 6.946054682115455e-05, "loss": 0.5888, "step": 2207 }, { "epoch": 0.8355723746452224, "grad_norm": 0.2409472394773229, "learning_rate": 6.91491200259659e-05, "loss": 0.6152, "step": 2208 }, { "epoch": 0.8359508041627247, "grad_norm": 0.22039461093720691, "learning_rate": 6.88383410717694e-05, "loss": 0.6123, "step": 2209 }, { "epoch": 0.836329233680227, "grad_norm": 0.21683543094389607, "learning_rate": 6.852821042586183e-05, "loss": 0.5967, "step": 2210 }, { "epoch": 0.8367076631977294, "grad_norm": 0.24277175141636725, "learning_rate": 6.821872855456562e-05, "loss": 0.5886, "step": 2211 }, { "epoch": 0.8370860927152318, "grad_norm": 0.22923078055557453, "learning_rate": 6.790989592322739e-05, "loss": 0.5993, "step": 2212 }, { "epoch": 0.8374645222327342, "grad_norm": 0.16863112368609523, "learning_rate": 6.760171299621776e-05, "loss": 0.6004, "step": 2213 }, { "epoch": 0.8378429517502365, "grad_norm": 0.23630067965453755, "learning_rate": 6.729418023693024e-05, "loss": 0.6109, "step": 2214 }, { "epoch": 0.8382213812677389, "grad_norm": 0.23726331876124418, "learning_rate": 6.698729810778065e-05, "loss": 0.5802, "step": 2215 }, { "epoch": 0.8385998107852413, "grad_norm": 0.1970768582454489, "learning_rate": 6.668106707020694e-05, "loss": 0.6166, "step": 2216 }, { "epoch": 0.8389782403027436, "grad_norm": 0.2308447486258875, "learning_rate": 6.637548758466749e-05, "loss": 0.5746, "step": 2217 }, { "epoch": 0.8393566698202459, "grad_norm": 0.23802069191681538, "learning_rate": 6.607056011064155e-05, "loss": 0.5821, "step": 2218 }, { "epoch": 0.8397350993377484, "grad_norm": 0.2196665194383812, "learning_rate": 6.576628510662741e-05, "loss": 0.6174, "step": 2219 }, { "epoch": 0.8401135288552507, "grad_norm": 0.2390508418263651, "learning_rate": 6.546266303014281e-05, "loss": 0.5926, "step": 2220 }, { "epoch": 0.840491958372753, "grad_norm": 0.25864639668295125, "learning_rate": 6.515969433772328e-05, "loss": 0.6197, "step": 2221 }, { "epoch": 0.8408703878902555, "grad_norm": 0.23026961274080798, "learning_rate": 6.485737948492237e-05, "loss": 0.5827, "step": 2222 }, { "epoch": 0.8412488174077578, "grad_norm": 0.2683813472009668, "learning_rate": 6.455571892631001e-05, "loss": 0.6231, "step": 2223 }, { "epoch": 0.8416272469252601, "grad_norm": 0.20853784576393752, "learning_rate": 6.425471311547276e-05, "loss": 0.6079, "step": 2224 }, { "epoch": 0.8420056764427626, "grad_norm": 0.21442516471208897, "learning_rate": 6.395436250501235e-05, "loss": 0.601, "step": 2225 }, { "epoch": 0.8423841059602649, "grad_norm": 0.21358736448642543, "learning_rate": 6.365466754654531e-05, "loss": 0.605, "step": 2226 }, { "epoch": 0.8427625354777672, "grad_norm": 0.21675107661203166, "learning_rate": 6.335562869070272e-05, "loss": 0.6049, "step": 2227 }, { "epoch": 0.8431409649952696, "grad_norm": 0.22240731059319058, "learning_rate": 6.305724638712856e-05, "loss": 0.5813, "step": 2228 }, { "epoch": 0.843519394512772, "grad_norm": 0.20948708757960494, "learning_rate": 6.275952108448018e-05, "loss": 0.6156, "step": 2229 }, { "epoch": 0.8438978240302744, "grad_norm": 0.2431742884312656, "learning_rate": 6.246245323042648e-05, "loss": 0.5835, "step": 2230 }, { "epoch": 0.8442762535477767, "grad_norm": 0.2549615497210636, "learning_rate": 6.216604327164827e-05, "loss": 0.6055, "step": 2231 }, { "epoch": 0.8446546830652791, "grad_norm": 0.2012334192447164, "learning_rate": 6.187029165383679e-05, "loss": 0.5995, "step": 2232 }, { "epoch": 0.8450331125827815, "grad_norm": 0.2546016121190357, "learning_rate": 6.157519882169365e-05, "loss": 0.5751, "step": 2233 }, { "epoch": 0.8454115421002838, "grad_norm": 0.22674376537043564, "learning_rate": 6.128076521892956e-05, "loss": 0.6093, "step": 2234 }, { "epoch": 0.8457899716177862, "grad_norm": 0.2111685335358541, "learning_rate": 6.098699128826446e-05, "loss": 0.6046, "step": 2235 }, { "epoch": 0.8461684011352886, "grad_norm": 0.22703141981041078, "learning_rate": 6.0693877471425906e-05, "loss": 0.602, "step": 2236 }, { "epoch": 0.8465468306527909, "grad_norm": 0.1896328908910461, "learning_rate": 6.0401424209149016e-05, "loss": 0.5937, "step": 2237 }, { "epoch": 0.8469252601702932, "grad_norm": 0.1931850340965662, "learning_rate": 6.010963194117603e-05, "loss": 0.598, "step": 2238 }, { "epoch": 0.8473036896877957, "grad_norm": 0.20225837040308925, "learning_rate": 5.981850110625475e-05, "loss": 0.6037, "step": 2239 }, { "epoch": 0.847682119205298, "grad_norm": 0.24811643090813348, "learning_rate": 5.952803214213887e-05, "loss": 0.5857, "step": 2240 }, { "epoch": 0.8480605487228003, "grad_norm": 0.20233588810968756, "learning_rate": 5.923822548558655e-05, "loss": 0.6224, "step": 2241 }, { "epoch": 0.8484389782403028, "grad_norm": 0.2487306931086151, "learning_rate": 5.894908157236045e-05, "loss": 0.6063, "step": 2242 }, { "epoch": 0.8488174077578051, "grad_norm": 0.24022110848120728, "learning_rate": 5.866060083722624e-05, "loss": 0.6186, "step": 2243 }, { "epoch": 0.8491958372753075, "grad_norm": 0.21684914036246827, "learning_rate": 5.837278371395288e-05, "loss": 0.5694, "step": 2244 }, { "epoch": 0.8495742667928098, "grad_norm": 0.23716621911597988, "learning_rate": 5.8085630635311194e-05, "loss": 0.5965, "step": 2245 }, { "epoch": 0.8499526963103122, "grad_norm": 0.23239295042197058, "learning_rate": 5.779914203307357e-05, "loss": 0.61, "step": 2246 }, { "epoch": 0.8503311258278146, "grad_norm": 0.2063265629470553, "learning_rate": 5.751331833801343e-05, "loss": 0.5716, "step": 2247 }, { "epoch": 0.8507095553453169, "grad_norm": 0.26101321874957745, "learning_rate": 5.72281599799041e-05, "loss": 0.5782, "step": 2248 }, { "epoch": 0.8510879848628193, "grad_norm": 0.23582616414899232, "learning_rate": 5.694366738751905e-05, "loss": 0.5811, "step": 2249 }, { "epoch": 0.8514664143803217, "grad_norm": 0.2007390152264314, "learning_rate": 5.665984098862992e-05, "loss": 0.6003, "step": 2250 }, { "epoch": 0.851844843897824, "grad_norm": 0.2225325893527324, "learning_rate": 5.6376681210007386e-05, "loss": 0.5761, "step": 2251 }, { "epoch": 0.8522232734153264, "grad_norm": 0.20325214734767175, "learning_rate": 5.609418847741926e-05, "loss": 0.5948, "step": 2252 }, { "epoch": 0.8526017029328288, "grad_norm": 0.2290074362411586, "learning_rate": 5.58123632156306e-05, "loss": 0.6101, "step": 2253 }, { "epoch": 0.8529801324503311, "grad_norm": 0.21109522115221585, "learning_rate": 5.5531205848402655e-05, "loss": 0.6003, "step": 2254 }, { "epoch": 0.8533585619678334, "grad_norm": 0.22785325521739255, "learning_rate": 5.525071679849275e-05, "loss": 0.5992, "step": 2255 }, { "epoch": 0.8537369914853359, "grad_norm": 0.21695785731593129, "learning_rate": 5.497089648765296e-05, "loss": 0.6219, "step": 2256 }, { "epoch": 0.8541154210028382, "grad_norm": 0.22231056751229575, "learning_rate": 5.469174533662979e-05, "loss": 0.5831, "step": 2257 }, { "epoch": 0.8544938505203405, "grad_norm": 0.20121503570654717, "learning_rate": 5.4413263765163976e-05, "loss": 0.6348, "step": 2258 }, { "epoch": 0.854872280037843, "grad_norm": 0.20247032043688756, "learning_rate": 5.4135452191989124e-05, "loss": 0.5964, "step": 2259 }, { "epoch": 0.8552507095553453, "grad_norm": 0.21106092939055085, "learning_rate": 5.385831103483141e-05, "loss": 0.5695, "step": 2260 }, { "epoch": 0.8556291390728477, "grad_norm": 0.20797261759703414, "learning_rate": 5.3581840710409005e-05, "loss": 0.6007, "step": 2261 }, { "epoch": 0.8560075685903501, "grad_norm": 0.20817997572926117, "learning_rate": 5.330604163443159e-05, "loss": 0.5824, "step": 2262 }, { "epoch": 0.8563859981078524, "grad_norm": 0.2391555411850312, "learning_rate": 5.303091422159923e-05, "loss": 0.6214, "step": 2263 }, { "epoch": 0.8567644276253548, "grad_norm": 0.22755427166448325, "learning_rate": 5.275645888560232e-05, "loss": 0.5895, "step": 2264 }, { "epoch": 0.8571428571428571, "grad_norm": 0.21946207870898804, "learning_rate": 5.248267603912038e-05, "loss": 0.5842, "step": 2265 }, { "epoch": 0.8575212866603595, "grad_norm": 0.22075999210927455, "learning_rate": 5.2209566093822156e-05, "loss": 0.588, "step": 2266 }, { "epoch": 0.8578997161778619, "grad_norm": 0.2190602945796294, "learning_rate": 5.193712946036416e-05, "loss": 0.5819, "step": 2267 }, { "epoch": 0.8582781456953642, "grad_norm": 0.229195130632347, "learning_rate": 5.166536654839099e-05, "loss": 0.61, "step": 2268 }, { "epoch": 0.8586565752128666, "grad_norm": 0.21444668925205182, "learning_rate": 5.1394277766533715e-05, "loss": 0.6093, "step": 2269 }, { "epoch": 0.859035004730369, "grad_norm": 0.24789122706742003, "learning_rate": 5.1123863522410165e-05, "loss": 0.5914, "step": 2270 }, { "epoch": 0.8594134342478713, "grad_norm": 0.2055359045310464, "learning_rate": 5.085412422262364e-05, "loss": 0.583, "step": 2271 }, { "epoch": 0.8597918637653738, "grad_norm": 0.2036264538792084, "learning_rate": 5.058506027276261e-05, "loss": 0.6095, "step": 2272 }, { "epoch": 0.8601702932828761, "grad_norm": 0.202308757068562, "learning_rate": 5.031667207740026e-05, "loss": 0.5845, "step": 2273 }, { "epoch": 0.8605487228003784, "grad_norm": 0.20851667601862553, "learning_rate": 5.004896004009346e-05, "loss": 0.6017, "step": 2274 }, { "epoch": 0.8609271523178808, "grad_norm": 0.24501403978806954, "learning_rate": 4.97819245633826e-05, "loss": 0.6221, "step": 2275 }, { "epoch": 0.8613055818353832, "grad_norm": 0.22923627341753688, "learning_rate": 4.9515566048790485e-05, "loss": 0.5898, "step": 2276 }, { "epoch": 0.8616840113528855, "grad_norm": 0.2062602351278905, "learning_rate": 4.9249884896822364e-05, "loss": 0.5835, "step": 2277 }, { "epoch": 0.8620624408703879, "grad_norm": 0.21943847419937562, "learning_rate": 4.8984881506964676e-05, "loss": 0.6159, "step": 2278 }, { "epoch": 0.8624408703878903, "grad_norm": 0.23284433742424007, "learning_rate": 4.8720556277685015e-05, "loss": 0.5931, "step": 2279 }, { "epoch": 0.8628192999053926, "grad_norm": 0.2355708687298488, "learning_rate": 4.845690960643107e-05, "loss": 0.6056, "step": 2280 }, { "epoch": 0.863197729422895, "grad_norm": 0.2134709803347871, "learning_rate": 4.819394188963022e-05, "loss": 0.6132, "step": 2281 }, { "epoch": 0.8635761589403973, "grad_norm": 0.194056055388887, "learning_rate": 4.7931653522689245e-05, "loss": 0.571, "step": 2282 }, { "epoch": 0.8639545884578997, "grad_norm": 0.21372679888420812, "learning_rate": 4.7670044899992994e-05, "loss": 0.6043, "step": 2283 }, { "epoch": 0.8643330179754021, "grad_norm": 0.20821447359400028, "learning_rate": 4.740911641490464e-05, "loss": 0.6308, "step": 2284 }, { "epoch": 0.8647114474929044, "grad_norm": 0.21503392385775377, "learning_rate": 4.714886845976429e-05, "loss": 0.6096, "step": 2285 }, { "epoch": 0.8650898770104068, "grad_norm": 0.21930825151289346, "learning_rate": 4.688930142588921e-05, "loss": 0.5907, "step": 2286 }, { "epoch": 0.8654683065279092, "grad_norm": 0.2417826036669518, "learning_rate": 4.66304157035724e-05, "loss": 0.6001, "step": 2287 }, { "epoch": 0.8658467360454115, "grad_norm": 0.23098373258736574, "learning_rate": 4.6372211682082835e-05, "loss": 0.5897, "step": 2288 }, { "epoch": 0.866225165562914, "grad_norm": 0.23744644828677616, "learning_rate": 4.6114689749663987e-05, "loss": 0.5914, "step": 2289 }, { "epoch": 0.8666035950804163, "grad_norm": 0.22880379528919317, "learning_rate": 4.585785029353412e-05, "loss": 0.5885, "step": 2290 }, { "epoch": 0.8669820245979186, "grad_norm": 0.22530778853507458, "learning_rate": 4.560169369988515e-05, "loss": 0.5857, "step": 2291 }, { "epoch": 0.867360454115421, "grad_norm": 0.20244353864165957, "learning_rate": 4.5346220353882137e-05, "loss": 0.5903, "step": 2292 }, { "epoch": 0.8677388836329234, "grad_norm": 0.19122803429452795, "learning_rate": 4.509143063966298e-05, "loss": 0.6136, "step": 2293 }, { "epoch": 0.8681173131504257, "grad_norm": 0.21112415214795197, "learning_rate": 4.483732494033738e-05, "loss": 0.5954, "step": 2294 }, { "epoch": 0.8684957426679281, "grad_norm": 0.21053918566136284, "learning_rate": 4.4583903637986924e-05, "loss": 0.5999, "step": 2295 }, { "epoch": 0.8688741721854305, "grad_norm": 0.21293298492369794, "learning_rate": 4.433116711366364e-05, "loss": 0.6098, "step": 2296 }, { "epoch": 0.8692526017029328, "grad_norm": 0.23862782273139202, "learning_rate": 4.4079115747390374e-05, "loss": 0.63, "step": 2297 }, { "epoch": 0.8696310312204352, "grad_norm": 0.23236595410710076, "learning_rate": 4.38277499181593e-05, "loss": 0.6093, "step": 2298 }, { "epoch": 0.8700094607379376, "grad_norm": 0.24077519739057948, "learning_rate": 4.357707000393224e-05, "loss": 0.6162, "step": 2299 }, { "epoch": 0.8703878902554399, "grad_norm": 0.20839558307155095, "learning_rate": 4.332707638163935e-05, "loss": 0.6125, "step": 2300 }, { "epoch": 0.8707663197729423, "grad_norm": 0.23351130290417088, "learning_rate": 4.307776942717884e-05, "loss": 0.6116, "step": 2301 }, { "epoch": 0.8711447492904446, "grad_norm": 0.20594331705621452, "learning_rate": 4.282914951541661e-05, "loss": 0.5847, "step": 2302 }, { "epoch": 0.871523178807947, "grad_norm": 0.21445440212070982, "learning_rate": 4.2581217020185356e-05, "loss": 0.5967, "step": 2303 }, { "epoch": 0.8719016083254494, "grad_norm": 0.21726098086850582, "learning_rate": 4.233397231428432e-05, "loss": 0.5969, "step": 2304 }, { "epoch": 0.8722800378429517, "grad_norm": 0.24052686640900775, "learning_rate": 4.2087415769478333e-05, "loss": 0.5946, "step": 2305 }, { "epoch": 0.8726584673604542, "grad_norm": 0.2560174904814415, "learning_rate": 4.184154775649768e-05, "loss": 0.5994, "step": 2306 }, { "epoch": 0.8730368968779565, "grad_norm": 0.20559973004908783, "learning_rate": 4.1596368645037185e-05, "loss": 0.5711, "step": 2307 }, { "epoch": 0.8734153263954588, "grad_norm": 0.2195724794762389, "learning_rate": 4.135187880375607e-05, "loss": 0.6065, "step": 2308 }, { "epoch": 0.8737937559129612, "grad_norm": 0.23560546696645587, "learning_rate": 4.1108078600276766e-05, "loss": 0.5905, "step": 2309 }, { "epoch": 0.8741721854304636, "grad_norm": 0.24252051868838306, "learning_rate": 4.086496840118514e-05, "loss": 0.6177, "step": 2310 }, { "epoch": 0.8745506149479659, "grad_norm": 0.2504553670990464, "learning_rate": 4.062254857202935e-05, "loss": 0.6229, "step": 2311 }, { "epoch": 0.8749290444654683, "grad_norm": 0.21900461909678978, "learning_rate": 4.0380819477319305e-05, "loss": 0.5865, "step": 2312 }, { "epoch": 0.8753074739829707, "grad_norm": 0.2155410893580799, "learning_rate": 4.013978148052677e-05, "loss": 0.5733, "step": 2313 }, { "epoch": 0.875685903500473, "grad_norm": 0.2146564447438446, "learning_rate": 3.989943494408388e-05, "loss": 0.5889, "step": 2314 }, { "epoch": 0.8760643330179754, "grad_norm": 0.20491162762051013, "learning_rate": 3.9659780229383435e-05, "loss": 0.6253, "step": 2315 }, { "epoch": 0.8764427625354778, "grad_norm": 0.23584521427713426, "learning_rate": 3.9420817696777746e-05, "loss": 0.6011, "step": 2316 }, { "epoch": 0.8768211920529801, "grad_norm": 0.23324647050131303, "learning_rate": 3.9182547705578494e-05, "loss": 0.6237, "step": 2317 }, { "epoch": 0.8771996215704825, "grad_norm": 0.20680116968195347, "learning_rate": 3.894497061405588e-05, "loss": 0.5754, "step": 2318 }, { "epoch": 0.8775780510879848, "grad_norm": 0.2155116709449672, "learning_rate": 3.870808677943838e-05, "loss": 0.5966, "step": 2319 }, { "epoch": 0.8779564806054873, "grad_norm": 0.20322668758061904, "learning_rate": 3.8471896557912e-05, "loss": 0.6021, "step": 2320 }, { "epoch": 0.8783349101229896, "grad_norm": 0.21500248680229056, "learning_rate": 3.823640030461983e-05, "loss": 0.6196, "step": 2321 }, { "epoch": 0.8787133396404919, "grad_norm": 0.20604825122778925, "learning_rate": 3.800159837366146e-05, "loss": 0.5592, "step": 2322 }, { "epoch": 0.8790917691579944, "grad_norm": 0.19850010372466637, "learning_rate": 3.7767491118092415e-05, "loss": 0.598, "step": 2323 }, { "epoch": 0.8794701986754967, "grad_norm": 0.20882741434908997, "learning_rate": 3.753407888992394e-05, "loss": 0.5819, "step": 2324 }, { "epoch": 0.879848628192999, "grad_norm": 0.2137790655150801, "learning_rate": 3.73013620401218e-05, "loss": 0.6224, "step": 2325 }, { "epoch": 0.8802270577105015, "grad_norm": 0.22362159301066642, "learning_rate": 3.706934091860664e-05, "loss": 0.6092, "step": 2326 }, { "epoch": 0.8806054872280038, "grad_norm": 0.23101736494434041, "learning_rate": 3.68380158742525e-05, "loss": 0.611, "step": 2327 }, { "epoch": 0.8809839167455061, "grad_norm": 0.20929219401935675, "learning_rate": 3.660738725488733e-05, "loss": 0.5968, "step": 2328 }, { "epoch": 0.8813623462630085, "grad_norm": 0.20961510320934298, "learning_rate": 3.637745540729126e-05, "loss": 0.5979, "step": 2329 }, { "epoch": 0.8817407757805109, "grad_norm": 0.20667180761709808, "learning_rate": 3.614822067719736e-05, "loss": 0.6066, "step": 2330 }, { "epoch": 0.8821192052980132, "grad_norm": 0.27221233552433, "learning_rate": 3.5919683409290136e-05, "loss": 0.5881, "step": 2331 }, { "epoch": 0.8824976348155156, "grad_norm": 0.21483858275701928, "learning_rate": 3.5691843947205314e-05, "loss": 0.6021, "step": 2332 }, { "epoch": 0.882876064333018, "grad_norm": 0.18982524310668789, "learning_rate": 3.546470263352963e-05, "loss": 0.6136, "step": 2333 }, { "epoch": 0.8832544938505204, "grad_norm": 0.20121942347096744, "learning_rate": 3.523825980979989e-05, "loss": 0.5747, "step": 2334 }, { "epoch": 0.8836329233680227, "grad_norm": 0.20812275893920734, "learning_rate": 3.501251581650272e-05, "loss": 0.6223, "step": 2335 }, { "epoch": 0.8840113528855251, "grad_norm": 0.2351771377082092, "learning_rate": 3.4787470993073885e-05, "loss": 0.5894, "step": 2336 }, { "epoch": 0.8843897824030275, "grad_norm": 0.21837459436104914, "learning_rate": 3.456312567789793e-05, "loss": 0.6065, "step": 2337 }, { "epoch": 0.8847682119205298, "grad_norm": 0.21548937111728222, "learning_rate": 3.433948020830746e-05, "loss": 0.6025, "step": 2338 }, { "epoch": 0.8851466414380321, "grad_norm": 0.22816687650272657, "learning_rate": 3.411653492058298e-05, "loss": 0.5686, "step": 2339 }, { "epoch": 0.8855250709555346, "grad_norm": 0.20712842999681885, "learning_rate": 3.389429014995199e-05, "loss": 0.5977, "step": 2340 }, { "epoch": 0.8859035004730369, "grad_norm": 0.2170917292970771, "learning_rate": 3.36727462305888e-05, "loss": 0.6113, "step": 2341 }, { "epoch": 0.8862819299905392, "grad_norm": 0.2293198364432735, "learning_rate": 3.345190349561384e-05, "loss": 0.5758, "step": 2342 }, { "epoch": 0.8866603595080417, "grad_norm": 0.2227362773363113, "learning_rate": 3.3231762277093126e-05, "loss": 0.6051, "step": 2343 }, { "epoch": 0.887038789025544, "grad_norm": 0.19835036009007456, "learning_rate": 3.301232290603812e-05, "loss": 0.5879, "step": 2344 }, { "epoch": 0.8874172185430463, "grad_norm": 0.22961758373392832, "learning_rate": 3.279358571240459e-05, "loss": 0.6045, "step": 2345 }, { "epoch": 0.8877956480605487, "grad_norm": 0.22874452261281844, "learning_rate": 3.257555102509291e-05, "loss": 0.6271, "step": 2346 }, { "epoch": 0.8881740775780511, "grad_norm": 0.21038088828758073, "learning_rate": 3.23582191719467e-05, "loss": 0.5991, "step": 2347 }, { "epoch": 0.8885525070955534, "grad_norm": 0.23285852015812636, "learning_rate": 3.214159047975324e-05, "loss": 0.6107, "step": 2348 }, { "epoch": 0.8889309366130558, "grad_norm": 0.23969282191608812, "learning_rate": 3.192566527424201e-05, "loss": 0.5998, "step": 2349 }, { "epoch": 0.8893093661305582, "grad_norm": 0.21491812372348523, "learning_rate": 3.171044388008515e-05, "loss": 0.5969, "step": 2350 }, { "epoch": 0.8896877956480606, "grad_norm": 0.22882201366657662, "learning_rate": 3.149592662089634e-05, "loss": 0.5989, "step": 2351 }, { "epoch": 0.8900662251655629, "grad_norm": 0.20532876094019745, "learning_rate": 3.1282113819230404e-05, "loss": 0.5961, "step": 2352 }, { "epoch": 0.8904446546830653, "grad_norm": 0.23065266302431234, "learning_rate": 3.106900579658311e-05, "loss": 0.5883, "step": 2353 }, { "epoch": 0.8908230842005677, "grad_norm": 0.20568976246236587, "learning_rate": 3.085660287339031e-05, "loss": 0.6081, "step": 2354 }, { "epoch": 0.89120151371807, "grad_norm": 0.2147986538114711, "learning_rate": 3.064490536902792e-05, "loss": 0.5884, "step": 2355 }, { "epoch": 0.8915799432355723, "grad_norm": 0.2094727451706633, "learning_rate": 3.043391360181086e-05, "loss": 0.5897, "step": 2356 }, { "epoch": 0.8919583727530748, "grad_norm": 0.23393494301982568, "learning_rate": 3.0223627888993076e-05, "loss": 0.6088, "step": 2357 }, { "epoch": 0.8923368022705771, "grad_norm": 0.24864572040527064, "learning_rate": 3.0014048546766702e-05, "loss": 0.6034, "step": 2358 }, { "epoch": 0.8927152317880794, "grad_norm": 0.21458848180119072, "learning_rate": 2.980517589026205e-05, "loss": 0.6027, "step": 2359 }, { "epoch": 0.8930936613055819, "grad_norm": 0.23478502329882964, "learning_rate": 2.959701023354644e-05, "loss": 0.6161, "step": 2360 }, { "epoch": 0.8934720908230842, "grad_norm": 0.24448049158637475, "learning_rate": 2.9389551889624498e-05, "loss": 0.6057, "step": 2361 }, { "epoch": 0.8938505203405865, "grad_norm": 0.20342992856399525, "learning_rate": 2.918280117043709e-05, "loss": 0.6155, "step": 2362 }, { "epoch": 0.894228949858089, "grad_norm": 0.2197754530026225, "learning_rate": 2.897675838686098e-05, "loss": 0.6278, "step": 2363 }, { "epoch": 0.8946073793755913, "grad_norm": 0.2575142609248984, "learning_rate": 2.8771423848708843e-05, "loss": 0.5879, "step": 2364 }, { "epoch": 0.8949858088930936, "grad_norm": 0.23088114421209358, "learning_rate": 2.8566797864727988e-05, "loss": 0.5863, "step": 2365 }, { "epoch": 0.895364238410596, "grad_norm": 0.21357449224983918, "learning_rate": 2.8362880742600684e-05, "loss": 0.5513, "step": 2366 }, { "epoch": 0.8957426679280984, "grad_norm": 0.20915884239987417, "learning_rate": 2.8159672788942937e-05, "loss": 0.6077, "step": 2367 }, { "epoch": 0.8961210974456008, "grad_norm": 0.21709039081430767, "learning_rate": 2.7957174309304834e-05, "loss": 0.6338, "step": 2368 }, { "epoch": 0.8964995269631031, "grad_norm": 0.21206044709380928, "learning_rate": 2.775538560816937e-05, "loss": 0.5869, "step": 2369 }, { "epoch": 0.8968779564806055, "grad_norm": 0.22877168573449355, "learning_rate": 2.7554306988952506e-05, "loss": 0.6002, "step": 2370 }, { "epoch": 0.8972563859981079, "grad_norm": 0.23965354422776458, "learning_rate": 2.735393875400227e-05, "loss": 0.5901, "step": 2371 }, { "epoch": 0.8976348155156102, "grad_norm": 0.2078469419978446, "learning_rate": 2.7154281204598897e-05, "loss": 0.5745, "step": 2372 }, { "epoch": 0.8980132450331125, "grad_norm": 0.23424722844471996, "learning_rate": 2.6955334640953567e-05, "loss": 0.5746, "step": 2373 }, { "epoch": 0.898391674550615, "grad_norm": 0.20819536448717854, "learning_rate": 2.6757099362208658e-05, "loss": 0.5868, "step": 2374 }, { "epoch": 0.8987701040681173, "grad_norm": 0.19845749761787357, "learning_rate": 2.6559575666437073e-05, "loss": 0.6098, "step": 2375 }, { "epoch": 0.8991485335856196, "grad_norm": 0.21340565110118517, "learning_rate": 2.636276385064157e-05, "loss": 0.6006, "step": 2376 }, { "epoch": 0.8995269631031221, "grad_norm": 0.20067620553851787, "learning_rate": 2.6166664210754753e-05, "loss": 0.589, "step": 2377 }, { "epoch": 0.8999053926206244, "grad_norm": 0.23353500042066538, "learning_rate": 2.5971277041638097e-05, "loss": 0.5751, "step": 2378 }, { "epoch": 0.9002838221381267, "grad_norm": 0.22005905426239938, "learning_rate": 2.5776602637082037e-05, "loss": 0.588, "step": 2379 }, { "epoch": 0.9006622516556292, "grad_norm": 0.2192617713901111, "learning_rate": 2.5582641289805032e-05, "loss": 0.5817, "step": 2380 }, { "epoch": 0.9010406811731315, "grad_norm": 0.22426555689941768, "learning_rate": 2.538939329145362e-05, "loss": 0.5848, "step": 2381 }, { "epoch": 0.9014191106906339, "grad_norm": 0.24281359315132714, "learning_rate": 2.5196858932601596e-05, "loss": 0.592, "step": 2382 }, { "epoch": 0.9017975402081362, "grad_norm": 0.20774100658538303, "learning_rate": 2.500503850274949e-05, "loss": 0.5981, "step": 2383 }, { "epoch": 0.9021759697256386, "grad_norm": 0.20370080836743845, "learning_rate": 2.481393229032486e-05, "loss": 0.6048, "step": 2384 }, { "epoch": 0.902554399243141, "grad_norm": 0.19899080362707763, "learning_rate": 2.4623540582680793e-05, "loss": 0.6048, "step": 2385 }, { "epoch": 0.9029328287606433, "grad_norm": 0.2061016155879237, "learning_rate": 2.4433863666096456e-05, "loss": 0.5878, "step": 2386 }, { "epoch": 0.9033112582781457, "grad_norm": 0.2233517809147624, "learning_rate": 2.4244901825775878e-05, "loss": 0.5959, "step": 2387 }, { "epoch": 0.9036896877956481, "grad_norm": 0.20788916485892603, "learning_rate": 2.405665534584822e-05, "loss": 0.6038, "step": 2388 }, { "epoch": 0.9040681173131504, "grad_norm": 0.22621606780452833, "learning_rate": 2.3869124509366736e-05, "loss": 0.6104, "step": 2389 }, { "epoch": 0.9044465468306528, "grad_norm": 0.2241218084851672, "learning_rate": 2.368230959830875e-05, "loss": 0.6012, "step": 2390 }, { "epoch": 0.9048249763481552, "grad_norm": 0.21266055142707227, "learning_rate": 2.349621089357501e-05, "loss": 0.5914, "step": 2391 }, { "epoch": 0.9052034058656575, "grad_norm": 0.2627498754397604, "learning_rate": 2.3310828674989514e-05, "loss": 0.6112, "step": 2392 }, { "epoch": 0.9055818353831598, "grad_norm": 0.20976098764077927, "learning_rate": 2.312616322129879e-05, "loss": 0.5999, "step": 2393 }, { "epoch": 0.9059602649006623, "grad_norm": 0.24537930399015048, "learning_rate": 2.2942214810171503e-05, "loss": 0.5993, "step": 2394 }, { "epoch": 0.9063386944181646, "grad_norm": 0.19832856311958577, "learning_rate": 2.275898371819851e-05, "loss": 0.6015, "step": 2395 }, { "epoch": 0.906717123935667, "grad_norm": 0.18089408664994294, "learning_rate": 2.2576470220891655e-05, "loss": 0.6015, "step": 2396 }, { "epoch": 0.9070955534531694, "grad_norm": 0.21478019120993158, "learning_rate": 2.2394674592684183e-05, "loss": 0.5885, "step": 2397 }, { "epoch": 0.9074739829706717, "grad_norm": 0.1907745661802712, "learning_rate": 2.2213597106929607e-05, "loss": 0.5986, "step": 2398 }, { "epoch": 0.907852412488174, "grad_norm": 0.2147610938855955, "learning_rate": 2.2033238035901903e-05, "loss": 0.6071, "step": 2399 }, { "epoch": 0.9082308420056765, "grad_norm": 0.22168942568727443, "learning_rate": 2.1853597650794477e-05, "loss": 0.6261, "step": 2400 }, { "epoch": 0.9086092715231788, "grad_norm": 0.19249791608648512, "learning_rate": 2.1674676221720478e-05, "loss": 0.5945, "step": 2401 }, { "epoch": 0.9089877010406812, "grad_norm": 0.22106653861556738, "learning_rate": 2.1496474017711654e-05, "loss": 0.5834, "step": 2402 }, { "epoch": 0.9093661305581835, "grad_norm": 0.21274826119358964, "learning_rate": 2.1318991306718605e-05, "loss": 0.5729, "step": 2403 }, { "epoch": 0.9097445600756859, "grad_norm": 0.21627258793329465, "learning_rate": 2.114222835560986e-05, "loss": 0.6427, "step": 2404 }, { "epoch": 0.9101229895931883, "grad_norm": 0.22844024784402822, "learning_rate": 2.096618543017176e-05, "loss": 0.6003, "step": 2405 }, { "epoch": 0.9105014191106906, "grad_norm": 0.25299237921417467, "learning_rate": 2.079086279510811e-05, "loss": 0.6172, "step": 2406 }, { "epoch": 0.910879848628193, "grad_norm": 0.23384347089735252, "learning_rate": 2.061626071403938e-05, "loss": 0.5974, "step": 2407 }, { "epoch": 0.9112582781456954, "grad_norm": 0.224464004576904, "learning_rate": 2.0442379449503002e-05, "loss": 0.6124, "step": 2408 }, { "epoch": 0.9116367076631977, "grad_norm": 0.2441259913827247, "learning_rate": 2.026921926295211e-05, "loss": 0.5867, "step": 2409 }, { "epoch": 0.9120151371807, "grad_norm": 0.21533564696444418, "learning_rate": 2.0096780414756045e-05, "loss": 0.5805, "step": 2410 }, { "epoch": 0.9123935666982025, "grad_norm": 0.2066159671717552, "learning_rate": 1.992506316419912e-05, "loss": 0.5961, "step": 2411 }, { "epoch": 0.9127719962157048, "grad_norm": 0.20900874129523056, "learning_rate": 1.9754067769480966e-05, "loss": 0.5589, "step": 2412 }, { "epoch": 0.9131504257332071, "grad_norm": 0.21440872886280027, "learning_rate": 1.9583794487715578e-05, "loss": 0.6089, "step": 2413 }, { "epoch": 0.9135288552507096, "grad_norm": 0.23146502897728177, "learning_rate": 1.941424357493121e-05, "loss": 0.5819, "step": 2414 }, { "epoch": 0.9139072847682119, "grad_norm": 0.2014562763088054, "learning_rate": 1.9245415286070045e-05, "loss": 0.6072, "step": 2415 }, { "epoch": 0.9142857142857143, "grad_norm": 0.21324376905266726, "learning_rate": 1.9077309874987568e-05, "loss": 0.5988, "step": 2416 }, { "epoch": 0.9146641438032167, "grad_norm": 0.2045884598933284, "learning_rate": 1.890992759445248e-05, "loss": 0.5933, "step": 2417 }, { "epoch": 0.915042573320719, "grad_norm": 0.20815939596124242, "learning_rate": 1.8743268696145954e-05, "loss": 0.5872, "step": 2418 }, { "epoch": 0.9154210028382214, "grad_norm": 0.2161360455024204, "learning_rate": 1.8577333430661647e-05, "loss": 0.568, "step": 2419 }, { "epoch": 0.9157994323557237, "grad_norm": 0.21719035963193006, "learning_rate": 1.8412122047505033e-05, "loss": 0.6083, "step": 2420 }, { "epoch": 0.9161778618732261, "grad_norm": 0.24941992517569603, "learning_rate": 1.8247634795093227e-05, "loss": 0.6018, "step": 2421 }, { "epoch": 0.9165562913907285, "grad_norm": 0.21583082423783026, "learning_rate": 1.808387192075428e-05, "loss": 0.5715, "step": 2422 }, { "epoch": 0.9169347209082308, "grad_norm": 0.2403728892143915, "learning_rate": 1.7920833670727444e-05, "loss": 0.6005, "step": 2423 }, { "epoch": 0.9173131504257332, "grad_norm": 0.22644725696728446, "learning_rate": 1.7758520290162118e-05, "loss": 0.6066, "step": 2424 }, { "epoch": 0.9176915799432356, "grad_norm": 0.22586383464188153, "learning_rate": 1.7596932023117686e-05, "loss": 0.568, "step": 2425 }, { "epoch": 0.9180700094607379, "grad_norm": 0.2114593532064739, "learning_rate": 1.743606911256357e-05, "loss": 0.6, "step": 2426 }, { "epoch": 0.9184484389782404, "grad_norm": 0.23636533901315768, "learning_rate": 1.7275931800378176e-05, "loss": 0.5934, "step": 2427 }, { "epoch": 0.9188268684957427, "grad_norm": 0.22716947344974564, "learning_rate": 1.7116520327349116e-05, "loss": 0.6019, "step": 2428 }, { "epoch": 0.919205298013245, "grad_norm": 0.21842088805353027, "learning_rate": 1.6957834933172432e-05, "loss": 0.6014, "step": 2429 }, { "epoch": 0.9195837275307474, "grad_norm": 0.21423777170961417, "learning_rate": 1.6799875856452597e-05, "loss": 0.5858, "step": 2430 }, { "epoch": 0.9199621570482498, "grad_norm": 0.20343311489670526, "learning_rate": 1.664264333470178e-05, "loss": 0.5971, "step": 2431 }, { "epoch": 0.9203405865657521, "grad_norm": 0.1937518904193822, "learning_rate": 1.6486137604339813e-05, "loss": 0.597, "step": 2432 }, { "epoch": 0.9207190160832545, "grad_norm": 0.21256884483484909, "learning_rate": 1.6330358900693678e-05, "loss": 0.6264, "step": 2433 }, { "epoch": 0.9210974456007569, "grad_norm": 0.23700531965434127, "learning_rate": 1.6175307457997057e-05, "loss": 0.6227, "step": 2434 }, { "epoch": 0.9214758751182592, "grad_norm": 0.19607805896003735, "learning_rate": 1.6020983509390352e-05, "loss": 0.608, "step": 2435 }, { "epoch": 0.9218543046357616, "grad_norm": 0.269256783763891, "learning_rate": 1.586738728691972e-05, "loss": 0.5943, "step": 2436 }, { "epoch": 0.9222327341532639, "grad_norm": 0.2612247216849205, "learning_rate": 1.571451902153748e-05, "loss": 0.6066, "step": 2437 }, { "epoch": 0.9226111636707663, "grad_norm": 0.22021680606889693, "learning_rate": 1.5562378943101085e-05, "loss": 0.6126, "step": 2438 }, { "epoch": 0.9229895931882687, "grad_norm": 0.22654700226230626, "learning_rate": 1.5410967280373223e-05, "loss": 0.5796, "step": 2439 }, { "epoch": 0.923368022705771, "grad_norm": 0.2365037722488365, "learning_rate": 1.526028426102116e-05, "loss": 0.5696, "step": 2440 }, { "epoch": 0.9237464522232735, "grad_norm": 0.1953056906466806, "learning_rate": 1.5110330111616778e-05, "loss": 0.601, "step": 2441 }, { "epoch": 0.9241248817407758, "grad_norm": 0.2231485045098091, "learning_rate": 1.4961105057635705e-05, "loss": 0.6229, "step": 2442 }, { "epoch": 0.9245033112582781, "grad_norm": 0.2351977292734817, "learning_rate": 1.4812609323457626e-05, "loss": 0.6306, "step": 2443 }, { "epoch": 0.9248817407757806, "grad_norm": 0.2219495265337334, "learning_rate": 1.4664843132365324e-05, "loss": 0.5659, "step": 2444 }, { "epoch": 0.9252601702932829, "grad_norm": 0.19556867964853844, "learning_rate": 1.4517806706544623e-05, "loss": 0.5812, "step": 2445 }, { "epoch": 0.9256385998107852, "grad_norm": 0.2492309902139012, "learning_rate": 1.4371500267084336e-05, "loss": 0.5962, "step": 2446 }, { "epoch": 0.9260170293282876, "grad_norm": 0.191178517228171, "learning_rate": 1.4225924033975269e-05, "loss": 0.6032, "step": 2447 }, { "epoch": 0.92639545884579, "grad_norm": 0.23378206591545983, "learning_rate": 1.4081078226110544e-05, "loss": 0.6181, "step": 2448 }, { "epoch": 0.9267738883632923, "grad_norm": 0.21092745085903436, "learning_rate": 1.3936963061284835e-05, "loss": 0.5911, "step": 2449 }, { "epoch": 0.9271523178807947, "grad_norm": 0.20596097107651026, "learning_rate": 1.3793578756194358e-05, "loss": 0.6088, "step": 2450 }, { "epoch": 0.9275307473982971, "grad_norm": 0.21145453488801566, "learning_rate": 1.3650925526436098e-05, "loss": 0.5761, "step": 2451 }, { "epoch": 0.9279091769157994, "grad_norm": 0.22288533188090184, "learning_rate": 1.3509003586508195e-05, "loss": 0.6013, "step": 2452 }, { "epoch": 0.9282876064333018, "grad_norm": 0.22631607959837713, "learning_rate": 1.3367813149808727e-05, "loss": 0.6098, "step": 2453 }, { "epoch": 0.9286660359508042, "grad_norm": 0.24380434909760032, "learning_rate": 1.3227354428636262e-05, "loss": 0.5929, "step": 2454 }, { "epoch": 0.9290444654683065, "grad_norm": 0.21593950182349828, "learning_rate": 1.3087627634188915e-05, "loss": 0.5756, "step": 2455 }, { "epoch": 0.9294228949858089, "grad_norm": 0.23498398024914396, "learning_rate": 1.294863297656429e-05, "loss": 0.6339, "step": 2456 }, { "epoch": 0.9298013245033112, "grad_norm": 0.23006005337633695, "learning_rate": 1.2810370664759153e-05, "loss": 0.5958, "step": 2457 }, { "epoch": 0.9301797540208137, "grad_norm": 0.21984795501960344, "learning_rate": 1.2672840906669092e-05, "loss": 0.5887, "step": 2458 }, { "epoch": 0.930558183538316, "grad_norm": 0.21367397028368187, "learning_rate": 1.2536043909088191e-05, "loss": 0.5827, "step": 2459 }, { "epoch": 0.9309366130558183, "grad_norm": 0.19807385593793114, "learning_rate": 1.2399979877708744e-05, "loss": 0.6136, "step": 2460 }, { "epoch": 0.9313150425733208, "grad_norm": 0.18956629332442546, "learning_rate": 1.2264649017120933e-05, "loss": 0.6085, "step": 2461 }, { "epoch": 0.9316934720908231, "grad_norm": 0.22531881659511716, "learning_rate": 1.2130051530812424e-05, "loss": 0.6, "step": 2462 }, { "epoch": 0.9320719016083254, "grad_norm": 0.21729482564949304, "learning_rate": 1.1996187621168386e-05, "loss": 0.5908, "step": 2463 }, { "epoch": 0.9324503311258279, "grad_norm": 0.2415379535323731, "learning_rate": 1.1863057489470808e-05, "loss": 0.6025, "step": 2464 }, { "epoch": 0.9328287606433302, "grad_norm": 0.20840487841133204, "learning_rate": 1.173066133589823e-05, "loss": 0.6084, "step": 2465 }, { "epoch": 0.9332071901608325, "grad_norm": 0.22631514957460372, "learning_rate": 1.1598999359525797e-05, "loss": 0.589, "step": 2466 }, { "epoch": 0.9335856196783349, "grad_norm": 0.2042323610546342, "learning_rate": 1.1468071758324595e-05, "loss": 0.5952, "step": 2467 }, { "epoch": 0.9339640491958373, "grad_norm": 0.22530606896882252, "learning_rate": 1.1337878729161533e-05, "loss": 0.5946, "step": 2468 }, { "epoch": 0.9343424787133396, "grad_norm": 0.20616447091404563, "learning_rate": 1.1208420467798852e-05, "loss": 0.6137, "step": 2469 }, { "epoch": 0.934720908230842, "grad_norm": 0.2374743494813625, "learning_rate": 1.1079697168894231e-05, "loss": 0.5765, "step": 2470 }, { "epoch": 0.9350993377483444, "grad_norm": 0.2026252007945276, "learning_rate": 1.0951709025999956e-05, "loss": 0.5925, "step": 2471 }, { "epoch": 0.9354777672658467, "grad_norm": 0.21086730495493564, "learning_rate": 1.082445623156314e-05, "loss": 0.6042, "step": 2472 }, { "epoch": 0.9358561967833491, "grad_norm": 0.20827041311610545, "learning_rate": 1.0697938976925059e-05, "loss": 0.5991, "step": 2473 }, { "epoch": 0.9362346263008514, "grad_norm": 0.242152365320198, "learning_rate": 1.0572157452321095e-05, "loss": 0.5878, "step": 2474 }, { "epoch": 0.9366130558183539, "grad_norm": 0.22435262351972912, "learning_rate": 1.0447111846880241e-05, "loss": 0.6076, "step": 2475 }, { "epoch": 0.9369914853358562, "grad_norm": 0.23219993958451884, "learning_rate": 1.0322802348625038e-05, "loss": 0.6018, "step": 2476 }, { "epoch": 0.9373699148533585, "grad_norm": 0.2168435742368864, "learning_rate": 1.0199229144471245e-05, "loss": 0.5871, "step": 2477 }, { "epoch": 0.937748344370861, "grad_norm": 0.2087955423889363, "learning_rate": 1.0076392420227343e-05, "loss": 0.5849, "step": 2478 }, { "epoch": 0.9381267738883633, "grad_norm": 0.209756743031714, "learning_rate": 9.954292360594642e-06, "loss": 0.6232, "step": 2479 }, { "epoch": 0.9385052034058656, "grad_norm": 0.22756983995007143, "learning_rate": 9.832929149166503e-06, "loss": 0.6453, "step": 2480 }, { "epoch": 0.9388836329233681, "grad_norm": 0.2373367178065389, "learning_rate": 9.712302968428566e-06, "loss": 0.5958, "step": 2481 }, { "epoch": 0.9392620624408704, "grad_norm": 0.1885742863122077, "learning_rate": 9.592413999758075e-06, "loss": 0.585, "step": 2482 }, { "epoch": 0.9396404919583727, "grad_norm": 0.23479539996589022, "learning_rate": 9.473262423424e-06, "loss": 0.6001, "step": 2483 }, { "epoch": 0.9400189214758751, "grad_norm": 0.20937119096166457, "learning_rate": 9.35484841858636e-06, "loss": 0.5954, "step": 2484 }, { "epoch": 0.9403973509933775, "grad_norm": 0.22058142623490085, "learning_rate": 9.23717216329617e-06, "loss": 0.5985, "step": 2485 }, { "epoch": 0.9407757805108798, "grad_norm": 0.22092468820787103, "learning_rate": 9.120233834495228e-06, "loss": 0.5893, "step": 2486 }, { "epoch": 0.9411542100283822, "grad_norm": 0.2609651162615478, "learning_rate": 9.004033608015605e-06, "loss": 0.5779, "step": 2487 }, { "epoch": 0.9415326395458846, "grad_norm": 0.2176674212719668, "learning_rate": 8.888571658579703e-06, "loss": 0.5849, "step": 2488 }, { "epoch": 0.941911069063387, "grad_norm": 0.19188520297219483, "learning_rate": 8.773848159799647e-06, "loss": 0.5738, "step": 2489 }, { "epoch": 0.9422894985808893, "grad_norm": 0.22151330762265706, "learning_rate": 8.659863284177504e-06, "loss": 0.5705, "step": 2490 }, { "epoch": 0.9426679280983917, "grad_norm": 0.22177987208535654, "learning_rate": 8.546617203104401e-06, "loss": 0.5861, "step": 2491 }, { "epoch": 0.9430463576158941, "grad_norm": 0.20284171708057538, "learning_rate": 8.434110086860735e-06, "loss": 0.5815, "step": 2492 }, { "epoch": 0.9434247871333964, "grad_norm": 0.22057029642998086, "learning_rate": 8.322342104615743e-06, "loss": 0.6063, "step": 2493 }, { "epoch": 0.9438032166508987, "grad_norm": 0.20289360217530958, "learning_rate": 8.21131342442738e-06, "loss": 0.5876, "step": 2494 }, { "epoch": 0.9441816461684012, "grad_norm": 0.2256041515312879, "learning_rate": 8.101024213241825e-06, "loss": 0.5845, "step": 2495 }, { "epoch": 0.9445600756859035, "grad_norm": 0.20123175313221836, "learning_rate": 7.99147463689337e-06, "loss": 0.6083, "step": 2496 }, { "epoch": 0.9449385052034058, "grad_norm": 0.2134837890127801, "learning_rate": 7.882664860104305e-06, "loss": 0.5859, "step": 2497 }, { "epoch": 0.9453169347209083, "grad_norm": 0.2690511380320954, "learning_rate": 7.774595046484367e-06, "loss": 0.6245, "step": 2498 }, { "epoch": 0.9456953642384106, "grad_norm": 0.18876713506023407, "learning_rate": 7.66726535853085e-06, "loss": 0.6013, "step": 2499 }, { "epoch": 0.9460737937559129, "grad_norm": 0.22708447934366063, "learning_rate": 7.560675957627938e-06, "loss": 0.6026, "step": 2500 }, { "epoch": 0.9464522232734153, "grad_norm": 0.2510160422536859, "learning_rate": 7.454827004046872e-06, "loss": 0.6054, "step": 2501 }, { "epoch": 0.9468306527909177, "grad_norm": 0.21297719052913672, "learning_rate": 7.349718656945503e-06, "loss": 0.6003, "step": 2502 }, { "epoch": 0.94720908230842, "grad_norm": 0.19453621134118848, "learning_rate": 7.2453510743680226e-06, "loss": 0.5881, "step": 2503 }, { "epoch": 0.9475875118259224, "grad_norm": 0.24308000973267935, "learning_rate": 7.1417244132448985e-06, "loss": 0.5987, "step": 2504 }, { "epoch": 0.9479659413434248, "grad_norm": 0.2133286545719734, "learning_rate": 7.03883882939238e-06, "loss": 0.6003, "step": 2505 }, { "epoch": 0.9483443708609272, "grad_norm": 0.22388159560919693, "learning_rate": 6.936694477512495e-06, "loss": 0.588, "step": 2506 }, { "epoch": 0.9487228003784295, "grad_norm": 0.25005140372143364, "learning_rate": 6.835291511192665e-06, "loss": 0.592, "step": 2507 }, { "epoch": 0.9491012298959319, "grad_norm": 0.2293230123098779, "learning_rate": 6.734630082905757e-06, "loss": 0.6053, "step": 2508 }, { "epoch": 0.9494796594134343, "grad_norm": 0.21229580366678952, "learning_rate": 6.6347103440092534e-06, "loss": 0.608, "step": 2509 }, { "epoch": 0.9498580889309366, "grad_norm": 0.23867216250687934, "learning_rate": 6.5355324447458615e-06, "loss": 0.5953, "step": 2510 }, { "epoch": 0.9502365184484389, "grad_norm": 0.1947393591070878, "learning_rate": 6.437096534242404e-06, "loss": 0.6179, "step": 2511 }, { "epoch": 0.9506149479659414, "grad_norm": 0.1921958789704671, "learning_rate": 6.339402760510371e-06, "loss": 0.6165, "step": 2512 }, { "epoch": 0.9509933774834437, "grad_norm": 0.21373441572430865, "learning_rate": 6.2424512704450375e-06, "loss": 0.5919, "step": 2513 }, { "epoch": 0.951371807000946, "grad_norm": 0.20725084750355363, "learning_rate": 6.146242209825959e-06, "loss": 0.5831, "step": 2514 }, { "epoch": 0.9517502365184485, "grad_norm": 0.24815195215891994, "learning_rate": 6.050775723315915e-06, "loss": 0.6, "step": 2515 }, { "epoch": 0.9521286660359508, "grad_norm": 0.21740827179988947, "learning_rate": 5.9560519544614725e-06, "loss": 0.5873, "step": 2516 }, { "epoch": 0.9525070955534531, "grad_norm": 0.19513484504540096, "learning_rate": 5.862071045692141e-06, "loss": 0.6097, "step": 2517 }, { "epoch": 0.9528855250709556, "grad_norm": 0.23643522751855808, "learning_rate": 5.7688331383206616e-06, "loss": 0.5818, "step": 2518 }, { "epoch": 0.9532639545884579, "grad_norm": 0.22319470932810342, "learning_rate": 5.6763383725425e-06, "loss": 0.6167, "step": 2519 }, { "epoch": 0.9536423841059603, "grad_norm": 0.21737559290273664, "learning_rate": 5.5845868874357386e-06, "loss": 0.6186, "step": 2520 }, { "epoch": 0.9540208136234626, "grad_norm": 0.21712418497807767, "learning_rate": 5.493578820960743e-06, "loss": 0.5907, "step": 2521 }, { "epoch": 0.954399243140965, "grad_norm": 0.20203641653277238, "learning_rate": 5.4033143099601615e-06, "loss": 0.5768, "step": 2522 }, { "epoch": 0.9547776726584674, "grad_norm": 0.19587183017636778, "learning_rate": 5.313793490158536e-06, "loss": 0.6332, "step": 2523 }, { "epoch": 0.9551561021759697, "grad_norm": 0.21509752024680306, "learning_rate": 5.225016496162194e-06, "loss": 0.5993, "step": 2524 }, { "epoch": 0.9555345316934721, "grad_norm": 0.20016953837396417, "learning_rate": 5.136983461459077e-06, "loss": 0.5693, "step": 2525 }, { "epoch": 0.9559129612109745, "grad_norm": 0.2419281078039076, "learning_rate": 5.049694518418357e-06, "loss": 0.5723, "step": 2526 }, { "epoch": 0.9562913907284768, "grad_norm": 0.22428915167957605, "learning_rate": 4.9631497982905446e-06, "loss": 0.6115, "step": 2527 }, { "epoch": 0.9566698202459792, "grad_norm": 0.20911257367971658, "learning_rate": 4.877349431206879e-06, "loss": 0.5932, "step": 2528 }, { "epoch": 0.9570482497634816, "grad_norm": 0.19927817748561888, "learning_rate": 4.79229354617966e-06, "loss": 0.6069, "step": 2529 }, { "epoch": 0.9574266792809839, "grad_norm": 0.2275381865914312, "learning_rate": 4.70798227110153e-06, "loss": 0.6091, "step": 2530 }, { "epoch": 0.9578051087984862, "grad_norm": 0.2093765287201144, "learning_rate": 4.624415732745524e-06, "loss": 0.5622, "step": 2531 }, { "epoch": 0.9581835383159887, "grad_norm": 0.21227187611147205, "learning_rate": 4.541594056765075e-06, "loss": 0.5926, "step": 2532 }, { "epoch": 0.958561967833491, "grad_norm": 0.20373182253698185, "learning_rate": 4.459517367693289e-06, "loss": 0.6049, "step": 2533 }, { "epoch": 0.9589403973509933, "grad_norm": 0.22629070421647643, "learning_rate": 4.378185788943445e-06, "loss": 0.5733, "step": 2534 }, { "epoch": 0.9593188268684958, "grad_norm": 0.21913603371555077, "learning_rate": 4.297599442808109e-06, "loss": 0.6342, "step": 2535 }, { "epoch": 0.9596972563859981, "grad_norm": 0.20762410838666592, "learning_rate": 4.217758450459574e-06, "loss": 0.5961, "step": 2536 }, { "epoch": 0.9600756859035005, "grad_norm": 0.23666965617611002, "learning_rate": 4.138662931949256e-06, "loss": 0.5957, "step": 2537 }, { "epoch": 0.9604541154210028, "grad_norm": 0.20270448034302835, "learning_rate": 4.060313006207683e-06, "loss": 0.6009, "step": 2538 }, { "epoch": 0.9608325449385052, "grad_norm": 0.20267035882994575, "learning_rate": 3.982708791044176e-06, "loss": 0.5738, "step": 2539 }, { "epoch": 0.9612109744560076, "grad_norm": 0.20426058620160717, "learning_rate": 3.90585040314706e-06, "loss": 0.6016, "step": 2540 }, { "epoch": 0.9615894039735099, "grad_norm": 0.2280596397776704, "learning_rate": 3.8297379580828904e-06, "loss": 0.5931, "step": 2541 }, { "epoch": 0.9619678334910123, "grad_norm": 0.19998469554935935, "learning_rate": 3.7543715702967885e-06, "loss": 0.5785, "step": 2542 }, { "epoch": 0.9623462630085147, "grad_norm": 0.21063045831393817, "learning_rate": 3.6797513531120484e-06, "loss": 0.5862, "step": 2543 }, { "epoch": 0.962724692526017, "grad_norm": 0.23427430214920944, "learning_rate": 3.605877418729975e-06, "loss": 0.6102, "step": 2544 }, { "epoch": 0.9631031220435194, "grad_norm": 0.20481848857041487, "learning_rate": 3.5327498782297687e-06, "loss": 0.6179, "step": 2545 }, { "epoch": 0.9634815515610218, "grad_norm": 0.22890460498726486, "learning_rate": 3.4603688415683065e-06, "loss": 0.5943, "step": 2546 }, { "epoch": 0.9638599810785241, "grad_norm": 0.22714646486982185, "learning_rate": 3.388734417579975e-06, "loss": 0.6063, "step": 2547 }, { "epoch": 0.9642384105960264, "grad_norm": 0.22352819202752047, "learning_rate": 3.317846713976502e-06, "loss": 0.5917, "step": 2548 }, { "epoch": 0.9646168401135289, "grad_norm": 0.22827709007519986, "learning_rate": 3.2477058373470147e-06, "loss": 0.5895, "step": 2549 }, { "epoch": 0.9649952696310312, "grad_norm": 0.22575722008836746, "learning_rate": 3.1783118931574816e-06, "loss": 0.6163, "step": 2550 }, { "epoch": 0.9653736991485335, "grad_norm": 0.20974887518188995, "learning_rate": 3.1096649857508265e-06, "loss": 0.6098, "step": 2551 }, { "epoch": 0.965752128666036, "grad_norm": 0.20971912387183056, "learning_rate": 3.041765218346704e-06, "loss": 0.5974, "step": 2552 }, { "epoch": 0.9661305581835383, "grad_norm": 0.21062399384289404, "learning_rate": 2.9746126930412785e-06, "loss": 0.6144, "step": 2553 }, { "epoch": 0.9665089877010407, "grad_norm": 0.209406986924665, "learning_rate": 2.9082075108073925e-06, "loss": 0.6215, "step": 2554 }, { "epoch": 0.9668874172185431, "grad_norm": 0.2274461974199915, "learning_rate": 2.842549771493785e-06, "loss": 0.6051, "step": 2555 }, { "epoch": 0.9672658467360454, "grad_norm": 0.22115867995797206, "learning_rate": 2.7776395738256523e-06, "loss": 0.5662, "step": 2556 }, { "epoch": 0.9676442762535478, "grad_norm": 0.1883146549206641, "learning_rate": 2.71347701540392e-06, "loss": 0.5993, "step": 2557 }, { "epoch": 0.9680227057710501, "grad_norm": 0.20261453713955574, "learning_rate": 2.650062192705471e-06, "loss": 0.5941, "step": 2558 }, { "epoch": 0.9684011352885525, "grad_norm": 0.2171797421668458, "learning_rate": 2.587395201082865e-06, "loss": 0.5889, "step": 2559 }, { "epoch": 0.9687795648060549, "grad_norm": 0.18707893697405842, "learning_rate": 2.5254761347641154e-06, "loss": 0.5975, "step": 2560 }, { "epoch": 0.9691579943235572, "grad_norm": 0.2062021785262565, "learning_rate": 2.4643050868527473e-06, "loss": 0.6305, "step": 2561 }, { "epoch": 0.9695364238410596, "grad_norm": 0.1959451252763148, "learning_rate": 2.403882149327408e-06, "loss": 0.5705, "step": 2562 }, { "epoch": 0.969914853358562, "grad_norm": 0.2168142466415644, "learning_rate": 2.3442074130419764e-06, "loss": 0.6167, "step": 2563 }, { "epoch": 0.9702932828760643, "grad_norm": 0.2492427010223156, "learning_rate": 2.285280967725234e-06, "loss": 0.595, "step": 2564 }, { "epoch": 0.9706717123935666, "grad_norm": 0.216221449543894, "learning_rate": 2.227102901980971e-06, "loss": 0.617, "step": 2565 }, { "epoch": 0.9710501419110691, "grad_norm": 0.19082316315472925, "learning_rate": 2.1696733032873784e-06, "loss": 0.616, "step": 2566 }, { "epoch": 0.9714285714285714, "grad_norm": 0.2182808511357633, "learning_rate": 2.112992257997548e-06, "loss": 0.5786, "step": 2567 }, { "epoch": 0.9718070009460738, "grad_norm": 0.2032678358679273, "learning_rate": 2.0570598513388605e-06, "loss": 0.5845, "step": 2568 }, { "epoch": 0.9721854304635762, "grad_norm": 0.1994463051071908, "learning_rate": 2.0018761674130415e-06, "loss": 0.5641, "step": 2569 }, { "epoch": 0.9725638599810785, "grad_norm": 0.21490355439552783, "learning_rate": 1.9474412891959394e-06, "loss": 0.5907, "step": 2570 }, { "epoch": 0.9729422894985809, "grad_norm": 0.23256661212015717, "learning_rate": 1.8937552985377472e-06, "loss": 0.6148, "step": 2571 }, { "epoch": 0.9733207190160833, "grad_norm": 0.224300304560682, "learning_rate": 1.8408182761622261e-06, "loss": 0.6308, "step": 2572 }, { "epoch": 0.9736991485335856, "grad_norm": 0.21976879137933425, "learning_rate": 1.78863030166726e-06, "loss": 0.5832, "step": 2573 }, { "epoch": 0.974077578051088, "grad_norm": 0.2310785534384182, "learning_rate": 1.737191453524356e-06, "loss": 0.6178, "step": 2574 }, { "epoch": 0.9744560075685903, "grad_norm": 0.19514673828328397, "learning_rate": 1.6865018090784778e-06, "loss": 0.6087, "step": 2575 }, { "epoch": 0.9748344370860927, "grad_norm": 0.2402431827574295, "learning_rate": 1.636561444548268e-06, "loss": 0.6062, "step": 2576 }, { "epoch": 0.9752128666035951, "grad_norm": 0.22155053834141655, "learning_rate": 1.5873704350256035e-06, "loss": 0.6276, "step": 2577 }, { "epoch": 0.9755912961210974, "grad_norm": 0.23456525233366912, "learning_rate": 1.5389288544756518e-06, "loss": 0.5996, "step": 2578 }, { "epoch": 0.9759697256385998, "grad_norm": 0.21946176467787964, "learning_rate": 1.4912367757366486e-06, "loss": 0.5886, "step": 2579 }, { "epoch": 0.9763481551561022, "grad_norm": 0.21171059927084046, "learning_rate": 1.444294270520008e-06, "loss": 0.596, "step": 2580 }, { "epoch": 0.9767265846736045, "grad_norm": 0.22449680492882343, "learning_rate": 1.3981014094099353e-06, "loss": 0.5748, "step": 2581 }, { "epoch": 0.977105014191107, "grad_norm": 0.22909292837330456, "learning_rate": 1.3526582618634265e-06, "loss": 0.62, "step": 2582 }, { "epoch": 0.9774834437086093, "grad_norm": 0.2241639156612952, "learning_rate": 1.3079648962102674e-06, "loss": 0.5819, "step": 2583 }, { "epoch": 0.9778618732261116, "grad_norm": 0.21607171170386147, "learning_rate": 1.2640213796528132e-06, "loss": 0.5804, "step": 2584 }, { "epoch": 0.978240302743614, "grad_norm": 0.19716860963321425, "learning_rate": 1.2208277782659317e-06, "loss": 0.5948, "step": 2585 }, { "epoch": 0.9786187322611164, "grad_norm": 0.18874713273428464, "learning_rate": 1.1783841569968368e-06, "loss": 0.5943, "step": 2586 }, { "epoch": 0.9789971617786187, "grad_norm": 0.22842540753317667, "learning_rate": 1.136690579665145e-06, "loss": 0.6393, "step": 2587 }, { "epoch": 0.9793755912961211, "grad_norm": 0.2161652238413361, "learning_rate": 1.0957471089626525e-06, "loss": 0.6114, "step": 2588 }, { "epoch": 0.9797540208136235, "grad_norm": 0.21406040434263998, "learning_rate": 1.0555538064532245e-06, "loss": 0.5927, "step": 2589 }, { "epoch": 0.9801324503311258, "grad_norm": 0.19091320763624586, "learning_rate": 1.0161107325727392e-06, "loss": 0.5798, "step": 2590 }, { "epoch": 0.9805108798486282, "grad_norm": 0.20186742886497436, "learning_rate": 9.774179466291443e-07, "loss": 0.6054, "step": 2591 }, { "epoch": 0.9808893093661306, "grad_norm": 0.21593577545736065, "learning_rate": 9.394755068020677e-07, "loss": 0.6228, "step": 2592 }, { "epoch": 0.981267738883633, "grad_norm": 0.2536265813713352, "learning_rate": 9.022834701429838e-07, "loss": 0.5953, "step": 2593 }, { "epoch": 0.9816461684011353, "grad_norm": 0.20407012989448872, "learning_rate": 8.658418925749923e-07, "loss": 0.5881, "step": 2594 }, { "epoch": 0.9820245979186376, "grad_norm": 0.21788085546301603, "learning_rate": 8.301508288928172e-07, "loss": 0.5967, "step": 2595 }, { "epoch": 0.98240302743614, "grad_norm": 0.21894271059557516, "learning_rate": 7.95210332762697e-07, "loss": 0.6101, "step": 2596 }, { "epoch": 0.9827814569536424, "grad_norm": 0.2462650490049688, "learning_rate": 7.610204567222168e-07, "loss": 0.5998, "step": 2597 }, { "epoch": 0.9831598864711447, "grad_norm": 0.2009768234859214, "learning_rate": 7.275812521803649e-07, "loss": 0.6092, "step": 2598 }, { "epoch": 0.9835383159886472, "grad_norm": 0.2039008088683052, "learning_rate": 6.948927694174212e-07, "loss": 0.6077, "step": 2599 }, { "epoch": 0.9839167455061495, "grad_norm": 0.21763466146716004, "learning_rate": 6.629550575847354e-07, "loss": 0.5866, "step": 2600 }, { "epoch": 0.9842951750236518, "grad_norm": 0.1805423532808625, "learning_rate": 6.31768164704949e-07, "loss": 0.5879, "step": 2601 }, { "epoch": 0.9846736045411542, "grad_norm": 0.2321296264996747, "learning_rate": 6.013321376716618e-07, "loss": 0.5845, "step": 2602 }, { "epoch": 0.9850520340586566, "grad_norm": 0.20479051312819913, "learning_rate": 5.716470222493775e-07, "loss": 0.5916, "step": 2603 }, { "epoch": 0.9854304635761589, "grad_norm": 0.2160240006159615, "learning_rate": 5.427128630736134e-07, "loss": 0.6092, "step": 2604 }, { "epoch": 0.9858088930936613, "grad_norm": 0.24048379805131762, "learning_rate": 5.14529703650679e-07, "loss": 0.5892, "step": 2605 }, { "epoch": 0.9861873226111637, "grad_norm": 0.2096785131822611, "learning_rate": 4.87097586357621e-07, "loss": 0.6064, "step": 2606 }, { "epoch": 0.986565752128666, "grad_norm": 0.21294566851534819, "learning_rate": 4.604165524423332e-07, "loss": 0.5997, "step": 2607 }, { "epoch": 0.9869441816461684, "grad_norm": 0.23675410444172076, "learning_rate": 4.344866420231686e-07, "loss": 0.6047, "step": 2608 }, { "epoch": 0.9873226111636708, "grad_norm": 0.2127816682627401, "learning_rate": 4.0930789408921696e-07, "loss": 0.61, "step": 2609 }, { "epoch": 0.9877010406811731, "grad_norm": 0.21344441820161647, "learning_rate": 3.8488034650002724e-07, "loss": 0.6138, "step": 2610 }, { "epoch": 0.9880794701986755, "grad_norm": 0.22883928640956802, "learning_rate": 3.612040359856072e-07, "loss": 0.6008, "step": 2611 }, { "epoch": 0.9884578997161778, "grad_norm": 0.2172979201093313, "learning_rate": 3.3827899814642405e-07, "loss": 0.59, "step": 2612 }, { "epoch": 0.9888363292336803, "grad_norm": 0.2121121935016109, "learning_rate": 3.161052674532927e-07, "loss": 0.6083, "step": 2613 }, { "epoch": 0.9892147587511826, "grad_norm": 0.21726619337680383, "learning_rate": 2.946828772473764e-07, "loss": 0.6083, "step": 2614 }, { "epoch": 0.9895931882686849, "grad_norm": 0.19920124650243476, "learning_rate": 2.7401185973996435e-07, "loss": 0.5995, "step": 2615 }, { "epoch": 0.9899716177861874, "grad_norm": 0.22814196823975608, "learning_rate": 2.5409224601269376e-07, "loss": 0.5718, "step": 2616 }, { "epoch": 0.9903500473036897, "grad_norm": 0.20831283627450595, "learning_rate": 2.3492406601732796e-07, "loss": 0.5732, "step": 2617 }, { "epoch": 0.990728476821192, "grad_norm": 0.23161736061130125, "learning_rate": 2.1650734857575626e-07, "loss": 0.5794, "step": 2618 }, { "epoch": 0.9911069063386945, "grad_norm": 0.22130203848736826, "learning_rate": 1.98842121379883e-07, "loss": 0.5899, "step": 2619 }, { "epoch": 0.9914853358561968, "grad_norm": 0.24355890779731493, "learning_rate": 1.8192841099179402e-07, "loss": 0.5848, "step": 2620 }, { "epoch": 0.9918637653736991, "grad_norm": 0.21205438635246465, "learning_rate": 1.6576624284347918e-07, "loss": 0.6012, "step": 2621 }, { "epoch": 0.9922421948912015, "grad_norm": 0.20182907751380888, "learning_rate": 1.503556412367768e-07, "loss": 0.5993, "step": 2622 }, { "epoch": 0.9926206244087039, "grad_norm": 0.2027618297737031, "learning_rate": 1.3569662934359572e-07, "loss": 0.6151, "step": 2623 }, { "epoch": 0.9929990539262062, "grad_norm": 0.2196426909697521, "learning_rate": 1.2178922920574874e-07, "loss": 0.5865, "step": 2624 }, { "epoch": 0.9933774834437086, "grad_norm": 0.21688815077010795, "learning_rate": 1.0863346173478616e-07, "loss": 0.5943, "step": 2625 }, { "epoch": 0.993755912961211, "grad_norm": 0.22390354210759447, "learning_rate": 9.622934671216221e-08, "loss": 0.612, "step": 2626 }, { "epoch": 0.9941343424787134, "grad_norm": 0.19565321767541263, "learning_rate": 8.457690278912412e-08, "loss": 0.6188, "step": 2627 }, { "epoch": 0.9945127719962157, "grad_norm": 0.19991817365099313, "learning_rate": 7.36761474865455e-08, "loss": 0.6005, "step": 2628 }, { "epoch": 0.994891201513718, "grad_norm": 0.20783951268273787, "learning_rate": 6.352709719525951e-08, "loss": 0.6136, "step": 2629 }, { "epoch": 0.9952696310312205, "grad_norm": 0.20931910508909266, "learning_rate": 5.4129767175614683e-08, "loss": 0.5919, "step": 2630 }, { "epoch": 0.9956480605487228, "grad_norm": 0.211243478177764, "learning_rate": 4.548417155780804e-08, "loss": 0.6034, "step": 2631 }, { "epoch": 0.9960264900662251, "grad_norm": 0.22934565634480308, "learning_rate": 3.75903233416075e-08, "loss": 0.6225, "step": 2632 }, { "epoch": 0.9964049195837276, "grad_norm": 0.23275005988172504, "learning_rate": 3.0448234396407427e-08, "loss": 0.6151, "step": 2633 }, { "epoch": 0.9967833491012299, "grad_norm": 0.17729751639646413, "learning_rate": 2.405791546133962e-08, "loss": 0.5842, "step": 2634 }, { "epoch": 0.9971617786187322, "grad_norm": 0.20584028397985493, "learning_rate": 1.841937614505129e-08, "loss": 0.5742, "step": 2635 }, { "epoch": 0.9975402081362347, "grad_norm": 0.23116417400480282, "learning_rate": 1.3532624925871595e-08, "loss": 0.6062, "step": 2636 }, { "epoch": 0.997918637653737, "grad_norm": 0.2183787311716677, "learning_rate": 9.397669151589572e-09, "loss": 0.6291, "step": 2637 }, { "epoch": 0.9982970671712393, "grad_norm": 0.21145180416519152, "learning_rate": 6.014515039731716e-09, "loss": 0.5829, "step": 2638 }, { "epoch": 0.9986754966887417, "grad_norm": 0.19424179327135807, "learning_rate": 3.3831676772844157e-09, "loss": 0.6155, "step": 2639 }, { "epoch": 0.9990539262062441, "grad_norm": 0.20134817194578902, "learning_rate": 1.5036310208604853e-09, "loss": 0.5997, "step": 2640 }, { "epoch": 0.9994323557237464, "grad_norm": 0.21712281238033634, "learning_rate": 3.759078965326346e-10, "loss": 0.5901, "step": 2641 }, { "epoch": 0.9998107852412488, "grad_norm": 0.2399654511454606, "learning_rate": 0.0, "loss": 0.6081, "step": 2642 }, { "epoch": 0.9998107852412488, "step": 2642, "total_flos": 9.858591576383554e+18, "train_loss": 0.7059136313484259, "train_runtime": 23508.4158, "train_samples_per_second": 28.773, "train_steps_per_second": 0.112 } ], "logging_steps": 1.0, "max_steps": 2642, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.858591576383554e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }