{ "best_metric": 0.038489215075969696, "best_model_checkpoint": "./lora-out/checkpoint-8000", "epoch": 1.0287624844614, "eval_steps": 500, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000128595310557675, "grad_norm": 19.25, "learning_rate": 1e-05, "loss": 13.2598, "step": 1 }, { "epoch": 0.000128595310557675, "eval_loss": 13.362639427185059, "eval_runtime": 1028.0865, "eval_samples_per_second": 95.543, "eval_steps_per_second": 1.194, "step": 1 }, { "epoch": 0.00025719062111535, "grad_norm": 19.375, "learning_rate": 2e-05, "loss": 13.4104, "step": 2 }, { "epoch": 0.000385785931673025, "grad_norm": 19.625, "learning_rate": 3e-05, "loss": 13.458, "step": 3 }, { "epoch": 0.0005143812422307, "grad_norm": 19.0, "learning_rate": 4e-05, "loss": 13.273, "step": 4 }, { "epoch": 0.0006429765527883749, "grad_norm": 18.875, "learning_rate": 5e-05, "loss": 13.0104, "step": 5 }, { "epoch": 0.00077157186334605, "grad_norm": 18.625, "learning_rate": 6e-05, "loss": 12.7312, "step": 6 }, { "epoch": 0.000900167173903725, "grad_norm": 18.125, "learning_rate": 7e-05, "loss": 12.4207, "step": 7 }, { "epoch": 0.0010287624844614, "grad_norm": 20.0, "learning_rate": 8e-05, "loss": 11.8262, "step": 8 }, { "epoch": 0.001157357795019075, "grad_norm": 18.125, "learning_rate": 9e-05, "loss": 11.349, "step": 9 }, { "epoch": 0.0012859531055767499, "grad_norm": 18.625, "learning_rate": 0.0001, "loss": 10.6807, "step": 10 }, { "epoch": 0.001414548416134425, "grad_norm": 49.75, "learning_rate": 9.999999897852923e-05, "loss": 9.8913, "step": 11 }, { "epoch": 0.0015431437266921, "grad_norm": 40.0, "learning_rate": 9.999999591411698e-05, "loss": 9.006, "step": 12 }, { "epoch": 0.0016717390372497749, "grad_norm": 23.25, "learning_rate": 9.999999080676332e-05, "loss": 8.03, "step": 13 }, { "epoch": 0.00180033434780745, "grad_norm": 26.0, "learning_rate": 9.999998365646852e-05, "loss": 6.6621, "step": 14 }, { "epoch": 0.001928929658365125, "grad_norm": 24.25, "learning_rate": 9.999997446323286e-05, "loss": 5.5297, "step": 15 }, { "epoch": 0.0020575249689228, "grad_norm": 32.75, "learning_rate": 9.99999632270567e-05, "loss": 4.964, "step": 16 }, { "epoch": 0.002186120279480475, "grad_norm": 31.125, "learning_rate": 9.999994994794049e-05, "loss": 4.5819, "step": 17 }, { "epoch": 0.00231471559003815, "grad_norm": 16.625, "learning_rate": 9.999993462588477e-05, "loss": 4.0606, "step": 18 }, { "epoch": 0.002443310900595825, "grad_norm": 23.5, "learning_rate": 9.999991726089022e-05, "loss": 3.8919, "step": 19 }, { "epoch": 0.0025719062111534997, "grad_norm": 8.6875, "learning_rate": 9.999989785295748e-05, "loss": 3.4816, "step": 20 }, { "epoch": 0.002700501521711175, "grad_norm": 13.125, "learning_rate": 9.999987640208739e-05, "loss": 3.3472, "step": 21 }, { "epoch": 0.00282909683226885, "grad_norm": 10.4375, "learning_rate": 9.999985290828083e-05, "loss": 3.0473, "step": 22 }, { "epoch": 0.002957692142826525, "grad_norm": 7.4375, "learning_rate": 9.99998273715387e-05, "loss": 2.6943, "step": 23 }, { "epoch": 0.0030862874533842, "grad_norm": 8.6875, "learning_rate": 9.999979979186211e-05, "loss": 2.4888, "step": 24 }, { "epoch": 0.003214882763941875, "grad_norm": 5.625, "learning_rate": 9.999977016925216e-05, "loss": 2.1084, "step": 25 }, { "epoch": 0.0033434780744995497, "grad_norm": 9.125, "learning_rate": 9.999973850371006e-05, "loss": 1.9489, "step": 26 }, { "epoch": 0.003472073385057225, "grad_norm": 7.3125, "learning_rate": 9.999970479523711e-05, "loss": 1.7625, "step": 27 }, { "epoch": 0.0036006686956149, "grad_norm": 4.9375, "learning_rate": 9.999966904383466e-05, "loss": 1.5758, "step": 28 }, { "epoch": 0.003729264006172575, "grad_norm": 5.34375, "learning_rate": 9.999963124950422e-05, "loss": 1.4668, "step": 29 }, { "epoch": 0.00385785931673025, "grad_norm": 2.890625, "learning_rate": 9.999959141224729e-05, "loss": 1.2899, "step": 30 }, { "epoch": 0.003986454627287925, "grad_norm": 3.390625, "learning_rate": 9.999954953206553e-05, "loss": 1.1833, "step": 31 }, { "epoch": 0.0041150499378456, "grad_norm": 2.1875, "learning_rate": 9.999950560896063e-05, "loss": 1.0747, "step": 32 }, { "epoch": 0.004243645248403275, "grad_norm": 1.8515625, "learning_rate": 9.99994596429344e-05, "loss": 0.9041, "step": 33 }, { "epoch": 0.00437224055896095, "grad_norm": 1.953125, "learning_rate": 9.99994116339887e-05, "loss": 0.8701, "step": 34 }, { "epoch": 0.0045008358695186245, "grad_norm": 2.421875, "learning_rate": 9.999936158212549e-05, "loss": 0.8375, "step": 35 }, { "epoch": 0.0046294311800763, "grad_norm": 1.421875, "learning_rate": 9.999930948734683e-05, "loss": 0.7439, "step": 36 }, { "epoch": 0.004758026490633975, "grad_norm": 1.4765625, "learning_rate": 9.999925534965486e-05, "loss": 0.6843, "step": 37 }, { "epoch": 0.00488662180119165, "grad_norm": 1.3359375, "learning_rate": 9.999919916905178e-05, "loss": 0.6452, "step": 38 }, { "epoch": 0.005015217111749325, "grad_norm": 1.265625, "learning_rate": 9.999914094553986e-05, "loss": 0.5839, "step": 39 }, { "epoch": 0.0051438124223069995, "grad_norm": 1.5625, "learning_rate": 9.999908067912152e-05, "loss": 0.5728, "step": 40 }, { "epoch": 0.005272407732864675, "grad_norm": 1.0234375, "learning_rate": 9.999901836979919e-05, "loss": 0.5285, "step": 41 }, { "epoch": 0.00540100304342235, "grad_norm": 1.328125, "learning_rate": 9.999895401757544e-05, "loss": 0.4904, "step": 42 }, { "epoch": 0.005529598353980025, "grad_norm": 1.65625, "learning_rate": 9.999888762245287e-05, "loss": 0.4847, "step": 43 }, { "epoch": 0.0056581936645377, "grad_norm": 0.8828125, "learning_rate": 9.999881918443424e-05, "loss": 0.4378, "step": 44 }, { "epoch": 0.005786788975095375, "grad_norm": 1.4296875, "learning_rate": 9.999874870352229e-05, "loss": 0.4197, "step": 45 }, { "epoch": 0.00591538428565305, "grad_norm": 0.94921875, "learning_rate": 9.999867617971994e-05, "loss": 0.3865, "step": 46 }, { "epoch": 0.0060439795962107246, "grad_norm": 1.3125, "learning_rate": 9.999860161303015e-05, "loss": 0.3976, "step": 47 }, { "epoch": 0.0061725749067684, "grad_norm": 0.734375, "learning_rate": 9.999852500345594e-05, "loss": 0.3699, "step": 48 }, { "epoch": 0.006301170217326075, "grad_norm": 1.3125, "learning_rate": 9.999844635100046e-05, "loss": 0.3553, "step": 49 }, { "epoch": 0.00642976552788375, "grad_norm": 0.8515625, "learning_rate": 9.999836565566693e-05, "loss": 0.3391, "step": 50 }, { "epoch": 0.006558360838441425, "grad_norm": 1.15625, "learning_rate": 9.999828291745863e-05, "loss": 0.3482, "step": 51 }, { "epoch": 0.0066869561489990995, "grad_norm": 0.77734375, "learning_rate": 9.999819813637897e-05, "loss": 0.3214, "step": 52 }, { "epoch": 0.006815551459556775, "grad_norm": 0.90234375, "learning_rate": 9.999811131243137e-05, "loss": 0.3371, "step": 53 }, { "epoch": 0.00694414677011445, "grad_norm": 0.59765625, "learning_rate": 9.99980224456194e-05, "loss": 0.2972, "step": 54 }, { "epoch": 0.007072742080672125, "grad_norm": 0.7109375, "learning_rate": 9.999793153594671e-05, "loss": 0.2991, "step": 55 }, { "epoch": 0.0072013373912298, "grad_norm": 0.65625, "learning_rate": 9.999783858341698e-05, "loss": 0.2969, "step": 56 }, { "epoch": 0.007329932701787474, "grad_norm": 0.6171875, "learning_rate": 9.999774358803403e-05, "loss": 0.2564, "step": 57 }, { "epoch": 0.00745852801234515, "grad_norm": 0.703125, "learning_rate": 9.999764654980173e-05, "loss": 0.2912, "step": 58 }, { "epoch": 0.007587123322902825, "grad_norm": 0.796875, "learning_rate": 9.999754746872407e-05, "loss": 0.2958, "step": 59 }, { "epoch": 0.0077157186334605, "grad_norm": 0.6484375, "learning_rate": 9.999744634480505e-05, "loss": 0.2726, "step": 60 }, { "epoch": 0.007844313944018175, "grad_norm": 0.6171875, "learning_rate": 9.999734317804886e-05, "loss": 0.3043, "step": 61 }, { "epoch": 0.00797290925457585, "grad_norm": 0.6640625, "learning_rate": 9.999723796845966e-05, "loss": 0.2688, "step": 62 }, { "epoch": 0.008101504565133524, "grad_norm": 0.703125, "learning_rate": 9.999713071604178e-05, "loss": 0.2684, "step": 63 }, { "epoch": 0.0082300998756912, "grad_norm": 0.5234375, "learning_rate": 9.999702142079961e-05, "loss": 0.2409, "step": 64 }, { "epoch": 0.008358695186248875, "grad_norm": 0.53515625, "learning_rate": 9.999691008273759e-05, "loss": 0.2551, "step": 65 }, { "epoch": 0.00848729049680655, "grad_norm": 0.62109375, "learning_rate": 9.999679670186028e-05, "loss": 0.2501, "step": 66 }, { "epoch": 0.008615885807364224, "grad_norm": 0.57421875, "learning_rate": 9.999668127817233e-05, "loss": 0.2439, "step": 67 }, { "epoch": 0.0087444811179219, "grad_norm": 0.5546875, "learning_rate": 9.999656381167841e-05, "loss": 0.2229, "step": 68 }, { "epoch": 0.008873076428479575, "grad_norm": 0.5703125, "learning_rate": 9.999644430238337e-05, "loss": 0.2704, "step": 69 }, { "epoch": 0.009001671739037249, "grad_norm": 0.58984375, "learning_rate": 9.999632275029206e-05, "loss": 0.248, "step": 70 }, { "epoch": 0.009130267049594925, "grad_norm": 0.62109375, "learning_rate": 9.999619915540949e-05, "loss": 0.2394, "step": 71 }, { "epoch": 0.0092588623601526, "grad_norm": 0.51171875, "learning_rate": 9.999607351774065e-05, "loss": 0.2456, "step": 72 }, { "epoch": 0.009387457670710276, "grad_norm": 0.53125, "learning_rate": 9.999594583729071e-05, "loss": 0.2215, "step": 73 }, { "epoch": 0.00951605298126795, "grad_norm": 0.66015625, "learning_rate": 9.999581611406488e-05, "loss": 0.2289, "step": 74 }, { "epoch": 0.009644648291825625, "grad_norm": 0.515625, "learning_rate": 9.999568434806847e-05, "loss": 0.2003, "step": 75 }, { "epoch": 0.0097732436023833, "grad_norm": 0.53125, "learning_rate": 9.999555053930684e-05, "loss": 0.2041, "step": 76 }, { "epoch": 0.009901838912940974, "grad_norm": 0.52734375, "learning_rate": 9.999541468778549e-05, "loss": 0.2186, "step": 77 }, { "epoch": 0.01003043422349865, "grad_norm": 0.5703125, "learning_rate": 9.999527679350993e-05, "loss": 0.2219, "step": 78 }, { "epoch": 0.010159029534056325, "grad_norm": 0.470703125, "learning_rate": 9.999513685648581e-05, "loss": 0.2138, "step": 79 }, { "epoch": 0.010287624844613999, "grad_norm": 0.56640625, "learning_rate": 9.999499487671887e-05, "loss": 0.2184, "step": 80 }, { "epoch": 0.010416220155171674, "grad_norm": 0.494140625, "learning_rate": 9.999485085421489e-05, "loss": 0.2088, "step": 81 }, { "epoch": 0.01054481546572935, "grad_norm": 0.50390625, "learning_rate": 9.999470478897976e-05, "loss": 0.2024, "step": 82 }, { "epoch": 0.010673410776287025, "grad_norm": 0.4921875, "learning_rate": 9.999455668101944e-05, "loss": 0.2051, "step": 83 }, { "epoch": 0.0108020060868447, "grad_norm": 0.55859375, "learning_rate": 9.999440653034e-05, "loss": 0.2169, "step": 84 }, { "epoch": 0.010930601397402375, "grad_norm": 0.482421875, "learning_rate": 9.999425433694756e-05, "loss": 0.2045, "step": 85 }, { "epoch": 0.01105919670796005, "grad_norm": 0.4375, "learning_rate": 9.999410010084834e-05, "loss": 0.1858, "step": 86 }, { "epoch": 0.011187792018517724, "grad_norm": 0.458984375, "learning_rate": 9.999394382204865e-05, "loss": 0.1999, "step": 87 }, { "epoch": 0.0113163873290754, "grad_norm": 0.47265625, "learning_rate": 9.999378550055486e-05, "loss": 0.1982, "step": 88 }, { "epoch": 0.011444982639633075, "grad_norm": 0.51953125, "learning_rate": 9.999362513637344e-05, "loss": 0.2026, "step": 89 }, { "epoch": 0.01157357795019075, "grad_norm": 0.53125, "learning_rate": 9.999346272951097e-05, "loss": 0.1973, "step": 90 }, { "epoch": 0.011702173260748424, "grad_norm": 0.478515625, "learning_rate": 9.999329827997406e-05, "loss": 0.2043, "step": 91 }, { "epoch": 0.0118307685713061, "grad_norm": 0.484375, "learning_rate": 9.999313178776943e-05, "loss": 0.2113, "step": 92 }, { "epoch": 0.011959363881863775, "grad_norm": 0.458984375, "learning_rate": 9.99929632529039e-05, "loss": 0.1947, "step": 93 }, { "epoch": 0.012087959192421449, "grad_norm": 0.408203125, "learning_rate": 9.999279267538433e-05, "loss": 0.1778, "step": 94 }, { "epoch": 0.012216554502979125, "grad_norm": 0.5078125, "learning_rate": 9.999262005521773e-05, "loss": 0.2003, "step": 95 }, { "epoch": 0.0123451498135368, "grad_norm": 0.46484375, "learning_rate": 9.999244539241109e-05, "loss": 0.198, "step": 96 }, { "epoch": 0.012473745124094474, "grad_norm": 0.578125, "learning_rate": 9.99922686869716e-05, "loss": 0.1845, "step": 97 }, { "epoch": 0.01260234043465215, "grad_norm": 0.458984375, "learning_rate": 9.999208993890648e-05, "loss": 0.175, "step": 98 }, { "epoch": 0.012730935745209825, "grad_norm": 0.439453125, "learning_rate": 9.999190914822299e-05, "loss": 0.1914, "step": 99 }, { "epoch": 0.0128595310557675, "grad_norm": 0.51171875, "learning_rate": 9.999172631492854e-05, "loss": 0.1872, "step": 100 }, { "epoch": 0.012988126366325174, "grad_norm": 0.4609375, "learning_rate": 9.999154143903064e-05, "loss": 0.179, "step": 101 }, { "epoch": 0.01311672167688285, "grad_norm": 0.470703125, "learning_rate": 9.999135452053679e-05, "loss": 0.1919, "step": 102 }, { "epoch": 0.013245316987440525, "grad_norm": 0.50390625, "learning_rate": 9.999116555945463e-05, "loss": 0.1944, "step": 103 }, { "epoch": 0.013373912297998199, "grad_norm": 0.41796875, "learning_rate": 9.999097455579191e-05, "loss": 0.1707, "step": 104 }, { "epoch": 0.013502507608555874, "grad_norm": 0.474609375, "learning_rate": 9.999078150955642e-05, "loss": 0.1816, "step": 105 }, { "epoch": 0.01363110291911355, "grad_norm": 0.53125, "learning_rate": 9.999058642075605e-05, "loss": 0.2203, "step": 106 }, { "epoch": 0.013759698229671226, "grad_norm": 0.423828125, "learning_rate": 9.999038928939877e-05, "loss": 0.1906, "step": 107 }, { "epoch": 0.0138882935402289, "grad_norm": 0.423828125, "learning_rate": 9.999019011549263e-05, "loss": 0.1676, "step": 108 }, { "epoch": 0.014016888850786575, "grad_norm": 0.40625, "learning_rate": 9.998998889904576e-05, "loss": 0.1751, "step": 109 }, { "epoch": 0.01414548416134425, "grad_norm": 0.435546875, "learning_rate": 9.99897856400664e-05, "loss": 0.1714, "step": 110 }, { "epoch": 0.014274079471901924, "grad_norm": 0.44140625, "learning_rate": 9.998958033856284e-05, "loss": 0.1598, "step": 111 }, { "epoch": 0.0144026747824596, "grad_norm": 0.400390625, "learning_rate": 9.998937299454349e-05, "loss": 0.1669, "step": 112 }, { "epoch": 0.014531270093017275, "grad_norm": 0.478515625, "learning_rate": 9.998916360801682e-05, "loss": 0.1766, "step": 113 }, { "epoch": 0.014659865403574949, "grad_norm": 0.443359375, "learning_rate": 9.998895217899135e-05, "loss": 0.159, "step": 114 }, { "epoch": 0.014788460714132624, "grad_norm": 0.462890625, "learning_rate": 9.998873870747574e-05, "loss": 0.1641, "step": 115 }, { "epoch": 0.0149170560246903, "grad_norm": 0.44921875, "learning_rate": 9.998852319347874e-05, "loss": 0.1589, "step": 116 }, { "epoch": 0.015045651335247975, "grad_norm": 0.39453125, "learning_rate": 9.998830563700912e-05, "loss": 0.142, "step": 117 }, { "epoch": 0.01517424664580565, "grad_norm": 0.435546875, "learning_rate": 9.998808603807577e-05, "loss": 0.1601, "step": 118 }, { "epoch": 0.015302841956363325, "grad_norm": 0.474609375, "learning_rate": 9.998786439668765e-05, "loss": 0.173, "step": 119 }, { "epoch": 0.015431437266921, "grad_norm": 0.419921875, "learning_rate": 9.998764071285388e-05, "loss": 0.1772, "step": 120 }, { "epoch": 0.015560032577478674, "grad_norm": 0.4140625, "learning_rate": 9.998741498658354e-05, "loss": 0.1688, "step": 121 }, { "epoch": 0.01568862788803635, "grad_norm": 0.43359375, "learning_rate": 9.998718721788586e-05, "loss": 0.1624, "step": 122 }, { "epoch": 0.015817223198594025, "grad_norm": 0.396484375, "learning_rate": 9.998695740677017e-05, "loss": 0.1449, "step": 123 }, { "epoch": 0.0159458185091517, "grad_norm": 0.45703125, "learning_rate": 9.998672555324584e-05, "loss": 0.144, "step": 124 }, { "epoch": 0.016074413819709376, "grad_norm": 0.392578125, "learning_rate": 9.998649165732236e-05, "loss": 0.1641, "step": 125 }, { "epoch": 0.016203009130267048, "grad_norm": 0.41015625, "learning_rate": 9.998625571900926e-05, "loss": 0.1832, "step": 126 }, { "epoch": 0.016331604440824724, "grad_norm": 0.486328125, "learning_rate": 9.99860177383162e-05, "loss": 0.1796, "step": 127 }, { "epoch": 0.0164601997513824, "grad_norm": 0.455078125, "learning_rate": 9.99857777152529e-05, "loss": 0.1616, "step": 128 }, { "epoch": 0.016588795061940075, "grad_norm": 0.439453125, "learning_rate": 9.998553564982918e-05, "loss": 0.1494, "step": 129 }, { "epoch": 0.01671739037249775, "grad_norm": 0.427734375, "learning_rate": 9.998529154205491e-05, "loss": 0.1619, "step": 130 }, { "epoch": 0.016845985683055426, "grad_norm": 0.40625, "learning_rate": 9.998504539194007e-05, "loss": 0.147, "step": 131 }, { "epoch": 0.0169745809936131, "grad_norm": 0.51171875, "learning_rate": 9.998479719949471e-05, "loss": 0.161, "step": 132 }, { "epoch": 0.017103176304170773, "grad_norm": 0.458984375, "learning_rate": 9.998454696472899e-05, "loss": 0.1667, "step": 133 }, { "epoch": 0.01723177161472845, "grad_norm": 0.380859375, "learning_rate": 9.998429468765313e-05, "loss": 0.1653, "step": 134 }, { "epoch": 0.017360366925286124, "grad_norm": 0.46875, "learning_rate": 9.998404036827742e-05, "loss": 0.1937, "step": 135 }, { "epoch": 0.0174889622358438, "grad_norm": 0.431640625, "learning_rate": 9.998378400661226e-05, "loss": 0.1645, "step": 136 }, { "epoch": 0.017617557546401475, "grad_norm": 0.447265625, "learning_rate": 9.998352560266814e-05, "loss": 0.1485, "step": 137 }, { "epoch": 0.01774615285695915, "grad_norm": 0.466796875, "learning_rate": 9.99832651564556e-05, "loss": 0.1618, "step": 138 }, { "epoch": 0.017874748167516826, "grad_norm": 0.404296875, "learning_rate": 9.998300266798528e-05, "loss": 0.1518, "step": 139 }, { "epoch": 0.018003343478074498, "grad_norm": 0.431640625, "learning_rate": 9.998273813726793e-05, "loss": 0.1501, "step": 140 }, { "epoch": 0.018131938788632174, "grad_norm": 0.3984375, "learning_rate": 9.998247156431432e-05, "loss": 0.1679, "step": 141 }, { "epoch": 0.01826053409918985, "grad_norm": 0.40625, "learning_rate": 9.998220294913537e-05, "loss": 0.162, "step": 142 }, { "epoch": 0.018389129409747525, "grad_norm": 0.37890625, "learning_rate": 9.998193229174205e-05, "loss": 0.1443, "step": 143 }, { "epoch": 0.0185177247203052, "grad_norm": 0.42578125, "learning_rate": 9.998165959214542e-05, "loss": 0.1506, "step": 144 }, { "epoch": 0.018646320030862876, "grad_norm": 0.4140625, "learning_rate": 9.998138485035662e-05, "loss": 0.1584, "step": 145 }, { "epoch": 0.01877491534142055, "grad_norm": 0.38671875, "learning_rate": 9.998110806638687e-05, "loss": 0.1575, "step": 146 }, { "epoch": 0.018903510651978223, "grad_norm": 0.427734375, "learning_rate": 9.998082924024747e-05, "loss": 0.1673, "step": 147 }, { "epoch": 0.0190321059625359, "grad_norm": 0.390625, "learning_rate": 9.998054837194983e-05, "loss": 0.1645, "step": 148 }, { "epoch": 0.019160701273093574, "grad_norm": 0.453125, "learning_rate": 9.998026546150544e-05, "loss": 0.1811, "step": 149 }, { "epoch": 0.01928929658365125, "grad_norm": 0.443359375, "learning_rate": 9.997998050892583e-05, "loss": 0.1571, "step": 150 }, { "epoch": 0.019417891894208925, "grad_norm": 0.375, "learning_rate": 9.997969351422265e-05, "loss": 0.1505, "step": 151 }, { "epoch": 0.0195464872047666, "grad_norm": 0.41796875, "learning_rate": 9.997940447740763e-05, "loss": 0.1487, "step": 152 }, { "epoch": 0.019675082515324273, "grad_norm": 0.423828125, "learning_rate": 9.99791133984926e-05, "loss": 0.1529, "step": 153 }, { "epoch": 0.01980367782588195, "grad_norm": 0.4765625, "learning_rate": 9.997882027748942e-05, "loss": 0.1665, "step": 154 }, { "epoch": 0.019932273136439624, "grad_norm": 0.380859375, "learning_rate": 9.997852511441007e-05, "loss": 0.1377, "step": 155 }, { "epoch": 0.0200608684469973, "grad_norm": 0.486328125, "learning_rate": 9.997822790926662e-05, "loss": 0.1505, "step": 156 }, { "epoch": 0.020189463757554975, "grad_norm": 0.421875, "learning_rate": 9.997792866207123e-05, "loss": 0.1456, "step": 157 }, { "epoch": 0.02031805906811265, "grad_norm": 0.37109375, "learning_rate": 9.99776273728361e-05, "loss": 0.1578, "step": 158 }, { "epoch": 0.020446654378670326, "grad_norm": 0.470703125, "learning_rate": 9.997732404157356e-05, "loss": 0.1646, "step": 159 }, { "epoch": 0.020575249689227998, "grad_norm": 0.6015625, "learning_rate": 9.997701866829599e-05, "loss": 0.1794, "step": 160 }, { "epoch": 0.020703844999785673, "grad_norm": 0.38671875, "learning_rate": 9.997671125301585e-05, "loss": 0.1504, "step": 161 }, { "epoch": 0.02083244031034335, "grad_norm": 0.40625, "learning_rate": 9.997640179574574e-05, "loss": 0.147, "step": 162 }, { "epoch": 0.020961035620901024, "grad_norm": 0.388671875, "learning_rate": 9.997609029649829e-05, "loss": 0.1412, "step": 163 }, { "epoch": 0.0210896309314587, "grad_norm": 0.455078125, "learning_rate": 9.997577675528622e-05, "loss": 0.1402, "step": 164 }, { "epoch": 0.021218226242016375, "grad_norm": 0.36328125, "learning_rate": 9.997546117212234e-05, "loss": 0.1431, "step": 165 }, { "epoch": 0.02134682155257405, "grad_norm": 0.40625, "learning_rate": 9.997514354701956e-05, "loss": 0.1487, "step": 166 }, { "epoch": 0.021475416863131723, "grad_norm": 0.37109375, "learning_rate": 9.997482387999083e-05, "loss": 0.1323, "step": 167 }, { "epoch": 0.0216040121736894, "grad_norm": 0.376953125, "learning_rate": 9.997450217104924e-05, "loss": 0.1295, "step": 168 }, { "epoch": 0.021732607484247074, "grad_norm": 0.41796875, "learning_rate": 9.99741784202079e-05, "loss": 0.142, "step": 169 }, { "epoch": 0.02186120279480475, "grad_norm": 0.427734375, "learning_rate": 9.99738526274801e-05, "loss": 0.1412, "step": 170 }, { "epoch": 0.021989798105362425, "grad_norm": 0.43359375, "learning_rate": 9.997352479287909e-05, "loss": 0.1609, "step": 171 }, { "epoch": 0.0221183934159201, "grad_norm": 0.4140625, "learning_rate": 9.997319491641828e-05, "loss": 0.1512, "step": 172 }, { "epoch": 0.022246988726477776, "grad_norm": 0.40625, "learning_rate": 9.997286299811118e-05, "loss": 0.1361, "step": 173 }, { "epoch": 0.022375584037035448, "grad_norm": 0.384765625, "learning_rate": 9.997252903797129e-05, "loss": 0.1517, "step": 174 }, { "epoch": 0.022504179347593124, "grad_norm": 0.361328125, "learning_rate": 9.997219303601233e-05, "loss": 0.144, "step": 175 }, { "epoch": 0.0226327746581508, "grad_norm": 0.384765625, "learning_rate": 9.997185499224798e-05, "loss": 0.1459, "step": 176 }, { "epoch": 0.022761369968708475, "grad_norm": 0.396484375, "learning_rate": 9.997151490669204e-05, "loss": 0.1376, "step": 177 }, { "epoch": 0.02288996527926615, "grad_norm": 0.34765625, "learning_rate": 9.997117277935846e-05, "loss": 0.1407, "step": 178 }, { "epoch": 0.023018560589823826, "grad_norm": 0.361328125, "learning_rate": 9.997082861026117e-05, "loss": 0.1437, "step": 179 }, { "epoch": 0.0231471559003815, "grad_norm": 0.44140625, "learning_rate": 9.997048239941424e-05, "loss": 0.1471, "step": 180 }, { "epoch": 0.023275751210939173, "grad_norm": 0.40234375, "learning_rate": 9.997013414683185e-05, "loss": 0.146, "step": 181 }, { "epoch": 0.02340434652149685, "grad_norm": 0.384765625, "learning_rate": 9.996978385252819e-05, "loss": 0.1489, "step": 182 }, { "epoch": 0.023532941832054524, "grad_norm": 0.38671875, "learning_rate": 9.996943151651759e-05, "loss": 0.1293, "step": 183 }, { "epoch": 0.0236615371426122, "grad_norm": 0.328125, "learning_rate": 9.996907713881445e-05, "loss": 0.128, "step": 184 }, { "epoch": 0.023790132453169875, "grad_norm": 0.361328125, "learning_rate": 9.996872071943325e-05, "loss": 0.1431, "step": 185 }, { "epoch": 0.02391872776372755, "grad_norm": 0.396484375, "learning_rate": 9.996836225838853e-05, "loss": 0.1428, "step": 186 }, { "epoch": 0.024047323074285223, "grad_norm": 0.404296875, "learning_rate": 9.996800175569494e-05, "loss": 0.1437, "step": 187 }, { "epoch": 0.024175918384842898, "grad_norm": 0.388671875, "learning_rate": 9.996763921136724e-05, "loss": 0.1424, "step": 188 }, { "epoch": 0.024304513695400574, "grad_norm": 0.310546875, "learning_rate": 9.996727462542022e-05, "loss": 0.1177, "step": 189 }, { "epoch": 0.02443310900595825, "grad_norm": 0.396484375, "learning_rate": 9.996690799786879e-05, "loss": 0.1486, "step": 190 }, { "epoch": 0.024561704316515925, "grad_norm": 0.4296875, "learning_rate": 9.996653932872792e-05, "loss": 0.1292, "step": 191 }, { "epoch": 0.0246902996270736, "grad_norm": 0.34765625, "learning_rate": 9.996616861801267e-05, "loss": 0.131, "step": 192 }, { "epoch": 0.024818894937631276, "grad_norm": 0.3671875, "learning_rate": 9.996579586573819e-05, "loss": 0.1376, "step": 193 }, { "epoch": 0.024947490248188948, "grad_norm": 0.421875, "learning_rate": 9.99654210719197e-05, "loss": 0.1492, "step": 194 }, { "epoch": 0.025076085558746623, "grad_norm": 0.33984375, "learning_rate": 9.996504423657256e-05, "loss": 0.134, "step": 195 }, { "epoch": 0.0252046808693043, "grad_norm": 0.3671875, "learning_rate": 9.99646653597121e-05, "loss": 0.1243, "step": 196 }, { "epoch": 0.025333276179861974, "grad_norm": 0.361328125, "learning_rate": 9.996428444135385e-05, "loss": 0.1292, "step": 197 }, { "epoch": 0.02546187149041965, "grad_norm": 0.34765625, "learning_rate": 9.996390148151336e-05, "loss": 0.1193, "step": 198 }, { "epoch": 0.025590466800977325, "grad_norm": 0.373046875, "learning_rate": 9.996351648020626e-05, "loss": 0.1172, "step": 199 }, { "epoch": 0.025719062111535, "grad_norm": 0.388671875, "learning_rate": 9.99631294374483e-05, "loss": 0.1327, "step": 200 }, { "epoch": 0.025847657422092673, "grad_norm": 0.384765625, "learning_rate": 9.996274035325531e-05, "loss": 0.1306, "step": 201 }, { "epoch": 0.02597625273265035, "grad_norm": 0.4375, "learning_rate": 9.996234922764313e-05, "loss": 0.1343, "step": 202 }, { "epoch": 0.026104848043208024, "grad_norm": 0.333984375, "learning_rate": 9.996195606062779e-05, "loss": 0.1231, "step": 203 }, { "epoch": 0.0262334433537657, "grad_norm": 0.3671875, "learning_rate": 9.996156085222535e-05, "loss": 0.129, "step": 204 }, { "epoch": 0.026362038664323375, "grad_norm": 0.435546875, "learning_rate": 9.996116360245195e-05, "loss": 0.148, "step": 205 }, { "epoch": 0.02649063397488105, "grad_norm": 0.3515625, "learning_rate": 9.996076431132382e-05, "loss": 0.1281, "step": 206 }, { "epoch": 0.026619229285438726, "grad_norm": 0.3828125, "learning_rate": 9.996036297885727e-05, "loss": 0.1256, "step": 207 }, { "epoch": 0.026747824595996398, "grad_norm": 0.400390625, "learning_rate": 9.995995960506872e-05, "loss": 0.1214, "step": 208 }, { "epoch": 0.026876419906554073, "grad_norm": 0.384765625, "learning_rate": 9.995955418997461e-05, "loss": 0.1263, "step": 209 }, { "epoch": 0.02700501521711175, "grad_norm": 0.40234375, "learning_rate": 9.995914673359154e-05, "loss": 0.1407, "step": 210 }, { "epoch": 0.027133610527669424, "grad_norm": 0.369140625, "learning_rate": 9.995873723593615e-05, "loss": 0.1284, "step": 211 }, { "epoch": 0.0272622058382271, "grad_norm": 0.390625, "learning_rate": 9.995832569702517e-05, "loss": 0.1325, "step": 212 }, { "epoch": 0.027390801148784776, "grad_norm": 0.373046875, "learning_rate": 9.99579121168754e-05, "loss": 0.12, "step": 213 }, { "epoch": 0.02751939645934245, "grad_norm": 0.38671875, "learning_rate": 9.995749649550376e-05, "loss": 0.1449, "step": 214 }, { "epoch": 0.027647991769900123, "grad_norm": 0.380859375, "learning_rate": 9.995707883292724e-05, "loss": 0.1443, "step": 215 }, { "epoch": 0.0277765870804578, "grad_norm": 0.39453125, "learning_rate": 9.995665912916287e-05, "loss": 0.1434, "step": 216 }, { "epoch": 0.027905182391015474, "grad_norm": 0.37890625, "learning_rate": 9.995623738422784e-05, "loss": 0.123, "step": 217 }, { "epoch": 0.02803377770157315, "grad_norm": 0.353515625, "learning_rate": 9.995581359813934e-05, "loss": 0.1363, "step": 218 }, { "epoch": 0.028162373012130825, "grad_norm": 0.3828125, "learning_rate": 9.995538777091472e-05, "loss": 0.1264, "step": 219 }, { "epoch": 0.0282909683226885, "grad_norm": 0.369140625, "learning_rate": 9.995495990257135e-05, "loss": 0.1338, "step": 220 }, { "epoch": 0.028419563633246173, "grad_norm": 0.357421875, "learning_rate": 9.995452999312675e-05, "loss": 0.114, "step": 221 }, { "epoch": 0.028548158943803848, "grad_norm": 0.380859375, "learning_rate": 9.995409804259843e-05, "loss": 0.1473, "step": 222 }, { "epoch": 0.028676754254361524, "grad_norm": 0.33984375, "learning_rate": 9.995366405100411e-05, "loss": 0.1149, "step": 223 }, { "epoch": 0.0288053495649192, "grad_norm": 0.34375, "learning_rate": 9.995322801836145e-05, "loss": 0.1276, "step": 224 }, { "epoch": 0.028933944875476875, "grad_norm": 0.38671875, "learning_rate": 9.995278994468831e-05, "loss": 0.153, "step": 225 }, { "epoch": 0.02906254018603455, "grad_norm": 0.322265625, "learning_rate": 9.99523498300026e-05, "loss": 0.128, "step": 226 }, { "epoch": 0.029191135496592226, "grad_norm": 0.33203125, "learning_rate": 9.995190767432224e-05, "loss": 0.1179, "step": 227 }, { "epoch": 0.029319730807149898, "grad_norm": 0.33984375, "learning_rate": 9.995146347766536e-05, "loss": 0.1144, "step": 228 }, { "epoch": 0.029448326117707573, "grad_norm": 0.365234375, "learning_rate": 9.995101724005009e-05, "loss": 0.1339, "step": 229 }, { "epoch": 0.02957692142826525, "grad_norm": 0.369140625, "learning_rate": 9.995056896149465e-05, "loss": 0.1229, "step": 230 }, { "epoch": 0.029705516738822924, "grad_norm": 0.306640625, "learning_rate": 9.995011864201737e-05, "loss": 0.0991, "step": 231 }, { "epoch": 0.0298341120493806, "grad_norm": 0.357421875, "learning_rate": 9.994966628163665e-05, "loss": 0.1213, "step": 232 }, { "epoch": 0.029962707359938275, "grad_norm": 0.314453125, "learning_rate": 9.994921188037096e-05, "loss": 0.1148, "step": 233 }, { "epoch": 0.03009130267049595, "grad_norm": 0.34375, "learning_rate": 9.994875543823887e-05, "loss": 0.1368, "step": 234 }, { "epoch": 0.030219897981053623, "grad_norm": 0.361328125, "learning_rate": 9.994829695525905e-05, "loss": 0.1196, "step": 235 }, { "epoch": 0.0303484932916113, "grad_norm": 0.345703125, "learning_rate": 9.994783643145021e-05, "loss": 0.1186, "step": 236 }, { "epoch": 0.030477088602168974, "grad_norm": 0.345703125, "learning_rate": 9.994737386683117e-05, "loss": 0.1329, "step": 237 }, { "epoch": 0.03060568391272665, "grad_norm": 0.357421875, "learning_rate": 9.994690926142083e-05, "loss": 0.1416, "step": 238 }, { "epoch": 0.030734279223284325, "grad_norm": 0.3203125, "learning_rate": 9.994644261523818e-05, "loss": 0.1164, "step": 239 }, { "epoch": 0.030862874533842, "grad_norm": 0.3515625, "learning_rate": 9.994597392830228e-05, "loss": 0.1207, "step": 240 }, { "epoch": 0.030991469844399676, "grad_norm": 0.34375, "learning_rate": 9.994550320063229e-05, "loss": 0.13, "step": 241 }, { "epoch": 0.031120065154957348, "grad_norm": 0.30859375, "learning_rate": 9.994503043224743e-05, "loss": 0.1139, "step": 242 }, { "epoch": 0.031248660465515023, "grad_norm": 0.349609375, "learning_rate": 9.994455562316704e-05, "loss": 0.117, "step": 243 }, { "epoch": 0.0313772557760727, "grad_norm": 0.3203125, "learning_rate": 9.994407877341047e-05, "loss": 0.1186, "step": 244 }, { "epoch": 0.03150585108663037, "grad_norm": 0.33203125, "learning_rate": 9.994359988299727e-05, "loss": 0.1248, "step": 245 }, { "epoch": 0.03163444639718805, "grad_norm": 0.341796875, "learning_rate": 9.994311895194697e-05, "loss": 0.119, "step": 246 }, { "epoch": 0.03176304170774572, "grad_norm": 0.41015625, "learning_rate": 9.994263598027921e-05, "loss": 0.1281, "step": 247 }, { "epoch": 0.0318916370183034, "grad_norm": 0.3359375, "learning_rate": 9.994215096801374e-05, "loss": 0.1216, "step": 248 }, { "epoch": 0.03202023232886107, "grad_norm": 0.33984375, "learning_rate": 9.994166391517038e-05, "loss": 0.1076, "step": 249 }, { "epoch": 0.03214882763941875, "grad_norm": 0.365234375, "learning_rate": 9.994117482176902e-05, "loss": 0.1223, "step": 250 }, { "epoch": 0.032277422949976424, "grad_norm": 0.341796875, "learning_rate": 9.994068368782965e-05, "loss": 0.129, "step": 251 }, { "epoch": 0.032406018260534096, "grad_norm": 0.32421875, "learning_rate": 9.994019051337234e-05, "loss": 0.1209, "step": 252 }, { "epoch": 0.032534613571091775, "grad_norm": 0.369140625, "learning_rate": 9.993969529841725e-05, "loss": 0.1348, "step": 253 }, { "epoch": 0.03266320888164945, "grad_norm": 0.390625, "learning_rate": 9.993919804298457e-05, "loss": 0.1117, "step": 254 }, { "epoch": 0.032791804192207126, "grad_norm": 0.35546875, "learning_rate": 9.993869874709467e-05, "loss": 0.1159, "step": 255 }, { "epoch": 0.0329203995027648, "grad_norm": 0.330078125, "learning_rate": 9.993819741076793e-05, "loss": 0.1172, "step": 256 }, { "epoch": 0.03304899481332248, "grad_norm": 0.36328125, "learning_rate": 9.993769403402482e-05, "loss": 0.1354, "step": 257 }, { "epoch": 0.03317759012388015, "grad_norm": 0.443359375, "learning_rate": 9.993718861688592e-05, "loss": 0.1293, "step": 258 }, { "epoch": 0.03330618543443782, "grad_norm": 0.341796875, "learning_rate": 9.99366811593719e-05, "loss": 0.1204, "step": 259 }, { "epoch": 0.0334347807449955, "grad_norm": 0.43359375, "learning_rate": 9.993617166150344e-05, "loss": 0.1214, "step": 260 }, { "epoch": 0.03356337605555317, "grad_norm": 0.3671875, "learning_rate": 9.993566012330142e-05, "loss": 0.1204, "step": 261 }, { "epoch": 0.03369197136611085, "grad_norm": 0.376953125, "learning_rate": 9.99351465447867e-05, "loss": 0.1234, "step": 262 }, { "epoch": 0.03382056667666852, "grad_norm": 0.376953125, "learning_rate": 9.993463092598029e-05, "loss": 0.1179, "step": 263 }, { "epoch": 0.0339491619872262, "grad_norm": 0.3125, "learning_rate": 9.993411326690323e-05, "loss": 0.1064, "step": 264 }, { "epoch": 0.034077757297783874, "grad_norm": 0.376953125, "learning_rate": 9.99335935675767e-05, "loss": 0.1183, "step": 265 }, { "epoch": 0.034206352608341546, "grad_norm": 0.3359375, "learning_rate": 9.993307182802192e-05, "loss": 0.113, "step": 266 }, { "epoch": 0.034334947918899225, "grad_norm": 0.359375, "learning_rate": 9.99325480482602e-05, "loss": 0.1238, "step": 267 }, { "epoch": 0.0344635432294569, "grad_norm": 0.330078125, "learning_rate": 9.993202222831295e-05, "loss": 0.1098, "step": 268 }, { "epoch": 0.034592138540014576, "grad_norm": 0.3828125, "learning_rate": 9.993149436820165e-05, "loss": 0.1304, "step": 269 }, { "epoch": 0.03472073385057225, "grad_norm": 0.419921875, "learning_rate": 9.993096446794788e-05, "loss": 0.13, "step": 270 }, { "epoch": 0.03484932916112993, "grad_norm": 0.359375, "learning_rate": 9.993043252757327e-05, "loss": 0.1119, "step": 271 }, { "epoch": 0.0349779244716876, "grad_norm": 0.345703125, "learning_rate": 9.992989854709957e-05, "loss": 0.1179, "step": 272 }, { "epoch": 0.03510651978224527, "grad_norm": 0.3203125, "learning_rate": 9.992936252654861e-05, "loss": 0.1028, "step": 273 }, { "epoch": 0.03523511509280295, "grad_norm": 0.4140625, "learning_rate": 9.992882446594225e-05, "loss": 0.1298, "step": 274 }, { "epoch": 0.03536371040336062, "grad_norm": 0.3515625, "learning_rate": 9.992828436530253e-05, "loss": 0.1193, "step": 275 }, { "epoch": 0.0354923057139183, "grad_norm": 0.40234375, "learning_rate": 9.992774222465147e-05, "loss": 0.1141, "step": 276 }, { "epoch": 0.03562090102447597, "grad_norm": 0.361328125, "learning_rate": 9.992719804401122e-05, "loss": 0.1139, "step": 277 }, { "epoch": 0.03574949633503365, "grad_norm": 0.396484375, "learning_rate": 9.992665182340407e-05, "loss": 0.1238, "step": 278 }, { "epoch": 0.035878091645591324, "grad_norm": 0.333984375, "learning_rate": 9.992610356285228e-05, "loss": 0.1158, "step": 279 }, { "epoch": 0.036006686956148996, "grad_norm": 0.40234375, "learning_rate": 9.99255532623783e-05, "loss": 0.104, "step": 280 }, { "epoch": 0.036135282266706675, "grad_norm": 0.34765625, "learning_rate": 9.992500092200455e-05, "loss": 0.1123, "step": 281 }, { "epoch": 0.03626387757726435, "grad_norm": 0.38671875, "learning_rate": 9.992444654175366e-05, "loss": 0.115, "step": 282 }, { "epoch": 0.036392472887822026, "grad_norm": 0.365234375, "learning_rate": 9.992389012164825e-05, "loss": 0.115, "step": 283 }, { "epoch": 0.0365210681983797, "grad_norm": 0.451171875, "learning_rate": 9.992333166171106e-05, "loss": 0.1122, "step": 284 }, { "epoch": 0.03664966350893738, "grad_norm": 0.392578125, "learning_rate": 9.992277116196492e-05, "loss": 0.092, "step": 285 }, { "epoch": 0.03677825881949505, "grad_norm": 0.298828125, "learning_rate": 9.99222086224327e-05, "loss": 0.1046, "step": 286 }, { "epoch": 0.03690685413005272, "grad_norm": 0.388671875, "learning_rate": 9.992164404313742e-05, "loss": 0.1247, "step": 287 }, { "epoch": 0.0370354494406104, "grad_norm": 0.35546875, "learning_rate": 9.992107742410213e-05, "loss": 0.1041, "step": 288 }, { "epoch": 0.03716404475116807, "grad_norm": 0.353515625, "learning_rate": 9.992050876534999e-05, "loss": 0.109, "step": 289 }, { "epoch": 0.03729264006172575, "grad_norm": 0.36328125, "learning_rate": 9.991993806690423e-05, "loss": 0.11, "step": 290 }, { "epoch": 0.03742123537228342, "grad_norm": 0.349609375, "learning_rate": 9.991936532878817e-05, "loss": 0.1199, "step": 291 }, { "epoch": 0.0375498306828411, "grad_norm": 0.35546875, "learning_rate": 9.99187905510252e-05, "loss": 0.1164, "step": 292 }, { "epoch": 0.037678425993398774, "grad_norm": 0.38671875, "learning_rate": 9.991821373363881e-05, "loss": 0.1102, "step": 293 }, { "epoch": 0.037807021303956447, "grad_norm": 0.34375, "learning_rate": 9.991763487665258e-05, "loss": 0.1171, "step": 294 }, { "epoch": 0.037935616614514125, "grad_norm": 0.318359375, "learning_rate": 9.991705398009016e-05, "loss": 0.1149, "step": 295 }, { "epoch": 0.0380642119250718, "grad_norm": 0.375, "learning_rate": 9.991647104397528e-05, "loss": 0.1224, "step": 296 }, { "epoch": 0.038192807235629476, "grad_norm": 0.30078125, "learning_rate": 9.991588606833174e-05, "loss": 0.1034, "step": 297 }, { "epoch": 0.03832140254618715, "grad_norm": 0.3515625, "learning_rate": 9.991529905318347e-05, "loss": 0.122, "step": 298 }, { "epoch": 0.03844999785674483, "grad_norm": 0.330078125, "learning_rate": 9.991470999855443e-05, "loss": 0.1046, "step": 299 }, { "epoch": 0.0385785931673025, "grad_norm": 0.333984375, "learning_rate": 9.991411890446872e-05, "loss": 0.1028, "step": 300 }, { "epoch": 0.03870718847786017, "grad_norm": 0.34765625, "learning_rate": 9.991352577095046e-05, "loss": 0.1152, "step": 301 }, { "epoch": 0.03883578378841785, "grad_norm": 0.328125, "learning_rate": 9.991293059802391e-05, "loss": 0.1327, "step": 302 }, { "epoch": 0.03896437909897552, "grad_norm": 0.318359375, "learning_rate": 9.991233338571335e-05, "loss": 0.1085, "step": 303 }, { "epoch": 0.0390929744095332, "grad_norm": 0.298828125, "learning_rate": 9.991173413404323e-05, "loss": 0.1073, "step": 304 }, { "epoch": 0.039221569720090874, "grad_norm": 0.3125, "learning_rate": 9.9911132843038e-05, "loss": 0.1128, "step": 305 }, { "epoch": 0.039350165030648546, "grad_norm": 0.37890625, "learning_rate": 9.991052951272224e-05, "loss": 0.1344, "step": 306 }, { "epoch": 0.039478760341206225, "grad_norm": 0.333984375, "learning_rate": 9.99099241431206e-05, "loss": 0.1107, "step": 307 }, { "epoch": 0.0396073556517639, "grad_norm": 0.3203125, "learning_rate": 9.990931673425783e-05, "loss": 0.1188, "step": 308 }, { "epoch": 0.039735950962321576, "grad_norm": 0.29296875, "learning_rate": 9.990870728615871e-05, "loss": 0.094, "step": 309 }, { "epoch": 0.03986454627287925, "grad_norm": 0.31640625, "learning_rate": 9.990809579884818e-05, "loss": 0.1062, "step": 310 }, { "epoch": 0.03999314158343693, "grad_norm": 0.33203125, "learning_rate": 9.990748227235121e-05, "loss": 0.1046, "step": 311 }, { "epoch": 0.0401217368939946, "grad_norm": 0.31640625, "learning_rate": 9.990686670669285e-05, "loss": 0.118, "step": 312 }, { "epoch": 0.04025033220455227, "grad_norm": 0.306640625, "learning_rate": 9.990624910189828e-05, "loss": 0.1203, "step": 313 }, { "epoch": 0.04037892751510995, "grad_norm": 0.302734375, "learning_rate": 9.990562945799273e-05, "loss": 0.108, "step": 314 }, { "epoch": 0.04050752282566762, "grad_norm": 0.345703125, "learning_rate": 9.99050077750015e-05, "loss": 0.1118, "step": 315 }, { "epoch": 0.0406361181362253, "grad_norm": 0.322265625, "learning_rate": 9.990438405295e-05, "loss": 0.1031, "step": 316 }, { "epoch": 0.04076471344678297, "grad_norm": 0.322265625, "learning_rate": 9.990375829186373e-05, "loss": 0.0964, "step": 317 }, { "epoch": 0.04089330875734065, "grad_norm": 0.341796875, "learning_rate": 9.990313049176824e-05, "loss": 0.1192, "step": 318 }, { "epoch": 0.041021904067898324, "grad_norm": 0.314453125, "learning_rate": 9.990250065268918e-05, "loss": 0.1178, "step": 319 }, { "epoch": 0.041150499378455996, "grad_norm": 0.341796875, "learning_rate": 9.990186877465228e-05, "loss": 0.1275, "step": 320 }, { "epoch": 0.041279094689013675, "grad_norm": 0.33984375, "learning_rate": 9.99012348576834e-05, "loss": 0.1182, "step": 321 }, { "epoch": 0.04140768999957135, "grad_norm": 0.322265625, "learning_rate": 9.990059890180838e-05, "loss": 0.128, "step": 322 }, { "epoch": 0.041536285310129026, "grad_norm": 0.326171875, "learning_rate": 9.989996090705324e-05, "loss": 0.1225, "step": 323 }, { "epoch": 0.0416648806206867, "grad_norm": 0.296875, "learning_rate": 9.989932087344405e-05, "loss": 0.1156, "step": 324 }, { "epoch": 0.04179347593124438, "grad_norm": 0.3359375, "learning_rate": 9.989867880100697e-05, "loss": 0.119, "step": 325 }, { "epoch": 0.04192207124180205, "grad_norm": 0.32421875, "learning_rate": 9.989803468976819e-05, "loss": 0.1062, "step": 326 }, { "epoch": 0.04205066655235972, "grad_norm": 0.361328125, "learning_rate": 9.989738853975405e-05, "loss": 0.105, "step": 327 }, { "epoch": 0.0421792618629174, "grad_norm": 0.322265625, "learning_rate": 9.989674035099097e-05, "loss": 0.1149, "step": 328 }, { "epoch": 0.04230785717347507, "grad_norm": 0.31640625, "learning_rate": 9.98960901235054e-05, "loss": 0.1009, "step": 329 }, { "epoch": 0.04243645248403275, "grad_norm": 0.314453125, "learning_rate": 9.989543785732396e-05, "loss": 0.1101, "step": 330 }, { "epoch": 0.04256504779459042, "grad_norm": 0.32421875, "learning_rate": 9.989478355247323e-05, "loss": 0.1079, "step": 331 }, { "epoch": 0.0426936431051481, "grad_norm": 0.314453125, "learning_rate": 9.989412720898002e-05, "loss": 0.1189, "step": 332 }, { "epoch": 0.042822238415705774, "grad_norm": 0.326171875, "learning_rate": 9.989346882687108e-05, "loss": 0.1098, "step": 333 }, { "epoch": 0.042950833726263446, "grad_norm": 0.33203125, "learning_rate": 9.989280840617335e-05, "loss": 0.0969, "step": 334 }, { "epoch": 0.043079429036821125, "grad_norm": 0.3125, "learning_rate": 9.98921459469138e-05, "loss": 0.1096, "step": 335 }, { "epoch": 0.0432080243473788, "grad_norm": 0.33203125, "learning_rate": 9.989148144911951e-05, "loss": 0.1102, "step": 336 }, { "epoch": 0.043336619657936476, "grad_norm": 0.3046875, "learning_rate": 9.989081491281763e-05, "loss": 0.1002, "step": 337 }, { "epoch": 0.04346521496849415, "grad_norm": 0.28125, "learning_rate": 9.989014633803538e-05, "loss": 0.0919, "step": 338 }, { "epoch": 0.04359381027905183, "grad_norm": 0.33984375, "learning_rate": 9.988947572480008e-05, "loss": 0.1007, "step": 339 }, { "epoch": 0.0437224055896095, "grad_norm": 0.32421875, "learning_rate": 9.988880307313912e-05, "loss": 0.1146, "step": 340 }, { "epoch": 0.04385100090016717, "grad_norm": 0.359375, "learning_rate": 9.988812838308002e-05, "loss": 0.1275, "step": 341 }, { "epoch": 0.04397959621072485, "grad_norm": 0.318359375, "learning_rate": 9.98874516546503e-05, "loss": 0.1038, "step": 342 }, { "epoch": 0.04410819152128252, "grad_norm": 0.333984375, "learning_rate": 9.988677288787765e-05, "loss": 0.1154, "step": 343 }, { "epoch": 0.0442367868318402, "grad_norm": 0.314453125, "learning_rate": 9.98860920827898e-05, "loss": 0.0941, "step": 344 }, { "epoch": 0.04436538214239787, "grad_norm": 0.30859375, "learning_rate": 9.988540923941454e-05, "loss": 0.1083, "step": 345 }, { "epoch": 0.04449397745295555, "grad_norm": 0.33984375, "learning_rate": 9.988472435777978e-05, "loss": 0.1179, "step": 346 }, { "epoch": 0.044622572763513224, "grad_norm": 0.302734375, "learning_rate": 9.988403743791352e-05, "loss": 0.1067, "step": 347 }, { "epoch": 0.044751168074070896, "grad_norm": 0.29296875, "learning_rate": 9.988334847984382e-05, "loss": 0.0989, "step": 348 }, { "epoch": 0.044879763384628575, "grad_norm": 0.345703125, "learning_rate": 9.988265748359882e-05, "loss": 0.0942, "step": 349 }, { "epoch": 0.04500835869518625, "grad_norm": 0.330078125, "learning_rate": 9.988196444920675e-05, "loss": 0.1156, "step": 350 }, { "epoch": 0.045136954005743926, "grad_norm": 0.330078125, "learning_rate": 9.988126937669595e-05, "loss": 0.1048, "step": 351 }, { "epoch": 0.0452655493163016, "grad_norm": 0.30078125, "learning_rate": 9.98805722660948e-05, "loss": 0.1151, "step": 352 }, { "epoch": 0.04539414462685928, "grad_norm": 0.326171875, "learning_rate": 9.987987311743177e-05, "loss": 0.1004, "step": 353 }, { "epoch": 0.04552273993741695, "grad_norm": 0.3125, "learning_rate": 9.987917193073547e-05, "loss": 0.1191, "step": 354 }, { "epoch": 0.04565133524797462, "grad_norm": 0.30859375, "learning_rate": 9.987846870603452e-05, "loss": 0.1094, "step": 355 }, { "epoch": 0.0457799305585323, "grad_norm": 0.31640625, "learning_rate": 9.987776344335765e-05, "loss": 0.1124, "step": 356 }, { "epoch": 0.04590852586908997, "grad_norm": 0.302734375, "learning_rate": 9.987705614273368e-05, "loss": 0.1017, "step": 357 }, { "epoch": 0.04603712117964765, "grad_norm": 0.306640625, "learning_rate": 9.987634680419153e-05, "loss": 0.1022, "step": 358 }, { "epoch": 0.04616571649020532, "grad_norm": 0.322265625, "learning_rate": 9.987563542776015e-05, "loss": 0.0976, "step": 359 }, { "epoch": 0.046294311800763, "grad_norm": 0.296875, "learning_rate": 9.987492201346864e-05, "loss": 0.1093, "step": 360 }, { "epoch": 0.046422907111320674, "grad_norm": 0.326171875, "learning_rate": 9.987420656134611e-05, "loss": 0.1089, "step": 361 }, { "epoch": 0.046551502421878346, "grad_norm": 0.30078125, "learning_rate": 9.987348907142183e-05, "loss": 0.0903, "step": 362 }, { "epoch": 0.046680097732436025, "grad_norm": 0.31640625, "learning_rate": 9.98727695437251e-05, "loss": 0.1162, "step": 363 }, { "epoch": 0.0468086930429937, "grad_norm": 0.310546875, "learning_rate": 9.987204797828531e-05, "loss": 0.1152, "step": 364 }, { "epoch": 0.046937288353551376, "grad_norm": 0.318359375, "learning_rate": 9.987132437513196e-05, "loss": 0.1035, "step": 365 }, { "epoch": 0.04706588366410905, "grad_norm": 0.330078125, "learning_rate": 9.98705987342946e-05, "loss": 0.1071, "step": 366 }, { "epoch": 0.04719447897466672, "grad_norm": 0.330078125, "learning_rate": 9.986987105580287e-05, "loss": 0.1, "step": 367 }, { "epoch": 0.0473230742852244, "grad_norm": 0.302734375, "learning_rate": 9.986914133968654e-05, "loss": 0.1066, "step": 368 }, { "epoch": 0.04745166959578207, "grad_norm": 0.294921875, "learning_rate": 9.986840958597539e-05, "loss": 0.095, "step": 369 }, { "epoch": 0.04758026490633975, "grad_norm": 0.35546875, "learning_rate": 9.986767579469936e-05, "loss": 0.1142, "step": 370 }, { "epoch": 0.04770886021689742, "grad_norm": 0.33203125, "learning_rate": 9.986693996588836e-05, "loss": 0.1109, "step": 371 }, { "epoch": 0.0478374555274551, "grad_norm": 0.341796875, "learning_rate": 9.986620209957253e-05, "loss": 0.0849, "step": 372 }, { "epoch": 0.04796605083801277, "grad_norm": 0.3359375, "learning_rate": 9.986546219578197e-05, "loss": 0.0912, "step": 373 }, { "epoch": 0.048094646148570445, "grad_norm": 0.3515625, "learning_rate": 9.986472025454694e-05, "loss": 0.0999, "step": 374 }, { "epoch": 0.048223241459128124, "grad_norm": 0.375, "learning_rate": 9.986397627589775e-05, "loss": 0.1087, "step": 375 }, { "epoch": 0.048351836769685796, "grad_norm": 0.361328125, "learning_rate": 9.986323025986477e-05, "loss": 0.0895, "step": 376 }, { "epoch": 0.048480432080243475, "grad_norm": 0.318359375, "learning_rate": 9.986248220647851e-05, "loss": 0.0963, "step": 377 }, { "epoch": 0.04860902739080115, "grad_norm": 0.3515625, "learning_rate": 9.986173211576952e-05, "loss": 0.1063, "step": 378 }, { "epoch": 0.048737622701358826, "grad_norm": 0.326171875, "learning_rate": 9.986097998776846e-05, "loss": 0.0945, "step": 379 }, { "epoch": 0.0488662180119165, "grad_norm": 0.330078125, "learning_rate": 9.986022582250608e-05, "loss": 0.1089, "step": 380 }, { "epoch": 0.04899481332247417, "grad_norm": 0.298828125, "learning_rate": 9.985946962001315e-05, "loss": 0.0951, "step": 381 }, { "epoch": 0.04912340863303185, "grad_norm": 0.375, "learning_rate": 9.98587113803206e-05, "loss": 0.0925, "step": 382 }, { "epoch": 0.04925200394358952, "grad_norm": 0.43359375, "learning_rate": 9.985795110345937e-05, "loss": 0.117, "step": 383 }, { "epoch": 0.0493805992541472, "grad_norm": 0.33984375, "learning_rate": 9.985718878946058e-05, "loss": 0.0948, "step": 384 }, { "epoch": 0.04950919456470487, "grad_norm": 0.337890625, "learning_rate": 9.985642443835534e-05, "loss": 0.1009, "step": 385 }, { "epoch": 0.04963778987526255, "grad_norm": 0.390625, "learning_rate": 9.985565805017488e-05, "loss": 0.1129, "step": 386 }, { "epoch": 0.049766385185820224, "grad_norm": 0.3671875, "learning_rate": 9.985488962495053e-05, "loss": 0.1203, "step": 387 }, { "epoch": 0.049894980496377896, "grad_norm": 0.302734375, "learning_rate": 9.98541191627137e-05, "loss": 0.1051, "step": 388 }, { "epoch": 0.050023575806935575, "grad_norm": 0.291015625, "learning_rate": 9.985334666349584e-05, "loss": 0.0917, "step": 389 }, { "epoch": 0.05015217111749325, "grad_norm": 0.318359375, "learning_rate": 9.985257212732852e-05, "loss": 0.0911, "step": 390 }, { "epoch": 0.050280766428050926, "grad_norm": 0.376953125, "learning_rate": 9.98517955542434e-05, "loss": 0.108, "step": 391 }, { "epoch": 0.0504093617386086, "grad_norm": 0.333984375, "learning_rate": 9.985101694427219e-05, "loss": 0.1012, "step": 392 }, { "epoch": 0.05053795704916628, "grad_norm": 0.294921875, "learning_rate": 9.985023629744673e-05, "loss": 0.0928, "step": 393 }, { "epoch": 0.05066655235972395, "grad_norm": 0.30078125, "learning_rate": 9.984945361379888e-05, "loss": 0.0966, "step": 394 }, { "epoch": 0.05079514767028162, "grad_norm": 0.30859375, "learning_rate": 9.984866889336066e-05, "loss": 0.0991, "step": 395 }, { "epoch": 0.0509237429808393, "grad_norm": 0.3046875, "learning_rate": 9.984788213616409e-05, "loss": 0.1063, "step": 396 }, { "epoch": 0.05105233829139697, "grad_norm": 0.298828125, "learning_rate": 9.984709334224136e-05, "loss": 0.0969, "step": 397 }, { "epoch": 0.05118093360195465, "grad_norm": 0.328125, "learning_rate": 9.984630251162468e-05, "loss": 0.0985, "step": 398 }, { "epoch": 0.05130952891251232, "grad_norm": 0.283203125, "learning_rate": 9.984550964434634e-05, "loss": 0.0946, "step": 399 }, { "epoch": 0.05143812422307, "grad_norm": 0.318359375, "learning_rate": 9.984471474043876e-05, "loss": 0.1126, "step": 400 }, { "epoch": 0.051566719533627674, "grad_norm": 0.3046875, "learning_rate": 9.984391779993444e-05, "loss": 0.0954, "step": 401 }, { "epoch": 0.051695314844185346, "grad_norm": 0.298828125, "learning_rate": 9.98431188228659e-05, "loss": 0.0902, "step": 402 }, { "epoch": 0.051823910154743025, "grad_norm": 0.330078125, "learning_rate": 9.984231780926581e-05, "loss": 0.1161, "step": 403 }, { "epoch": 0.0519525054653007, "grad_norm": 0.306640625, "learning_rate": 9.984151475916687e-05, "loss": 0.1002, "step": 404 }, { "epoch": 0.052081100775858376, "grad_norm": 0.3125, "learning_rate": 9.984070967260194e-05, "loss": 0.1024, "step": 405 }, { "epoch": 0.05220969608641605, "grad_norm": 0.2890625, "learning_rate": 9.983990254960387e-05, "loss": 0.0914, "step": 406 }, { "epoch": 0.05233829139697373, "grad_norm": 0.31640625, "learning_rate": 9.983909339020567e-05, "loss": 0.1108, "step": 407 }, { "epoch": 0.0524668867075314, "grad_norm": 0.34375, "learning_rate": 9.983828219444038e-05, "loss": 0.0983, "step": 408 }, { "epoch": 0.05259548201808907, "grad_norm": 0.279296875, "learning_rate": 9.983746896234114e-05, "loss": 0.0998, "step": 409 }, { "epoch": 0.05272407732864675, "grad_norm": 0.341796875, "learning_rate": 9.983665369394119e-05, "loss": 0.1075, "step": 410 }, { "epoch": 0.05285267263920442, "grad_norm": 0.3125, "learning_rate": 9.983583638927386e-05, "loss": 0.1132, "step": 411 }, { "epoch": 0.0529812679497621, "grad_norm": 0.283203125, "learning_rate": 9.983501704837252e-05, "loss": 0.0802, "step": 412 }, { "epoch": 0.05310986326031977, "grad_norm": 0.326171875, "learning_rate": 9.983419567127064e-05, "loss": 0.0936, "step": 413 }, { "epoch": 0.05323845857087745, "grad_norm": 0.2890625, "learning_rate": 9.98333722580018e-05, "loss": 0.0986, "step": 414 }, { "epoch": 0.053367053881435124, "grad_norm": 0.30078125, "learning_rate": 9.983254680859962e-05, "loss": 0.0992, "step": 415 }, { "epoch": 0.053495649191992796, "grad_norm": 0.29296875, "learning_rate": 9.983171932309786e-05, "loss": 0.0916, "step": 416 }, { "epoch": 0.053624244502550475, "grad_norm": 0.291015625, "learning_rate": 9.98308898015303e-05, "loss": 0.1006, "step": 417 }, { "epoch": 0.05375283981310815, "grad_norm": 0.3515625, "learning_rate": 9.983005824393086e-05, "loss": 0.1049, "step": 418 }, { "epoch": 0.053881435123665826, "grad_norm": 0.302734375, "learning_rate": 9.98292246503335e-05, "loss": 0.0914, "step": 419 }, { "epoch": 0.0540100304342235, "grad_norm": 0.29296875, "learning_rate": 9.982838902077227e-05, "loss": 0.0908, "step": 420 }, { "epoch": 0.05413862574478118, "grad_norm": 0.34765625, "learning_rate": 9.982755135528134e-05, "loss": 0.1058, "step": 421 }, { "epoch": 0.05426722105533885, "grad_norm": 0.287109375, "learning_rate": 9.982671165389491e-05, "loss": 0.1015, "step": 422 }, { "epoch": 0.05439581636589652, "grad_norm": 0.265625, "learning_rate": 9.98258699166473e-05, "loss": 0.0942, "step": 423 }, { "epoch": 0.0545244116764542, "grad_norm": 0.26953125, "learning_rate": 9.982502614357292e-05, "loss": 0.0974, "step": 424 }, { "epoch": 0.05465300698701187, "grad_norm": 0.28125, "learning_rate": 9.98241803347062e-05, "loss": 0.1038, "step": 425 }, { "epoch": 0.05478160229756955, "grad_norm": 0.326171875, "learning_rate": 9.982333249008175e-05, "loss": 0.0956, "step": 426 }, { "epoch": 0.05491019760812722, "grad_norm": 0.302734375, "learning_rate": 9.982248260973418e-05, "loss": 0.1036, "step": 427 }, { "epoch": 0.0550387929186849, "grad_norm": 0.279296875, "learning_rate": 9.982163069369823e-05, "loss": 0.0956, "step": 428 }, { "epoch": 0.055167388229242574, "grad_norm": 0.29296875, "learning_rate": 9.982077674200869e-05, "loss": 0.0984, "step": 429 }, { "epoch": 0.055295983539800246, "grad_norm": 0.314453125, "learning_rate": 9.981992075470049e-05, "loss": 0.0961, "step": 430 }, { "epoch": 0.055424578850357925, "grad_norm": 0.3203125, "learning_rate": 9.981906273180855e-05, "loss": 0.0948, "step": 431 }, { "epoch": 0.0555531741609156, "grad_norm": 0.310546875, "learning_rate": 9.981820267336797e-05, "loss": 0.0963, "step": 432 }, { "epoch": 0.055681769471473276, "grad_norm": 0.28515625, "learning_rate": 9.981734057941386e-05, "loss": 0.0939, "step": 433 }, { "epoch": 0.05581036478203095, "grad_norm": 0.275390625, "learning_rate": 9.981647644998147e-05, "loss": 0.0838, "step": 434 }, { "epoch": 0.05593896009258862, "grad_norm": 0.330078125, "learning_rate": 9.981561028510611e-05, "loss": 0.1186, "step": 435 }, { "epoch": 0.0560675554031463, "grad_norm": 0.291015625, "learning_rate": 9.981474208482316e-05, "loss": 0.0874, "step": 436 }, { "epoch": 0.05619615071370397, "grad_norm": 0.27734375, "learning_rate": 9.981387184916808e-05, "loss": 0.0932, "step": 437 }, { "epoch": 0.05632474602426165, "grad_norm": 0.314453125, "learning_rate": 9.981299957817644e-05, "loss": 0.1016, "step": 438 }, { "epoch": 0.05645334133481932, "grad_norm": 0.30859375, "learning_rate": 9.981212527188389e-05, "loss": 0.1058, "step": 439 }, { "epoch": 0.056581936645377, "grad_norm": 0.296875, "learning_rate": 9.981124893032614e-05, "loss": 0.0915, "step": 440 }, { "epoch": 0.05671053195593467, "grad_norm": 0.30078125, "learning_rate": 9.981037055353899e-05, "loss": 0.0995, "step": 441 }, { "epoch": 0.056839127266492345, "grad_norm": 0.2734375, "learning_rate": 9.980949014155834e-05, "loss": 0.086, "step": 442 }, { "epoch": 0.056967722577050024, "grad_norm": 0.296875, "learning_rate": 9.980860769442018e-05, "loss": 0.0873, "step": 443 }, { "epoch": 0.057096317887607696, "grad_norm": 0.271484375, "learning_rate": 9.980772321216052e-05, "loss": 0.0826, "step": 444 }, { "epoch": 0.057224913198165375, "grad_norm": 0.28125, "learning_rate": 9.980683669481554e-05, "loss": 0.0911, "step": 445 }, { "epoch": 0.05735350850872305, "grad_norm": 0.345703125, "learning_rate": 9.980594814242145e-05, "loss": 0.1066, "step": 446 }, { "epoch": 0.057482103819280726, "grad_norm": 0.330078125, "learning_rate": 9.980505755501454e-05, "loss": 0.1097, "step": 447 }, { "epoch": 0.0576106991298384, "grad_norm": 0.365234375, "learning_rate": 9.980416493263123e-05, "loss": 0.1012, "step": 448 }, { "epoch": 0.05773929444039607, "grad_norm": 0.3203125, "learning_rate": 9.980327027530796e-05, "loss": 0.1027, "step": 449 }, { "epoch": 0.05786788975095375, "grad_norm": 0.345703125, "learning_rate": 9.98023735830813e-05, "loss": 0.098, "step": 450 }, { "epoch": 0.05799648506151142, "grad_norm": 0.314453125, "learning_rate": 9.980147485598788e-05, "loss": 0.1076, "step": 451 }, { "epoch": 0.0581250803720691, "grad_norm": 0.314453125, "learning_rate": 9.980057409406444e-05, "loss": 0.0896, "step": 452 }, { "epoch": 0.05825367568262677, "grad_norm": 0.28515625, "learning_rate": 9.979967129734776e-05, "loss": 0.1061, "step": 453 }, { "epoch": 0.05838227099318445, "grad_norm": 0.28515625, "learning_rate": 9.979876646587473e-05, "loss": 0.0937, "step": 454 }, { "epoch": 0.05851086630374212, "grad_norm": 0.287109375, "learning_rate": 9.979785959968235e-05, "loss": 0.092, "step": 455 }, { "epoch": 0.058639461614299795, "grad_norm": 0.32421875, "learning_rate": 9.979695069880763e-05, "loss": 0.1093, "step": 456 }, { "epoch": 0.058768056924857474, "grad_norm": 0.34375, "learning_rate": 9.979603976328775e-05, "loss": 0.1015, "step": 457 }, { "epoch": 0.058896652235415146, "grad_norm": 0.291015625, "learning_rate": 9.979512679315989e-05, "loss": 0.0982, "step": 458 }, { "epoch": 0.059025247545972825, "grad_norm": 0.28515625, "learning_rate": 9.979421178846138e-05, "loss": 0.0966, "step": 459 }, { "epoch": 0.0591538428565305, "grad_norm": 0.3125, "learning_rate": 9.979329474922958e-05, "loss": 0.1095, "step": 460 }, { "epoch": 0.059282438167088176, "grad_norm": 0.349609375, "learning_rate": 9.979237567550199e-05, "loss": 0.1002, "step": 461 }, { "epoch": 0.05941103347764585, "grad_norm": 0.3515625, "learning_rate": 9.979145456731616e-05, "loss": 0.1176, "step": 462 }, { "epoch": 0.05953962878820352, "grad_norm": 0.330078125, "learning_rate": 9.979053142470969e-05, "loss": 0.112, "step": 463 }, { "epoch": 0.0596682240987612, "grad_norm": 0.3046875, "learning_rate": 9.978960624772034e-05, "loss": 0.091, "step": 464 }, { "epoch": 0.05979681940931887, "grad_norm": 0.29296875, "learning_rate": 9.978867903638589e-05, "loss": 0.1004, "step": 465 }, { "epoch": 0.05992541471987655, "grad_norm": 0.359375, "learning_rate": 9.978774979074421e-05, "loss": 0.1129, "step": 466 }, { "epoch": 0.06005401003043422, "grad_norm": 0.326171875, "learning_rate": 9.978681851083331e-05, "loss": 0.0895, "step": 467 }, { "epoch": 0.0601826053409919, "grad_norm": 0.337890625, "learning_rate": 9.978588519669121e-05, "loss": 0.1073, "step": 468 }, { "epoch": 0.060311200651549574, "grad_norm": 0.318359375, "learning_rate": 9.978494984835605e-05, "loss": 0.1029, "step": 469 }, { "epoch": 0.060439795962107246, "grad_norm": 0.306640625, "learning_rate": 9.978401246586605e-05, "loss": 0.0832, "step": 470 }, { "epoch": 0.060568391272664925, "grad_norm": 0.3203125, "learning_rate": 9.978307304925952e-05, "loss": 0.1014, "step": 471 }, { "epoch": 0.0606969865832226, "grad_norm": 0.34765625, "learning_rate": 9.978213159857481e-05, "loss": 0.1112, "step": 472 }, { "epoch": 0.060825581893780276, "grad_norm": 0.298828125, "learning_rate": 9.978118811385042e-05, "loss": 0.1071, "step": 473 }, { "epoch": 0.06095417720433795, "grad_norm": 0.3125, "learning_rate": 9.978024259512488e-05, "loss": 0.0877, "step": 474 }, { "epoch": 0.06108277251489563, "grad_norm": 0.279296875, "learning_rate": 9.977929504243683e-05, "loss": 0.0785, "step": 475 }, { "epoch": 0.0612113678254533, "grad_norm": 0.28125, "learning_rate": 9.9778345455825e-05, "loss": 0.1002, "step": 476 }, { "epoch": 0.06133996313601097, "grad_norm": 0.27734375, "learning_rate": 9.977739383532818e-05, "loss": 0.0998, "step": 477 }, { "epoch": 0.06146855844656865, "grad_norm": 0.369140625, "learning_rate": 9.977644018098524e-05, "loss": 0.1074, "step": 478 }, { "epoch": 0.06159715375712632, "grad_norm": 0.318359375, "learning_rate": 9.977548449283517e-05, "loss": 0.107, "step": 479 }, { "epoch": 0.061725749067684, "grad_norm": 0.294921875, "learning_rate": 9.977452677091699e-05, "loss": 0.0903, "step": 480 }, { "epoch": 0.06185434437824167, "grad_norm": 0.31640625, "learning_rate": 9.977356701526983e-05, "loss": 0.0898, "step": 481 }, { "epoch": 0.06198293968879935, "grad_norm": 0.298828125, "learning_rate": 9.977260522593295e-05, "loss": 0.1001, "step": 482 }, { "epoch": 0.062111534999357024, "grad_norm": 0.36328125, "learning_rate": 9.97716414029456e-05, "loss": 0.1158, "step": 483 }, { "epoch": 0.062240130309914696, "grad_norm": 0.283203125, "learning_rate": 9.977067554634716e-05, "loss": 0.0916, "step": 484 }, { "epoch": 0.062368725620472375, "grad_norm": 0.283203125, "learning_rate": 9.976970765617712e-05, "loss": 0.105, "step": 485 }, { "epoch": 0.06249732093103005, "grad_norm": 0.31640625, "learning_rate": 9.976873773247501e-05, "loss": 0.0911, "step": 486 }, { "epoch": 0.06262591624158773, "grad_norm": 0.3203125, "learning_rate": 9.976776577528049e-05, "loss": 0.0855, "step": 487 }, { "epoch": 0.0627545115521454, "grad_norm": 0.298828125, "learning_rate": 9.976679178463323e-05, "loss": 0.1002, "step": 488 }, { "epoch": 0.06288310686270307, "grad_norm": 0.306640625, "learning_rate": 9.976581576057306e-05, "loss": 0.0893, "step": 489 }, { "epoch": 0.06301170217326074, "grad_norm": 0.30859375, "learning_rate": 9.976483770313982e-05, "loss": 0.0891, "step": 490 }, { "epoch": 0.06314029748381843, "grad_norm": 0.29296875, "learning_rate": 9.976385761237352e-05, "loss": 0.093, "step": 491 }, { "epoch": 0.0632688927943761, "grad_norm": 0.310546875, "learning_rate": 9.976287548831418e-05, "loss": 0.1046, "step": 492 }, { "epoch": 0.06339748810493377, "grad_norm": 0.32421875, "learning_rate": 9.976189133100194e-05, "loss": 0.0954, "step": 493 }, { "epoch": 0.06352608341549144, "grad_norm": 0.28125, "learning_rate": 9.976090514047698e-05, "loss": 0.0986, "step": 494 }, { "epoch": 0.06365467872604913, "grad_norm": 0.26171875, "learning_rate": 9.975991691677964e-05, "loss": 0.0831, "step": 495 }, { "epoch": 0.0637832740366068, "grad_norm": 0.279296875, "learning_rate": 9.975892665995026e-05, "loss": 0.1016, "step": 496 }, { "epoch": 0.06391186934716447, "grad_norm": 0.283203125, "learning_rate": 9.975793437002932e-05, "loss": 0.0806, "step": 497 }, { "epoch": 0.06404046465772215, "grad_norm": 0.32421875, "learning_rate": 9.975694004705735e-05, "loss": 0.0982, "step": 498 }, { "epoch": 0.06416905996827982, "grad_norm": 0.27734375, "learning_rate": 9.9755943691075e-05, "loss": 0.0942, "step": 499 }, { "epoch": 0.0642976552788375, "grad_norm": 0.29296875, "learning_rate": 9.975494530212298e-05, "loss": 0.0825, "step": 500 }, { "epoch": 0.0642976552788375, "eval_loss": 0.09297225624322891, "eval_runtime": 1043.0149, "eval_samples_per_second": 94.175, "eval_steps_per_second": 1.177, "step": 500 }, { "epoch": 0.06442625058939518, "grad_norm": 0.291015625, "learning_rate": 9.975394488024203e-05, "loss": 0.0921, "step": 501 }, { "epoch": 0.06455484589995285, "grad_norm": 0.283203125, "learning_rate": 9.97529424254731e-05, "loss": 0.1094, "step": 502 }, { "epoch": 0.06468344121051052, "grad_norm": 0.2890625, "learning_rate": 9.97519379378571e-05, "loss": 0.093, "step": 503 }, { "epoch": 0.06481203652106819, "grad_norm": 0.27734375, "learning_rate": 9.97509314174351e-05, "loss": 0.0845, "step": 504 }, { "epoch": 0.06494063183162588, "grad_norm": 0.255859375, "learning_rate": 9.974992286424819e-05, "loss": 0.0747, "step": 505 }, { "epoch": 0.06506922714218355, "grad_norm": 0.259765625, "learning_rate": 9.974891227833761e-05, "loss": 0.0826, "step": 506 }, { "epoch": 0.06519782245274122, "grad_norm": 0.26953125, "learning_rate": 9.974789965974464e-05, "loss": 0.0909, "step": 507 }, { "epoch": 0.0653264177632989, "grad_norm": 0.291015625, "learning_rate": 9.974688500851067e-05, "loss": 0.0918, "step": 508 }, { "epoch": 0.06545501307385658, "grad_norm": 0.2890625, "learning_rate": 9.974586832467712e-05, "loss": 0.0903, "step": 509 }, { "epoch": 0.06558360838441425, "grad_norm": 0.28515625, "learning_rate": 9.974484960828558e-05, "loss": 0.085, "step": 510 }, { "epoch": 0.06571220369497192, "grad_norm": 0.365234375, "learning_rate": 9.974382885937763e-05, "loss": 0.1074, "step": 511 }, { "epoch": 0.0658407990055296, "grad_norm": 0.25390625, "learning_rate": 9.9742806077995e-05, "loss": 0.0818, "step": 512 }, { "epoch": 0.06596939431608727, "grad_norm": 0.298828125, "learning_rate": 9.974178126417948e-05, "loss": 0.0935, "step": 513 }, { "epoch": 0.06609798962664495, "grad_norm": 0.2890625, "learning_rate": 9.974075441797292e-05, "loss": 0.087, "step": 514 }, { "epoch": 0.06622658493720263, "grad_norm": 0.2890625, "learning_rate": 9.973972553941731e-05, "loss": 0.0923, "step": 515 }, { "epoch": 0.0663551802477603, "grad_norm": 0.291015625, "learning_rate": 9.973869462855465e-05, "loss": 0.0879, "step": 516 }, { "epoch": 0.06648377555831797, "grad_norm": 0.298828125, "learning_rate": 9.973766168542711e-05, "loss": 0.0892, "step": 517 }, { "epoch": 0.06661237086887564, "grad_norm": 0.322265625, "learning_rate": 9.973662671007684e-05, "loss": 0.099, "step": 518 }, { "epoch": 0.06674096617943333, "grad_norm": 0.26171875, "learning_rate": 9.973558970254617e-05, "loss": 0.0829, "step": 519 }, { "epoch": 0.066869561489991, "grad_norm": 0.3125, "learning_rate": 9.973455066287747e-05, "loss": 0.1044, "step": 520 }, { "epoch": 0.06699815680054867, "grad_norm": 0.30859375, "learning_rate": 9.973350959111316e-05, "loss": 0.0961, "step": 521 }, { "epoch": 0.06712675211110634, "grad_norm": 0.29296875, "learning_rate": 9.97324664872958e-05, "loss": 0.0933, "step": 522 }, { "epoch": 0.06725534742166403, "grad_norm": 0.283203125, "learning_rate": 9.973142135146802e-05, "loss": 0.0873, "step": 523 }, { "epoch": 0.0673839427322217, "grad_norm": 0.259765625, "learning_rate": 9.97303741836725e-05, "loss": 0.0863, "step": 524 }, { "epoch": 0.06751253804277937, "grad_norm": 0.28125, "learning_rate": 9.972932498395204e-05, "loss": 0.0951, "step": 525 }, { "epoch": 0.06764113335333705, "grad_norm": 0.3125, "learning_rate": 9.97282737523495e-05, "loss": 0.1012, "step": 526 }, { "epoch": 0.06776972866389472, "grad_norm": 0.267578125, "learning_rate": 9.972722048890784e-05, "loss": 0.0907, "step": 527 }, { "epoch": 0.0678983239744524, "grad_norm": 0.2451171875, "learning_rate": 9.97261651936701e-05, "loss": 0.0878, "step": 528 }, { "epoch": 0.06802691928501008, "grad_norm": 0.306640625, "learning_rate": 9.972510786667939e-05, "loss": 0.0991, "step": 529 }, { "epoch": 0.06815551459556775, "grad_norm": 0.26171875, "learning_rate": 9.972404850797892e-05, "loss": 0.0814, "step": 530 }, { "epoch": 0.06828410990612542, "grad_norm": 0.2734375, "learning_rate": 9.972298711761194e-05, "loss": 0.0893, "step": 531 }, { "epoch": 0.06841270521668309, "grad_norm": 0.283203125, "learning_rate": 9.972192369562187e-05, "loss": 0.095, "step": 532 }, { "epoch": 0.06854130052724078, "grad_norm": 0.28515625, "learning_rate": 9.972085824205212e-05, "loss": 0.0968, "step": 533 }, { "epoch": 0.06866989583779845, "grad_norm": 0.2890625, "learning_rate": 9.971979075694624e-05, "loss": 0.0983, "step": 534 }, { "epoch": 0.06879849114835612, "grad_norm": 0.294921875, "learning_rate": 9.971872124034785e-05, "loss": 0.0948, "step": 535 }, { "epoch": 0.0689270864589138, "grad_norm": 0.283203125, "learning_rate": 9.971764969230062e-05, "loss": 0.0975, "step": 536 }, { "epoch": 0.06905568176947147, "grad_norm": 0.296875, "learning_rate": 9.971657611284836e-05, "loss": 0.0803, "step": 537 }, { "epoch": 0.06918427708002915, "grad_norm": 0.291015625, "learning_rate": 9.971550050203495e-05, "loss": 0.0958, "step": 538 }, { "epoch": 0.06931287239058682, "grad_norm": 0.236328125, "learning_rate": 9.97144228599043e-05, "loss": 0.0758, "step": 539 }, { "epoch": 0.0694414677011445, "grad_norm": 0.271484375, "learning_rate": 9.971334318650047e-05, "loss": 0.085, "step": 540 }, { "epoch": 0.06957006301170217, "grad_norm": 0.283203125, "learning_rate": 9.971226148186757e-05, "loss": 0.1047, "step": 541 }, { "epoch": 0.06969865832225985, "grad_norm": 0.228515625, "learning_rate": 9.971117774604977e-05, "loss": 0.0772, "step": 542 }, { "epoch": 0.06982725363281753, "grad_norm": 0.28515625, "learning_rate": 9.971009197909138e-05, "loss": 0.0908, "step": 543 }, { "epoch": 0.0699558489433752, "grad_norm": 0.302734375, "learning_rate": 9.970900418103675e-05, "loss": 0.0901, "step": 544 }, { "epoch": 0.07008444425393287, "grad_norm": 0.265625, "learning_rate": 9.970791435193034e-05, "loss": 0.0795, "step": 545 }, { "epoch": 0.07021303956449054, "grad_norm": 0.26171875, "learning_rate": 9.970682249181666e-05, "loss": 0.0832, "step": 546 }, { "epoch": 0.07034163487504823, "grad_norm": 0.27734375, "learning_rate": 9.970572860074034e-05, "loss": 0.0804, "step": 547 }, { "epoch": 0.0704702301856059, "grad_norm": 0.267578125, "learning_rate": 9.970463267874606e-05, "loss": 0.0888, "step": 548 }, { "epoch": 0.07059882549616357, "grad_norm": 0.30078125, "learning_rate": 9.97035347258786e-05, "loss": 0.11, "step": 549 }, { "epoch": 0.07072742080672124, "grad_norm": 0.28515625, "learning_rate": 9.970243474218282e-05, "loss": 0.0846, "step": 550 }, { "epoch": 0.07085601611727892, "grad_norm": 0.2890625, "learning_rate": 9.970133272770369e-05, "loss": 0.0818, "step": 551 }, { "epoch": 0.0709846114278366, "grad_norm": 0.2734375, "learning_rate": 9.970022868248621e-05, "loss": 0.0921, "step": 552 }, { "epoch": 0.07111320673839427, "grad_norm": 0.2734375, "learning_rate": 9.96991226065755e-05, "loss": 0.0885, "step": 553 }, { "epoch": 0.07124180204895195, "grad_norm": 0.328125, "learning_rate": 9.969801450001673e-05, "loss": 0.0863, "step": 554 }, { "epoch": 0.07137039735950962, "grad_norm": 0.2734375, "learning_rate": 9.969690436285521e-05, "loss": 0.0854, "step": 555 }, { "epoch": 0.0714989926700673, "grad_norm": 0.291015625, "learning_rate": 9.969579219513627e-05, "loss": 0.1045, "step": 556 }, { "epoch": 0.07162758798062498, "grad_norm": 0.28515625, "learning_rate": 9.969467799690539e-05, "loss": 0.0946, "step": 557 }, { "epoch": 0.07175618329118265, "grad_norm": 0.310546875, "learning_rate": 9.969356176820805e-05, "loss": 0.1259, "step": 558 }, { "epoch": 0.07188477860174032, "grad_norm": 0.2490234375, "learning_rate": 9.96924435090899e-05, "loss": 0.0865, "step": 559 }, { "epoch": 0.07201337391229799, "grad_norm": 0.3046875, "learning_rate": 9.96913232195966e-05, "loss": 0.1026, "step": 560 }, { "epoch": 0.07214196922285568, "grad_norm": 0.306640625, "learning_rate": 9.969020089977392e-05, "loss": 0.1109, "step": 561 }, { "epoch": 0.07227056453341335, "grad_norm": 0.30078125, "learning_rate": 9.968907654966774e-05, "loss": 0.095, "step": 562 }, { "epoch": 0.07239915984397102, "grad_norm": 0.2890625, "learning_rate": 9.9687950169324e-05, "loss": 0.0823, "step": 563 }, { "epoch": 0.0725277551545287, "grad_norm": 0.27734375, "learning_rate": 9.96868217587887e-05, "loss": 0.0873, "step": 564 }, { "epoch": 0.07265635046508637, "grad_norm": 0.271484375, "learning_rate": 9.968569131810796e-05, "loss": 0.0894, "step": 565 }, { "epoch": 0.07278494577564405, "grad_norm": 0.26953125, "learning_rate": 9.968455884732797e-05, "loss": 0.1013, "step": 566 }, { "epoch": 0.07291354108620172, "grad_norm": 0.283203125, "learning_rate": 9.9683424346495e-05, "loss": 0.0902, "step": 567 }, { "epoch": 0.0730421363967594, "grad_norm": 0.26953125, "learning_rate": 9.968228781565538e-05, "loss": 0.0933, "step": 568 }, { "epoch": 0.07317073170731707, "grad_norm": 0.255859375, "learning_rate": 9.968114925485558e-05, "loss": 0.0772, "step": 569 }, { "epoch": 0.07329932701787475, "grad_norm": 0.267578125, "learning_rate": 9.968000866414214e-05, "loss": 0.0922, "step": 570 }, { "epoch": 0.07342792232843243, "grad_norm": 0.28125, "learning_rate": 9.96788660435616e-05, "loss": 0.0769, "step": 571 }, { "epoch": 0.0735565176389901, "grad_norm": 0.29296875, "learning_rate": 9.967772139316069e-05, "loss": 0.0972, "step": 572 }, { "epoch": 0.07368511294954777, "grad_norm": 0.267578125, "learning_rate": 9.967657471298617e-05, "loss": 0.0984, "step": 573 }, { "epoch": 0.07381370826010544, "grad_norm": 0.28125, "learning_rate": 9.967542600308488e-05, "loss": 0.0844, "step": 574 }, { "epoch": 0.07394230357066313, "grad_norm": 0.30078125, "learning_rate": 9.967427526350377e-05, "loss": 0.0958, "step": 575 }, { "epoch": 0.0740708988812208, "grad_norm": 0.275390625, "learning_rate": 9.967312249428986e-05, "loss": 0.0876, "step": 576 }, { "epoch": 0.07419949419177847, "grad_norm": 0.259765625, "learning_rate": 9.967196769549023e-05, "loss": 0.0784, "step": 577 }, { "epoch": 0.07432808950233614, "grad_norm": 0.26953125, "learning_rate": 9.967081086715208e-05, "loss": 0.0786, "step": 578 }, { "epoch": 0.07445668481289382, "grad_norm": 0.255859375, "learning_rate": 9.966965200932268e-05, "loss": 0.0899, "step": 579 }, { "epoch": 0.0745852801234515, "grad_norm": 0.251953125, "learning_rate": 9.966849112204938e-05, "loss": 0.0766, "step": 580 }, { "epoch": 0.07471387543400917, "grad_norm": 0.296875, "learning_rate": 9.96673282053796e-05, "loss": 0.088, "step": 581 }, { "epoch": 0.07484247074456685, "grad_norm": 0.2734375, "learning_rate": 9.966616325936087e-05, "loss": 0.0924, "step": 582 }, { "epoch": 0.07497106605512452, "grad_norm": 0.283203125, "learning_rate": 9.966499628404075e-05, "loss": 0.0991, "step": 583 }, { "epoch": 0.0750996613656822, "grad_norm": 0.33984375, "learning_rate": 9.966382727946699e-05, "loss": 0.1013, "step": 584 }, { "epoch": 0.07522825667623988, "grad_norm": 0.271484375, "learning_rate": 9.966265624568729e-05, "loss": 0.0954, "step": 585 }, { "epoch": 0.07535685198679755, "grad_norm": 0.2734375, "learning_rate": 9.966148318274953e-05, "loss": 0.0811, "step": 586 }, { "epoch": 0.07548544729735522, "grad_norm": 0.26171875, "learning_rate": 9.966030809070164e-05, "loss": 0.0851, "step": 587 }, { "epoch": 0.07561404260791289, "grad_norm": 0.26953125, "learning_rate": 9.96591309695916e-05, "loss": 0.0853, "step": 588 }, { "epoch": 0.07574263791847058, "grad_norm": 0.291015625, "learning_rate": 9.965795181946756e-05, "loss": 0.0955, "step": 589 }, { "epoch": 0.07587123322902825, "grad_norm": 0.265625, "learning_rate": 9.965677064037765e-05, "loss": 0.0947, "step": 590 }, { "epoch": 0.07599982853958592, "grad_norm": 0.255859375, "learning_rate": 9.965558743237016e-05, "loss": 0.073, "step": 591 }, { "epoch": 0.0761284238501436, "grad_norm": 0.255859375, "learning_rate": 9.965440219549343e-05, "loss": 0.0877, "step": 592 }, { "epoch": 0.07625701916070127, "grad_norm": 0.263671875, "learning_rate": 9.965321492979587e-05, "loss": 0.0843, "step": 593 }, { "epoch": 0.07638561447125895, "grad_norm": 0.3125, "learning_rate": 9.965202563532602e-05, "loss": 0.0831, "step": 594 }, { "epoch": 0.07651420978181663, "grad_norm": 0.294921875, "learning_rate": 9.965083431213245e-05, "loss": 0.0776, "step": 595 }, { "epoch": 0.0766428050923743, "grad_norm": 0.3046875, "learning_rate": 9.964964096026384e-05, "loss": 0.0972, "step": 596 }, { "epoch": 0.07677140040293197, "grad_norm": 0.263671875, "learning_rate": 9.964844557976896e-05, "loss": 0.0825, "step": 597 }, { "epoch": 0.07689999571348966, "grad_norm": 0.298828125, "learning_rate": 9.964724817069662e-05, "loss": 0.0879, "step": 598 }, { "epoch": 0.07702859102404733, "grad_norm": 0.26171875, "learning_rate": 9.96460487330958e-05, "loss": 0.078, "step": 599 }, { "epoch": 0.077157186334605, "grad_norm": 0.259765625, "learning_rate": 9.964484726701545e-05, "loss": 0.0817, "step": 600 }, { "epoch": 0.07728578164516267, "grad_norm": 0.314453125, "learning_rate": 9.96436437725047e-05, "loss": 0.1086, "step": 601 }, { "epoch": 0.07741437695572034, "grad_norm": 0.279296875, "learning_rate": 9.96424382496127e-05, "loss": 0.0921, "step": 602 }, { "epoch": 0.07754297226627803, "grad_norm": 0.25390625, "learning_rate": 9.964123069838873e-05, "loss": 0.0792, "step": 603 }, { "epoch": 0.0776715675768357, "grad_norm": 0.28125, "learning_rate": 9.96400211188821e-05, "loss": 0.0809, "step": 604 }, { "epoch": 0.07780016288739337, "grad_norm": 0.28515625, "learning_rate": 9.963880951114227e-05, "loss": 0.0887, "step": 605 }, { "epoch": 0.07792875819795105, "grad_norm": 0.2734375, "learning_rate": 9.963759587521871e-05, "loss": 0.0876, "step": 606 }, { "epoch": 0.07805735350850872, "grad_norm": 0.283203125, "learning_rate": 9.963638021116101e-05, "loss": 0.0823, "step": 607 }, { "epoch": 0.0781859488190664, "grad_norm": 0.2421875, "learning_rate": 9.963516251901885e-05, "loss": 0.0772, "step": 608 }, { "epoch": 0.07831454412962408, "grad_norm": 0.2734375, "learning_rate": 9.9633942798842e-05, "loss": 0.097, "step": 609 }, { "epoch": 0.07844313944018175, "grad_norm": 0.28125, "learning_rate": 9.963272105068026e-05, "loss": 0.0871, "step": 610 }, { "epoch": 0.07857173475073942, "grad_norm": 0.302734375, "learning_rate": 9.963149727458358e-05, "loss": 0.0797, "step": 611 }, { "epoch": 0.07870033006129709, "grad_norm": 0.26171875, "learning_rate": 9.963027147060196e-05, "loss": 0.0842, "step": 612 }, { "epoch": 0.07882892537185478, "grad_norm": 0.296875, "learning_rate": 9.962904363878547e-05, "loss": 0.0976, "step": 613 }, { "epoch": 0.07895752068241245, "grad_norm": 0.25390625, "learning_rate": 9.962781377918428e-05, "loss": 0.0785, "step": 614 }, { "epoch": 0.07908611599297012, "grad_norm": 0.291015625, "learning_rate": 9.962658189184865e-05, "loss": 0.084, "step": 615 }, { "epoch": 0.0792147113035278, "grad_norm": 0.259765625, "learning_rate": 9.96253479768289e-05, "loss": 0.0819, "step": 616 }, { "epoch": 0.07934330661408548, "grad_norm": 0.287109375, "learning_rate": 9.962411203417548e-05, "loss": 0.0907, "step": 617 }, { "epoch": 0.07947190192464315, "grad_norm": 0.33984375, "learning_rate": 9.962287406393884e-05, "loss": 0.1012, "step": 618 }, { "epoch": 0.07960049723520082, "grad_norm": 0.33984375, "learning_rate": 9.962163406616958e-05, "loss": 0.0875, "step": 619 }, { "epoch": 0.0797290925457585, "grad_norm": 0.275390625, "learning_rate": 9.962039204091839e-05, "loss": 0.086, "step": 620 }, { "epoch": 0.07985768785631617, "grad_norm": 0.271484375, "learning_rate": 9.961914798823599e-05, "loss": 0.0921, "step": 621 }, { "epoch": 0.07998628316687385, "grad_norm": 0.3125, "learning_rate": 9.961790190817322e-05, "loss": 0.0859, "step": 622 }, { "epoch": 0.08011487847743153, "grad_norm": 0.28515625, "learning_rate": 9.961665380078097e-05, "loss": 0.0901, "step": 623 }, { "epoch": 0.0802434737879892, "grad_norm": 0.2734375, "learning_rate": 9.961540366611029e-05, "loss": 0.0888, "step": 624 }, { "epoch": 0.08037206909854687, "grad_norm": 0.3125, "learning_rate": 9.961415150421222e-05, "loss": 0.0961, "step": 625 }, { "epoch": 0.08050066440910454, "grad_norm": 0.2890625, "learning_rate": 9.961289731513793e-05, "loss": 0.0891, "step": 626 }, { "epoch": 0.08062925971966223, "grad_norm": 0.26171875, "learning_rate": 9.961164109893866e-05, "loss": 0.0806, "step": 627 }, { "epoch": 0.0807578550302199, "grad_norm": 0.2490234375, "learning_rate": 9.961038285566574e-05, "loss": 0.0795, "step": 628 }, { "epoch": 0.08088645034077757, "grad_norm": 0.259765625, "learning_rate": 9.960912258537058e-05, "loss": 0.076, "step": 629 }, { "epoch": 0.08101504565133524, "grad_norm": 0.265625, "learning_rate": 9.96078602881047e-05, "loss": 0.0821, "step": 630 }, { "epoch": 0.08114364096189293, "grad_norm": 0.263671875, "learning_rate": 9.960659596391961e-05, "loss": 0.0891, "step": 631 }, { "epoch": 0.0812722362724506, "grad_norm": 0.251953125, "learning_rate": 9.960532961286704e-05, "loss": 0.0741, "step": 632 }, { "epoch": 0.08140083158300827, "grad_norm": 0.2490234375, "learning_rate": 9.96040612349987e-05, "loss": 0.0764, "step": 633 }, { "epoch": 0.08152942689356595, "grad_norm": 0.25, "learning_rate": 9.96027908303664e-05, "loss": 0.0752, "step": 634 }, { "epoch": 0.08165802220412362, "grad_norm": 0.26953125, "learning_rate": 9.960151839902207e-05, "loss": 0.0857, "step": 635 }, { "epoch": 0.0817866175146813, "grad_norm": 0.27734375, "learning_rate": 9.960024394101769e-05, "loss": 0.0813, "step": 636 }, { "epoch": 0.08191521282523898, "grad_norm": 0.251953125, "learning_rate": 9.959896745640534e-05, "loss": 0.0824, "step": 637 }, { "epoch": 0.08204380813579665, "grad_norm": 0.2578125, "learning_rate": 9.959768894523718e-05, "loss": 0.0712, "step": 638 }, { "epoch": 0.08217240344635432, "grad_norm": 0.259765625, "learning_rate": 9.959640840756543e-05, "loss": 0.0762, "step": 639 }, { "epoch": 0.08230099875691199, "grad_norm": 0.2734375, "learning_rate": 9.959512584344242e-05, "loss": 0.0709, "step": 640 }, { "epoch": 0.08242959406746968, "grad_norm": 0.2890625, "learning_rate": 9.959384125292058e-05, "loss": 0.0902, "step": 641 }, { "epoch": 0.08255818937802735, "grad_norm": 0.318359375, "learning_rate": 9.959255463605234e-05, "loss": 0.0854, "step": 642 }, { "epoch": 0.08268678468858502, "grad_norm": 0.2578125, "learning_rate": 9.95912659928903e-05, "loss": 0.0766, "step": 643 }, { "epoch": 0.0828153799991427, "grad_norm": 0.26171875, "learning_rate": 9.958997532348713e-05, "loss": 0.0777, "step": 644 }, { "epoch": 0.08294397530970038, "grad_norm": 0.271484375, "learning_rate": 9.958868262789556e-05, "loss": 0.098, "step": 645 }, { "epoch": 0.08307257062025805, "grad_norm": 0.2470703125, "learning_rate": 9.958738790616839e-05, "loss": 0.0756, "step": 646 }, { "epoch": 0.08320116593081572, "grad_norm": 0.263671875, "learning_rate": 9.958609115835854e-05, "loss": 0.0894, "step": 647 }, { "epoch": 0.0833297612413734, "grad_norm": 0.259765625, "learning_rate": 9.958479238451895e-05, "loss": 0.0756, "step": 648 }, { "epoch": 0.08345835655193107, "grad_norm": 0.267578125, "learning_rate": 9.958349158470275e-05, "loss": 0.079, "step": 649 }, { "epoch": 0.08358695186248875, "grad_norm": 0.26171875, "learning_rate": 9.958218875896303e-05, "loss": 0.0852, "step": 650 }, { "epoch": 0.08371554717304643, "grad_norm": 0.2490234375, "learning_rate": 9.958088390735308e-05, "loss": 0.0841, "step": 651 }, { "epoch": 0.0838441424836041, "grad_norm": 0.2431640625, "learning_rate": 9.957957702992615e-05, "loss": 0.0671, "step": 652 }, { "epoch": 0.08397273779416177, "grad_norm": 0.30078125, "learning_rate": 9.95782681267357e-05, "loss": 0.0872, "step": 653 }, { "epoch": 0.08410133310471944, "grad_norm": 0.29296875, "learning_rate": 9.957695719783518e-05, "loss": 0.0819, "step": 654 }, { "epoch": 0.08422992841527713, "grad_norm": 0.28125, "learning_rate": 9.957564424327814e-05, "loss": 0.0822, "step": 655 }, { "epoch": 0.0843585237258348, "grad_norm": 0.26171875, "learning_rate": 9.957432926311825e-05, "loss": 0.0784, "step": 656 }, { "epoch": 0.08448711903639247, "grad_norm": 0.328125, "learning_rate": 9.957301225740923e-05, "loss": 0.0913, "step": 657 }, { "epoch": 0.08461571434695014, "grad_norm": 0.287109375, "learning_rate": 9.95716932262049e-05, "loss": 0.0849, "step": 658 }, { "epoch": 0.08474430965750783, "grad_norm": 0.251953125, "learning_rate": 9.957037216955914e-05, "loss": 0.0836, "step": 659 }, { "epoch": 0.0848729049680655, "grad_norm": 0.267578125, "learning_rate": 9.956904908752591e-05, "loss": 0.0862, "step": 660 }, { "epoch": 0.08500150027862317, "grad_norm": 0.2734375, "learning_rate": 9.956772398015933e-05, "loss": 0.0871, "step": 661 }, { "epoch": 0.08513009558918085, "grad_norm": 0.244140625, "learning_rate": 9.956639684751347e-05, "loss": 0.0807, "step": 662 }, { "epoch": 0.08525869089973852, "grad_norm": 0.248046875, "learning_rate": 9.95650676896426e-05, "loss": 0.0827, "step": 663 }, { "epoch": 0.0853872862102962, "grad_norm": 0.248046875, "learning_rate": 9.956373650660102e-05, "loss": 0.0723, "step": 664 }, { "epoch": 0.08551588152085388, "grad_norm": 0.29296875, "learning_rate": 9.956240329844311e-05, "loss": 0.087, "step": 665 }, { "epoch": 0.08564447683141155, "grad_norm": 0.271484375, "learning_rate": 9.956106806522336e-05, "loss": 0.0799, "step": 666 }, { "epoch": 0.08577307214196922, "grad_norm": 0.251953125, "learning_rate": 9.955973080699631e-05, "loss": 0.0883, "step": 667 }, { "epoch": 0.08590166745252689, "grad_norm": 0.275390625, "learning_rate": 9.955839152381661e-05, "loss": 0.0765, "step": 668 }, { "epoch": 0.08603026276308458, "grad_norm": 0.322265625, "learning_rate": 9.955705021573898e-05, "loss": 0.0937, "step": 669 }, { "epoch": 0.08615885807364225, "grad_norm": 0.28125, "learning_rate": 9.955570688281821e-05, "loss": 0.0797, "step": 670 }, { "epoch": 0.08628745338419992, "grad_norm": 0.271484375, "learning_rate": 9.955436152510921e-05, "loss": 0.0809, "step": 671 }, { "epoch": 0.0864160486947576, "grad_norm": 0.2578125, "learning_rate": 9.955301414266694e-05, "loss": 0.0894, "step": 672 }, { "epoch": 0.08654464400531527, "grad_norm": 0.25390625, "learning_rate": 9.955166473554643e-05, "loss": 0.0782, "step": 673 }, { "epoch": 0.08667323931587295, "grad_norm": 0.236328125, "learning_rate": 9.955031330380284e-05, "loss": 0.0702, "step": 674 }, { "epoch": 0.08680183462643062, "grad_norm": 0.2734375, "learning_rate": 9.95489598474914e-05, "loss": 0.0837, "step": 675 }, { "epoch": 0.0869304299369883, "grad_norm": 0.2578125, "learning_rate": 9.954760436666739e-05, "loss": 0.0873, "step": 676 }, { "epoch": 0.08705902524754597, "grad_norm": 0.265625, "learning_rate": 9.954624686138619e-05, "loss": 0.0767, "step": 677 }, { "epoch": 0.08718762055810365, "grad_norm": 0.2890625, "learning_rate": 9.954488733170328e-05, "loss": 0.0919, "step": 678 }, { "epoch": 0.08731621586866133, "grad_norm": 0.279296875, "learning_rate": 9.954352577767421e-05, "loss": 0.0832, "step": 679 }, { "epoch": 0.087444811179219, "grad_norm": 0.251953125, "learning_rate": 9.954216219935459e-05, "loss": 0.0822, "step": 680 }, { "epoch": 0.08757340648977667, "grad_norm": 0.26953125, "learning_rate": 9.954079659680016e-05, "loss": 0.0801, "step": 681 }, { "epoch": 0.08770200180033434, "grad_norm": 0.279296875, "learning_rate": 9.95394289700667e-05, "loss": 0.0836, "step": 682 }, { "epoch": 0.08783059711089203, "grad_norm": 0.310546875, "learning_rate": 9.953805931921007e-05, "loss": 0.0878, "step": 683 }, { "epoch": 0.0879591924214497, "grad_norm": 0.2734375, "learning_rate": 9.95366876442863e-05, "loss": 0.0754, "step": 684 }, { "epoch": 0.08808778773200737, "grad_norm": 0.29296875, "learning_rate": 9.953531394535137e-05, "loss": 0.09, "step": 685 }, { "epoch": 0.08821638304256504, "grad_norm": 0.255859375, "learning_rate": 9.953393822246142e-05, "loss": 0.0889, "step": 686 }, { "epoch": 0.08834497835312272, "grad_norm": 0.26171875, "learning_rate": 9.95325604756727e-05, "loss": 0.0879, "step": 687 }, { "epoch": 0.0884735736636804, "grad_norm": 0.314453125, "learning_rate": 9.953118070504143e-05, "loss": 0.0821, "step": 688 }, { "epoch": 0.08860216897423807, "grad_norm": 0.2470703125, "learning_rate": 9.952979891062407e-05, "loss": 0.0711, "step": 689 }, { "epoch": 0.08873076428479575, "grad_norm": 0.2890625, "learning_rate": 9.952841509247702e-05, "loss": 0.0896, "step": 690 }, { "epoch": 0.08885935959535342, "grad_norm": 0.265625, "learning_rate": 9.952702925065684e-05, "loss": 0.0887, "step": 691 }, { "epoch": 0.0889879549059111, "grad_norm": 0.330078125, "learning_rate": 9.952564138522016e-05, "loss": 0.0912, "step": 692 }, { "epoch": 0.08911655021646878, "grad_norm": 0.28125, "learning_rate": 9.952425149622367e-05, "loss": 0.0859, "step": 693 }, { "epoch": 0.08924514552702645, "grad_norm": 0.232421875, "learning_rate": 9.952285958372419e-05, "loss": 0.0773, "step": 694 }, { "epoch": 0.08937374083758412, "grad_norm": 0.298828125, "learning_rate": 9.952146564777854e-05, "loss": 0.0825, "step": 695 }, { "epoch": 0.08950233614814179, "grad_norm": 0.33984375, "learning_rate": 9.952006968844372e-05, "loss": 0.0842, "step": 696 }, { "epoch": 0.08963093145869948, "grad_norm": 0.318359375, "learning_rate": 9.951867170577675e-05, "loss": 0.092, "step": 697 }, { "epoch": 0.08975952676925715, "grad_norm": 0.25, "learning_rate": 9.951727169983475e-05, "loss": 0.0722, "step": 698 }, { "epoch": 0.08988812207981482, "grad_norm": 0.32421875, "learning_rate": 9.951586967067494e-05, "loss": 0.0928, "step": 699 }, { "epoch": 0.0900167173903725, "grad_norm": 0.328125, "learning_rate": 9.951446561835456e-05, "loss": 0.085, "step": 700 }, { "epoch": 0.09014531270093017, "grad_norm": 0.291015625, "learning_rate": 9.951305954293105e-05, "loss": 0.0892, "step": 701 }, { "epoch": 0.09027390801148785, "grad_norm": 0.26953125, "learning_rate": 9.951165144446177e-05, "loss": 0.0859, "step": 702 }, { "epoch": 0.09040250332204552, "grad_norm": 0.28125, "learning_rate": 9.951024132300433e-05, "loss": 0.0792, "step": 703 }, { "epoch": 0.0905310986326032, "grad_norm": 0.26953125, "learning_rate": 9.950882917861632e-05, "loss": 0.0739, "step": 704 }, { "epoch": 0.09065969394316087, "grad_norm": 0.26171875, "learning_rate": 9.950741501135544e-05, "loss": 0.0801, "step": 705 }, { "epoch": 0.09078828925371855, "grad_norm": 0.251953125, "learning_rate": 9.950599882127946e-05, "loss": 0.0834, "step": 706 }, { "epoch": 0.09091688456427623, "grad_norm": 0.259765625, "learning_rate": 9.950458060844625e-05, "loss": 0.0791, "step": 707 }, { "epoch": 0.0910454798748339, "grad_norm": 0.271484375, "learning_rate": 9.950316037291375e-05, "loss": 0.0899, "step": 708 }, { "epoch": 0.09117407518539157, "grad_norm": 0.271484375, "learning_rate": 9.950173811474e-05, "loss": 0.0823, "step": 709 }, { "epoch": 0.09130267049594924, "grad_norm": 0.291015625, "learning_rate": 9.950031383398313e-05, "loss": 0.0846, "step": 710 }, { "epoch": 0.09143126580650693, "grad_norm": 0.248046875, "learning_rate": 9.949888753070129e-05, "loss": 0.0802, "step": 711 }, { "epoch": 0.0915598611170646, "grad_norm": 0.296875, "learning_rate": 9.949745920495279e-05, "loss": 0.0861, "step": 712 }, { "epoch": 0.09168845642762227, "grad_norm": 0.3046875, "learning_rate": 9.949602885679599e-05, "loss": 0.0785, "step": 713 }, { "epoch": 0.09181705173817994, "grad_norm": 0.29296875, "learning_rate": 9.949459648628931e-05, "loss": 0.082, "step": 714 }, { "epoch": 0.09194564704873762, "grad_norm": 0.236328125, "learning_rate": 9.949316209349129e-05, "loss": 0.0665, "step": 715 }, { "epoch": 0.0920742423592953, "grad_norm": 0.26171875, "learning_rate": 9.949172567846052e-05, "loss": 0.0847, "step": 716 }, { "epoch": 0.09220283766985297, "grad_norm": 0.3046875, "learning_rate": 9.949028724125574e-05, "loss": 0.1028, "step": 717 }, { "epoch": 0.09233143298041065, "grad_norm": 0.263671875, "learning_rate": 9.948884678193566e-05, "loss": 0.0809, "step": 718 }, { "epoch": 0.09246002829096832, "grad_norm": 0.244140625, "learning_rate": 9.948740430055917e-05, "loss": 0.0755, "step": 719 }, { "epoch": 0.092588623601526, "grad_norm": 0.283203125, "learning_rate": 9.94859597971852e-05, "loss": 0.0883, "step": 720 }, { "epoch": 0.09271721891208368, "grad_norm": 0.26953125, "learning_rate": 9.948451327187278e-05, "loss": 0.0872, "step": 721 }, { "epoch": 0.09284581422264135, "grad_norm": 0.296875, "learning_rate": 9.9483064724681e-05, "loss": 0.0824, "step": 722 }, { "epoch": 0.09297440953319902, "grad_norm": 0.28515625, "learning_rate": 9.948161415566906e-05, "loss": 0.08, "step": 723 }, { "epoch": 0.09310300484375669, "grad_norm": 0.26171875, "learning_rate": 9.948016156489622e-05, "loss": 0.0782, "step": 724 }, { "epoch": 0.09323160015431438, "grad_norm": 0.267578125, "learning_rate": 9.947870695242183e-05, "loss": 0.0747, "step": 725 }, { "epoch": 0.09336019546487205, "grad_norm": 0.3515625, "learning_rate": 9.947725031830533e-05, "loss": 0.0901, "step": 726 }, { "epoch": 0.09348879077542972, "grad_norm": 0.275390625, "learning_rate": 9.947579166260622e-05, "loss": 0.0728, "step": 727 }, { "epoch": 0.0936173860859874, "grad_norm": 0.2578125, "learning_rate": 9.947433098538411e-05, "loss": 0.0768, "step": 728 }, { "epoch": 0.09374598139654507, "grad_norm": 0.228515625, "learning_rate": 9.947286828669868e-05, "loss": 0.0707, "step": 729 }, { "epoch": 0.09387457670710275, "grad_norm": 0.33984375, "learning_rate": 9.94714035666097e-05, "loss": 0.0795, "step": 730 }, { "epoch": 0.09400317201766042, "grad_norm": 0.251953125, "learning_rate": 9.946993682517704e-05, "loss": 0.077, "step": 731 }, { "epoch": 0.0941317673282181, "grad_norm": 0.25, "learning_rate": 9.946846806246058e-05, "loss": 0.0715, "step": 732 }, { "epoch": 0.09426036263877577, "grad_norm": 0.259765625, "learning_rate": 9.946699727852034e-05, "loss": 0.0794, "step": 733 }, { "epoch": 0.09438895794933344, "grad_norm": 0.265625, "learning_rate": 9.946552447341646e-05, "loss": 0.0777, "step": 734 }, { "epoch": 0.09451755325989113, "grad_norm": 0.255859375, "learning_rate": 9.946404964720905e-05, "loss": 0.0791, "step": 735 }, { "epoch": 0.0946461485704488, "grad_norm": 0.2578125, "learning_rate": 9.946257279995843e-05, "loss": 0.0915, "step": 736 }, { "epoch": 0.09477474388100647, "grad_norm": 0.27734375, "learning_rate": 9.946109393172491e-05, "loss": 0.0813, "step": 737 }, { "epoch": 0.09490333919156414, "grad_norm": 0.294921875, "learning_rate": 9.945961304256895e-05, "loss": 0.0865, "step": 738 }, { "epoch": 0.09503193450212183, "grad_norm": 0.32421875, "learning_rate": 9.9458130132551e-05, "loss": 0.0874, "step": 739 }, { "epoch": 0.0951605298126795, "grad_norm": 0.267578125, "learning_rate": 9.945664520173167e-05, "loss": 0.0845, "step": 740 }, { "epoch": 0.09528912512323717, "grad_norm": 0.232421875, "learning_rate": 9.945515825017166e-05, "loss": 0.0636, "step": 741 }, { "epoch": 0.09541772043379484, "grad_norm": 0.2431640625, "learning_rate": 9.945366927793171e-05, "loss": 0.0807, "step": 742 }, { "epoch": 0.09554631574435252, "grad_norm": 0.2490234375, "learning_rate": 9.945217828507265e-05, "loss": 0.0702, "step": 743 }, { "epoch": 0.0956749110549102, "grad_norm": 0.2578125, "learning_rate": 9.945068527165542e-05, "loss": 0.0821, "step": 744 }, { "epoch": 0.09580350636546787, "grad_norm": 0.296875, "learning_rate": 9.9449190237741e-05, "loss": 0.0954, "step": 745 }, { "epoch": 0.09593210167602555, "grad_norm": 0.2578125, "learning_rate": 9.944769318339049e-05, "loss": 0.0789, "step": 746 }, { "epoch": 0.09606069698658322, "grad_norm": 0.25390625, "learning_rate": 9.944619410866504e-05, "loss": 0.0761, "step": 747 }, { "epoch": 0.09618929229714089, "grad_norm": 0.27734375, "learning_rate": 9.944469301362592e-05, "loss": 0.0872, "step": 748 }, { "epoch": 0.09631788760769858, "grad_norm": 0.251953125, "learning_rate": 9.944318989833445e-05, "loss": 0.0841, "step": 749 }, { "epoch": 0.09644648291825625, "grad_norm": 0.263671875, "learning_rate": 9.944168476285205e-05, "loss": 0.074, "step": 750 }, { "epoch": 0.09657507822881392, "grad_norm": 0.287109375, "learning_rate": 9.944017760724022e-05, "loss": 0.086, "step": 751 }, { "epoch": 0.09670367353937159, "grad_norm": 0.26171875, "learning_rate": 9.943866843156053e-05, "loss": 0.081, "step": 752 }, { "epoch": 0.09683226884992928, "grad_norm": 0.255859375, "learning_rate": 9.943715723587467e-05, "loss": 0.0761, "step": 753 }, { "epoch": 0.09696086416048695, "grad_norm": 0.27734375, "learning_rate": 9.943564402024436e-05, "loss": 0.0828, "step": 754 }, { "epoch": 0.09708945947104462, "grad_norm": 0.294921875, "learning_rate": 9.943412878473144e-05, "loss": 0.082, "step": 755 }, { "epoch": 0.0972180547816023, "grad_norm": 0.283203125, "learning_rate": 9.94326115293978e-05, "loss": 0.0856, "step": 756 }, { "epoch": 0.09734665009215997, "grad_norm": 0.248046875, "learning_rate": 9.943109225430546e-05, "loss": 0.0765, "step": 757 }, { "epoch": 0.09747524540271765, "grad_norm": 0.30859375, "learning_rate": 9.942957095951651e-05, "loss": 0.0869, "step": 758 }, { "epoch": 0.09760384071327532, "grad_norm": 0.251953125, "learning_rate": 9.942804764509305e-05, "loss": 0.0703, "step": 759 }, { "epoch": 0.097732436023833, "grad_norm": 0.28515625, "learning_rate": 9.942652231109737e-05, "loss": 0.0835, "step": 760 }, { "epoch": 0.09786103133439067, "grad_norm": 0.26171875, "learning_rate": 9.942499495759178e-05, "loss": 0.0771, "step": 761 }, { "epoch": 0.09798962664494834, "grad_norm": 0.2734375, "learning_rate": 9.94234655846387e-05, "loss": 0.0926, "step": 762 }, { "epoch": 0.09811822195550603, "grad_norm": 0.2451171875, "learning_rate": 9.942193419230058e-05, "loss": 0.0719, "step": 763 }, { "epoch": 0.0982468172660637, "grad_norm": 0.287109375, "learning_rate": 9.942040078064003e-05, "loss": 0.0908, "step": 764 }, { "epoch": 0.09837541257662137, "grad_norm": 0.2578125, "learning_rate": 9.941886534971967e-05, "loss": 0.0841, "step": 765 }, { "epoch": 0.09850400788717904, "grad_norm": 0.271484375, "learning_rate": 9.941732789960227e-05, "loss": 0.0834, "step": 766 }, { "epoch": 0.09863260319773673, "grad_norm": 0.265625, "learning_rate": 9.941578843035063e-05, "loss": 0.0799, "step": 767 }, { "epoch": 0.0987611985082944, "grad_norm": 0.267578125, "learning_rate": 9.941424694202764e-05, "loss": 0.0753, "step": 768 }, { "epoch": 0.09888979381885207, "grad_norm": 0.2216796875, "learning_rate": 9.94127034346963e-05, "loss": 0.0782, "step": 769 }, { "epoch": 0.09901838912940975, "grad_norm": 0.244140625, "learning_rate": 9.941115790841969e-05, "loss": 0.076, "step": 770 }, { "epoch": 0.09914698443996742, "grad_norm": 0.287109375, "learning_rate": 9.94096103632609e-05, "loss": 0.0829, "step": 771 }, { "epoch": 0.0992755797505251, "grad_norm": 0.279296875, "learning_rate": 9.940806079928325e-05, "loss": 0.0796, "step": 772 }, { "epoch": 0.09940417506108278, "grad_norm": 0.271484375, "learning_rate": 9.940650921654997e-05, "loss": 0.0798, "step": 773 }, { "epoch": 0.09953277037164045, "grad_norm": 0.27734375, "learning_rate": 9.94049556151245e-05, "loss": 0.086, "step": 774 }, { "epoch": 0.09966136568219812, "grad_norm": 0.228515625, "learning_rate": 9.94033999950703e-05, "loss": 0.0716, "step": 775 }, { "epoch": 0.09978996099275579, "grad_norm": 0.255859375, "learning_rate": 9.940184235645097e-05, "loss": 0.0815, "step": 776 }, { "epoch": 0.09991855630331348, "grad_norm": 0.25, "learning_rate": 9.940028269933011e-05, "loss": 0.0695, "step": 777 }, { "epoch": 0.10004715161387115, "grad_norm": 0.25390625, "learning_rate": 9.939872102377144e-05, "loss": 0.0735, "step": 778 }, { "epoch": 0.10017574692442882, "grad_norm": 0.2373046875, "learning_rate": 9.939715732983881e-05, "loss": 0.0728, "step": 779 }, { "epoch": 0.1003043422349865, "grad_norm": 0.2578125, "learning_rate": 9.939559161759607e-05, "loss": 0.0855, "step": 780 }, { "epoch": 0.10043293754554418, "grad_norm": 0.2451171875, "learning_rate": 9.939402388710721e-05, "loss": 0.0786, "step": 781 }, { "epoch": 0.10056153285610185, "grad_norm": 0.251953125, "learning_rate": 9.93924541384363e-05, "loss": 0.0639, "step": 782 }, { "epoch": 0.10069012816665952, "grad_norm": 0.294921875, "learning_rate": 9.939088237164746e-05, "loss": 0.0849, "step": 783 }, { "epoch": 0.1008187234772172, "grad_norm": 0.314453125, "learning_rate": 9.93893085868049e-05, "loss": 0.0939, "step": 784 }, { "epoch": 0.10094731878777487, "grad_norm": 0.251953125, "learning_rate": 9.938773278397296e-05, "loss": 0.0811, "step": 785 }, { "epoch": 0.10107591409833255, "grad_norm": 0.26171875, "learning_rate": 9.938615496321599e-05, "loss": 0.0642, "step": 786 }, { "epoch": 0.10120450940889023, "grad_norm": 0.3046875, "learning_rate": 9.93845751245985e-05, "loss": 0.0877, "step": 787 }, { "epoch": 0.1013331047194479, "grad_norm": 0.265625, "learning_rate": 9.938299326818498e-05, "loss": 0.0784, "step": 788 }, { "epoch": 0.10146170003000557, "grad_norm": 0.2890625, "learning_rate": 9.93814093940401e-05, "loss": 0.0906, "step": 789 }, { "epoch": 0.10159029534056324, "grad_norm": 0.265625, "learning_rate": 9.937982350222858e-05, "loss": 0.0815, "step": 790 }, { "epoch": 0.10171889065112093, "grad_norm": 0.251953125, "learning_rate": 9.937823559281519e-05, "loss": 0.0774, "step": 791 }, { "epoch": 0.1018474859616786, "grad_norm": 0.251953125, "learning_rate": 9.937664566586484e-05, "loss": 0.0876, "step": 792 }, { "epoch": 0.10197608127223627, "grad_norm": 0.25390625, "learning_rate": 9.937505372144249e-05, "loss": 0.0728, "step": 793 }, { "epoch": 0.10210467658279394, "grad_norm": 0.265625, "learning_rate": 9.937345975961316e-05, "loss": 0.0877, "step": 794 }, { "epoch": 0.10223327189335163, "grad_norm": 0.2421875, "learning_rate": 9.937186378044198e-05, "loss": 0.0688, "step": 795 }, { "epoch": 0.1023618672039093, "grad_norm": 0.283203125, "learning_rate": 9.93702657839942e-05, "loss": 0.0756, "step": 796 }, { "epoch": 0.10249046251446697, "grad_norm": 0.279296875, "learning_rate": 9.936866577033508e-05, "loss": 0.0853, "step": 797 }, { "epoch": 0.10261905782502465, "grad_norm": 0.28125, "learning_rate": 9.936706373953e-05, "loss": 0.0829, "step": 798 }, { "epoch": 0.10274765313558232, "grad_norm": 0.25, "learning_rate": 9.93654596916444e-05, "loss": 0.0729, "step": 799 }, { "epoch": 0.10287624844614, "grad_norm": 0.267578125, "learning_rate": 9.936385362674385e-05, "loss": 0.0798, "step": 800 }, { "epoch": 0.10300484375669768, "grad_norm": 0.224609375, "learning_rate": 9.936224554489395e-05, "loss": 0.0626, "step": 801 }, { "epoch": 0.10313343906725535, "grad_norm": 0.26953125, "learning_rate": 9.936063544616042e-05, "loss": 0.0676, "step": 802 }, { "epoch": 0.10326203437781302, "grad_norm": 0.302734375, "learning_rate": 9.935902333060903e-05, "loss": 0.0905, "step": 803 }, { "epoch": 0.10339062968837069, "grad_norm": 0.25, "learning_rate": 9.935740919830567e-05, "loss": 0.07, "step": 804 }, { "epoch": 0.10351922499892838, "grad_norm": 0.28125, "learning_rate": 9.935579304931628e-05, "loss": 0.0818, "step": 805 }, { "epoch": 0.10364782030948605, "grad_norm": 0.2734375, "learning_rate": 9.935417488370688e-05, "loss": 0.0735, "step": 806 }, { "epoch": 0.10377641562004372, "grad_norm": 0.2578125, "learning_rate": 9.935255470154362e-05, "loss": 0.0787, "step": 807 }, { "epoch": 0.1039050109306014, "grad_norm": 0.251953125, "learning_rate": 9.935093250289265e-05, "loss": 0.0659, "step": 808 }, { "epoch": 0.10403360624115907, "grad_norm": 0.25390625, "learning_rate": 9.93493082878203e-05, "loss": 0.0809, "step": 809 }, { "epoch": 0.10416220155171675, "grad_norm": 0.232421875, "learning_rate": 9.934768205639291e-05, "loss": 0.0712, "step": 810 }, { "epoch": 0.10429079686227442, "grad_norm": 0.29296875, "learning_rate": 9.934605380867694e-05, "loss": 0.0842, "step": 811 }, { "epoch": 0.1044193921728321, "grad_norm": 0.271484375, "learning_rate": 9.93444235447389e-05, "loss": 0.0837, "step": 812 }, { "epoch": 0.10454798748338977, "grad_norm": 0.2451171875, "learning_rate": 9.934279126464541e-05, "loss": 0.0756, "step": 813 }, { "epoch": 0.10467658279394745, "grad_norm": 0.2431640625, "learning_rate": 9.934115696846314e-05, "loss": 0.0854, "step": 814 }, { "epoch": 0.10480517810450513, "grad_norm": 0.236328125, "learning_rate": 9.933952065625892e-05, "loss": 0.0742, "step": 815 }, { "epoch": 0.1049337734150628, "grad_norm": 0.2421875, "learning_rate": 9.933788232809955e-05, "loss": 0.0678, "step": 816 }, { "epoch": 0.10506236872562047, "grad_norm": 0.25390625, "learning_rate": 9.9336241984052e-05, "loss": 0.0732, "step": 817 }, { "epoch": 0.10519096403617814, "grad_norm": 0.251953125, "learning_rate": 9.93345996241833e-05, "loss": 0.0856, "step": 818 }, { "epoch": 0.10531955934673583, "grad_norm": 0.2578125, "learning_rate": 9.933295524856053e-05, "loss": 0.082, "step": 819 }, { "epoch": 0.1054481546572935, "grad_norm": 0.28515625, "learning_rate": 9.93313088572509e-05, "loss": 0.0818, "step": 820 }, { "epoch": 0.10557674996785117, "grad_norm": 0.265625, "learning_rate": 9.932966045032165e-05, "loss": 0.0816, "step": 821 }, { "epoch": 0.10570534527840884, "grad_norm": 0.2431640625, "learning_rate": 9.932801002784018e-05, "loss": 0.0743, "step": 822 }, { "epoch": 0.10583394058896652, "grad_norm": 0.25390625, "learning_rate": 9.932635758987388e-05, "loss": 0.0723, "step": 823 }, { "epoch": 0.1059625358995242, "grad_norm": 0.2578125, "learning_rate": 9.932470313649028e-05, "loss": 0.0809, "step": 824 }, { "epoch": 0.10609113121008187, "grad_norm": 0.26171875, "learning_rate": 9.932304666775698e-05, "loss": 0.083, "step": 825 }, { "epoch": 0.10621972652063955, "grad_norm": 0.27734375, "learning_rate": 9.932138818374165e-05, "loss": 0.0728, "step": 826 }, { "epoch": 0.10634832183119722, "grad_norm": 0.2314453125, "learning_rate": 9.93197276845121e-05, "loss": 0.072, "step": 827 }, { "epoch": 0.1064769171417549, "grad_norm": 0.2138671875, "learning_rate": 9.931806517013612e-05, "loss": 0.0713, "step": 828 }, { "epoch": 0.10660551245231258, "grad_norm": 0.236328125, "learning_rate": 9.931640064068166e-05, "loss": 0.0699, "step": 829 }, { "epoch": 0.10673410776287025, "grad_norm": 0.263671875, "learning_rate": 9.931473409621675e-05, "loss": 0.087, "step": 830 }, { "epoch": 0.10686270307342792, "grad_norm": 0.251953125, "learning_rate": 9.931306553680945e-05, "loss": 0.0756, "step": 831 }, { "epoch": 0.10699129838398559, "grad_norm": 0.259765625, "learning_rate": 9.931139496252795e-05, "loss": 0.0744, "step": 832 }, { "epoch": 0.10711989369454328, "grad_norm": 0.224609375, "learning_rate": 9.930972237344052e-05, "loss": 0.0707, "step": 833 }, { "epoch": 0.10724848900510095, "grad_norm": 0.271484375, "learning_rate": 9.930804776961546e-05, "loss": 0.0885, "step": 834 }, { "epoch": 0.10737708431565862, "grad_norm": 0.2353515625, "learning_rate": 9.930637115112124e-05, "loss": 0.0664, "step": 835 }, { "epoch": 0.1075056796262163, "grad_norm": 0.234375, "learning_rate": 9.930469251802635e-05, "loss": 0.0641, "step": 836 }, { "epoch": 0.10763427493677397, "grad_norm": 0.2470703125, "learning_rate": 9.930301187039937e-05, "loss": 0.0765, "step": 837 }, { "epoch": 0.10776287024733165, "grad_norm": 0.2490234375, "learning_rate": 9.930132920830896e-05, "loss": 0.0804, "step": 838 }, { "epoch": 0.10789146555788932, "grad_norm": 0.26953125, "learning_rate": 9.929964453182388e-05, "loss": 0.0809, "step": 839 }, { "epoch": 0.108020060868447, "grad_norm": 0.240234375, "learning_rate": 9.929795784101298e-05, "loss": 0.0747, "step": 840 }, { "epoch": 0.10814865617900467, "grad_norm": 0.251953125, "learning_rate": 9.929626913594516e-05, "loss": 0.0744, "step": 841 }, { "epoch": 0.10827725148956235, "grad_norm": 0.228515625, "learning_rate": 9.929457841668941e-05, "loss": 0.0691, "step": 842 }, { "epoch": 0.10840584680012003, "grad_norm": 0.2236328125, "learning_rate": 9.929288568331483e-05, "loss": 0.0594, "step": 843 }, { "epoch": 0.1085344421106777, "grad_norm": 0.2412109375, "learning_rate": 9.929119093589059e-05, "loss": 0.0745, "step": 844 }, { "epoch": 0.10866303742123537, "grad_norm": 0.2333984375, "learning_rate": 9.92894941744859e-05, "loss": 0.0693, "step": 845 }, { "epoch": 0.10879163273179304, "grad_norm": 0.2216796875, "learning_rate": 9.928779539917011e-05, "loss": 0.0609, "step": 846 }, { "epoch": 0.10892022804235073, "grad_norm": 0.23828125, "learning_rate": 9.928609461001264e-05, "loss": 0.072, "step": 847 }, { "epoch": 0.1090488233529084, "grad_norm": 0.2470703125, "learning_rate": 9.928439180708296e-05, "loss": 0.0722, "step": 848 }, { "epoch": 0.10917741866346607, "grad_norm": 0.2275390625, "learning_rate": 9.928268699045065e-05, "loss": 0.0631, "step": 849 }, { "epoch": 0.10930601397402374, "grad_norm": 0.24609375, "learning_rate": 9.92809801601854e-05, "loss": 0.0748, "step": 850 }, { "epoch": 0.10943460928458142, "grad_norm": 0.240234375, "learning_rate": 9.92792713163569e-05, "loss": 0.0717, "step": 851 }, { "epoch": 0.1095632045951391, "grad_norm": 0.26171875, "learning_rate": 9.927756045903498e-05, "loss": 0.0809, "step": 852 }, { "epoch": 0.10969179990569677, "grad_norm": 0.263671875, "learning_rate": 9.927584758828957e-05, "loss": 0.0781, "step": 853 }, { "epoch": 0.10982039521625445, "grad_norm": 0.251953125, "learning_rate": 9.927413270419065e-05, "loss": 0.0787, "step": 854 }, { "epoch": 0.10994899052681212, "grad_norm": 0.23046875, "learning_rate": 9.927241580680826e-05, "loss": 0.0771, "step": 855 }, { "epoch": 0.1100775858373698, "grad_norm": 0.298828125, "learning_rate": 9.927069689621259e-05, "loss": 0.1042, "step": 856 }, { "epoch": 0.11020618114792748, "grad_norm": 0.263671875, "learning_rate": 9.926897597247384e-05, "loss": 0.0678, "step": 857 }, { "epoch": 0.11033477645848515, "grad_norm": 0.2294921875, "learning_rate": 9.926725303566235e-05, "loss": 0.0725, "step": 858 }, { "epoch": 0.11046337176904282, "grad_norm": 0.2490234375, "learning_rate": 9.926552808584851e-05, "loss": 0.0808, "step": 859 }, { "epoch": 0.11059196707960049, "grad_norm": 0.2470703125, "learning_rate": 9.926380112310277e-05, "loss": 0.0888, "step": 860 }, { "epoch": 0.11072056239015818, "grad_norm": 0.2578125, "learning_rate": 9.926207214749573e-05, "loss": 0.0808, "step": 861 }, { "epoch": 0.11084915770071585, "grad_norm": 0.220703125, "learning_rate": 9.926034115909801e-05, "loss": 0.067, "step": 862 }, { "epoch": 0.11097775301127352, "grad_norm": 0.2734375, "learning_rate": 9.925860815798036e-05, "loss": 0.0733, "step": 863 }, { "epoch": 0.1111063483218312, "grad_norm": 0.2578125, "learning_rate": 9.925687314421357e-05, "loss": 0.0702, "step": 864 }, { "epoch": 0.11123494363238887, "grad_norm": 0.232421875, "learning_rate": 9.925513611786855e-05, "loss": 0.0651, "step": 865 }, { "epoch": 0.11136353894294655, "grad_norm": 0.236328125, "learning_rate": 9.925339707901624e-05, "loss": 0.0659, "step": 866 }, { "epoch": 0.11149213425350422, "grad_norm": 0.244140625, "learning_rate": 9.925165602772771e-05, "loss": 0.0673, "step": 867 }, { "epoch": 0.1116207295640619, "grad_norm": 0.296875, "learning_rate": 9.924991296407412e-05, "loss": 0.0821, "step": 868 }, { "epoch": 0.11174932487461957, "grad_norm": 0.23046875, "learning_rate": 9.924816788812666e-05, "loss": 0.0721, "step": 869 }, { "epoch": 0.11187792018517724, "grad_norm": 0.265625, "learning_rate": 9.924642079995663e-05, "loss": 0.0733, "step": 870 }, { "epoch": 0.11200651549573493, "grad_norm": 0.236328125, "learning_rate": 9.924467169963545e-05, "loss": 0.0769, "step": 871 }, { "epoch": 0.1121351108062926, "grad_norm": 0.2275390625, "learning_rate": 9.924292058723455e-05, "loss": 0.0645, "step": 872 }, { "epoch": 0.11226370611685027, "grad_norm": 0.279296875, "learning_rate": 9.924116746282547e-05, "loss": 0.0862, "step": 873 }, { "epoch": 0.11239230142740794, "grad_norm": 0.263671875, "learning_rate": 9.923941232647989e-05, "loss": 0.0728, "step": 874 }, { "epoch": 0.11252089673796563, "grad_norm": 0.2421875, "learning_rate": 9.923765517826948e-05, "loss": 0.0628, "step": 875 }, { "epoch": 0.1126494920485233, "grad_norm": 0.2421875, "learning_rate": 9.923589601826606e-05, "loss": 0.0729, "step": 876 }, { "epoch": 0.11277808735908097, "grad_norm": 0.251953125, "learning_rate": 9.923413484654149e-05, "loss": 0.0775, "step": 877 }, { "epoch": 0.11290668266963864, "grad_norm": 0.2578125, "learning_rate": 9.923237166316775e-05, "loss": 0.0748, "step": 878 }, { "epoch": 0.11303527798019632, "grad_norm": 0.267578125, "learning_rate": 9.923060646821684e-05, "loss": 0.0755, "step": 879 }, { "epoch": 0.113163873290754, "grad_norm": 0.2451171875, "learning_rate": 9.922883926176094e-05, "loss": 0.0793, "step": 880 }, { "epoch": 0.11329246860131167, "grad_norm": 0.2138671875, "learning_rate": 9.92270700438722e-05, "loss": 0.0643, "step": 881 }, { "epoch": 0.11342106391186935, "grad_norm": 0.234375, "learning_rate": 9.922529881462295e-05, "loss": 0.0711, "step": 882 }, { "epoch": 0.11354965922242702, "grad_norm": 0.2490234375, "learning_rate": 9.922352557408554e-05, "loss": 0.0816, "step": 883 }, { "epoch": 0.11367825453298469, "grad_norm": 0.234375, "learning_rate": 9.922175032233243e-05, "loss": 0.0668, "step": 884 }, { "epoch": 0.11380684984354238, "grad_norm": 0.255859375, "learning_rate": 9.921997305943615e-05, "loss": 0.0687, "step": 885 }, { "epoch": 0.11393544515410005, "grad_norm": 0.25390625, "learning_rate": 9.921819378546933e-05, "loss": 0.081, "step": 886 }, { "epoch": 0.11406404046465772, "grad_norm": 0.251953125, "learning_rate": 9.921641250050467e-05, "loss": 0.0667, "step": 887 }, { "epoch": 0.11419263577521539, "grad_norm": 0.2353515625, "learning_rate": 9.921462920461491e-05, "loss": 0.065, "step": 888 }, { "epoch": 0.11432123108577308, "grad_norm": 0.251953125, "learning_rate": 9.921284389787295e-05, "loss": 0.0695, "step": 889 }, { "epoch": 0.11444982639633075, "grad_norm": 0.298828125, "learning_rate": 9.921105658035174e-05, "loss": 0.075, "step": 890 }, { "epoch": 0.11457842170688842, "grad_norm": 0.244140625, "learning_rate": 9.92092672521243e-05, "loss": 0.0763, "step": 891 }, { "epoch": 0.1147070170174461, "grad_norm": 0.26953125, "learning_rate": 9.920747591326371e-05, "loss": 0.0831, "step": 892 }, { "epoch": 0.11483561232800377, "grad_norm": 0.25, "learning_rate": 9.920568256384321e-05, "loss": 0.0788, "step": 893 }, { "epoch": 0.11496420763856145, "grad_norm": 0.251953125, "learning_rate": 9.920388720393606e-05, "loss": 0.0745, "step": 894 }, { "epoch": 0.11509280294911912, "grad_norm": 0.248046875, "learning_rate": 9.920208983361559e-05, "loss": 0.0822, "step": 895 }, { "epoch": 0.1152213982596768, "grad_norm": 0.259765625, "learning_rate": 9.920029045295527e-05, "loss": 0.0793, "step": 896 }, { "epoch": 0.11534999357023447, "grad_norm": 0.251953125, "learning_rate": 9.919848906202861e-05, "loss": 0.0768, "step": 897 }, { "epoch": 0.11547858888079214, "grad_norm": 0.224609375, "learning_rate": 9.919668566090921e-05, "loss": 0.0623, "step": 898 }, { "epoch": 0.11560718419134983, "grad_norm": 0.236328125, "learning_rate": 9.919488024967075e-05, "loss": 0.0759, "step": 899 }, { "epoch": 0.1157357795019075, "grad_norm": 0.25, "learning_rate": 9.9193072828387e-05, "loss": 0.0847, "step": 900 }, { "epoch": 0.11586437481246517, "grad_norm": 0.263671875, "learning_rate": 9.91912633971318e-05, "loss": 0.0781, "step": 901 }, { "epoch": 0.11599297012302284, "grad_norm": 0.271484375, "learning_rate": 9.918945195597911e-05, "loss": 0.0748, "step": 902 }, { "epoch": 0.11612156543358053, "grad_norm": 0.248046875, "learning_rate": 9.918763850500294e-05, "loss": 0.0778, "step": 903 }, { "epoch": 0.1162501607441382, "grad_norm": 0.24609375, "learning_rate": 9.918582304427735e-05, "loss": 0.0697, "step": 904 }, { "epoch": 0.11637875605469587, "grad_norm": 0.26171875, "learning_rate": 9.918400557387654e-05, "loss": 0.0787, "step": 905 }, { "epoch": 0.11650735136525354, "grad_norm": 0.244140625, "learning_rate": 9.918218609387478e-05, "loss": 0.0724, "step": 906 }, { "epoch": 0.11663594667581122, "grad_norm": 0.27734375, "learning_rate": 9.91803646043464e-05, "loss": 0.0893, "step": 907 }, { "epoch": 0.1167645419863689, "grad_norm": 0.28125, "learning_rate": 9.917854110536581e-05, "loss": 0.0854, "step": 908 }, { "epoch": 0.11689313729692657, "grad_norm": 0.28125, "learning_rate": 9.917671559700755e-05, "loss": 0.0826, "step": 909 }, { "epoch": 0.11702173260748425, "grad_norm": 0.25390625, "learning_rate": 9.917488807934618e-05, "loss": 0.0832, "step": 910 }, { "epoch": 0.11715032791804192, "grad_norm": 0.294921875, "learning_rate": 9.917305855245638e-05, "loss": 0.0929, "step": 911 }, { "epoch": 0.11727892322859959, "grad_norm": 0.26171875, "learning_rate": 9.91712270164129e-05, "loss": 0.0886, "step": 912 }, { "epoch": 0.11740751853915728, "grad_norm": 0.224609375, "learning_rate": 9.916939347129058e-05, "loss": 0.0658, "step": 913 }, { "epoch": 0.11753611384971495, "grad_norm": 0.2392578125, "learning_rate": 9.916755791716434e-05, "loss": 0.068, "step": 914 }, { "epoch": 0.11766470916027262, "grad_norm": 0.2734375, "learning_rate": 9.916572035410916e-05, "loss": 0.0696, "step": 915 }, { "epoch": 0.11779330447083029, "grad_norm": 0.216796875, "learning_rate": 9.916388078220013e-05, "loss": 0.0619, "step": 916 }, { "epoch": 0.11792189978138798, "grad_norm": 0.2265625, "learning_rate": 9.916203920151241e-05, "loss": 0.064, "step": 917 }, { "epoch": 0.11805049509194565, "grad_norm": 0.220703125, "learning_rate": 9.916019561212126e-05, "loss": 0.0685, "step": 918 }, { "epoch": 0.11817909040250332, "grad_norm": 0.23046875, "learning_rate": 9.915835001410201e-05, "loss": 0.0696, "step": 919 }, { "epoch": 0.118307685713061, "grad_norm": 0.25, "learning_rate": 9.915650240753003e-05, "loss": 0.0792, "step": 920 }, { "epoch": 0.11843628102361867, "grad_norm": 0.2451171875, "learning_rate": 9.915465279248086e-05, "loss": 0.0703, "step": 921 }, { "epoch": 0.11856487633417635, "grad_norm": 0.251953125, "learning_rate": 9.915280116903003e-05, "loss": 0.0849, "step": 922 }, { "epoch": 0.11869347164473402, "grad_norm": 0.251953125, "learning_rate": 9.915094753725322e-05, "loss": 0.0727, "step": 923 }, { "epoch": 0.1188220669552917, "grad_norm": 0.291015625, "learning_rate": 9.914909189722617e-05, "loss": 0.0892, "step": 924 }, { "epoch": 0.11895066226584937, "grad_norm": 0.28125, "learning_rate": 9.914723424902468e-05, "loss": 0.0769, "step": 925 }, { "epoch": 0.11907925757640704, "grad_norm": 0.263671875, "learning_rate": 9.914537459272466e-05, "loss": 0.0907, "step": 926 }, { "epoch": 0.11920785288696473, "grad_norm": 0.244140625, "learning_rate": 9.91435129284021e-05, "loss": 0.0746, "step": 927 }, { "epoch": 0.1193364481975224, "grad_norm": 0.2578125, "learning_rate": 9.914164925613307e-05, "loss": 0.0854, "step": 928 }, { "epoch": 0.11946504350808007, "grad_norm": 0.298828125, "learning_rate": 9.91397835759937e-05, "loss": 0.0822, "step": 929 }, { "epoch": 0.11959363881863774, "grad_norm": 0.2314453125, "learning_rate": 9.913791588806023e-05, "loss": 0.0595, "step": 930 }, { "epoch": 0.11972223412919542, "grad_norm": 0.255859375, "learning_rate": 9.913604619240897e-05, "loss": 0.0759, "step": 931 }, { "epoch": 0.1198508294397531, "grad_norm": 0.2353515625, "learning_rate": 9.91341744891163e-05, "loss": 0.062, "step": 932 }, { "epoch": 0.11997942475031077, "grad_norm": 0.2353515625, "learning_rate": 9.913230077825873e-05, "loss": 0.0754, "step": 933 }, { "epoch": 0.12010802006086845, "grad_norm": 0.244140625, "learning_rate": 9.913042505991279e-05, "loss": 0.0696, "step": 934 }, { "epoch": 0.12023661537142612, "grad_norm": 0.255859375, "learning_rate": 9.912854733415513e-05, "loss": 0.0655, "step": 935 }, { "epoch": 0.1203652106819838, "grad_norm": 0.2451171875, "learning_rate": 9.912666760106246e-05, "loss": 0.0669, "step": 936 }, { "epoch": 0.12049380599254148, "grad_norm": 0.26953125, "learning_rate": 9.91247858607116e-05, "loss": 0.0853, "step": 937 }, { "epoch": 0.12062240130309915, "grad_norm": 0.244140625, "learning_rate": 9.912290211317942e-05, "loss": 0.0668, "step": 938 }, { "epoch": 0.12075099661365682, "grad_norm": 0.2412109375, "learning_rate": 9.91210163585429e-05, "loss": 0.0631, "step": 939 }, { "epoch": 0.12087959192421449, "grad_norm": 0.259765625, "learning_rate": 9.911912859687909e-05, "loss": 0.0819, "step": 940 }, { "epoch": 0.12100818723477218, "grad_norm": 0.2392578125, "learning_rate": 9.911723882826511e-05, "loss": 0.0758, "step": 941 }, { "epoch": 0.12113678254532985, "grad_norm": 0.2333984375, "learning_rate": 9.911534705277817e-05, "loss": 0.0711, "step": 942 }, { "epoch": 0.12126537785588752, "grad_norm": 0.32421875, "learning_rate": 9.91134532704956e-05, "loss": 0.0808, "step": 943 }, { "epoch": 0.1213939731664452, "grad_norm": 0.28125, "learning_rate": 9.911155748149473e-05, "loss": 0.0829, "step": 944 }, { "epoch": 0.12152256847700287, "grad_norm": 0.248046875, "learning_rate": 9.910965968585307e-05, "loss": 0.0726, "step": 945 }, { "epoch": 0.12165116378756055, "grad_norm": 0.234375, "learning_rate": 9.910775988364811e-05, "loss": 0.0728, "step": 946 }, { "epoch": 0.12177975909811822, "grad_norm": 0.2373046875, "learning_rate": 9.91058580749575e-05, "loss": 0.0712, "step": 947 }, { "epoch": 0.1219083544086759, "grad_norm": 0.248046875, "learning_rate": 9.910395425985896e-05, "loss": 0.072, "step": 948 }, { "epoch": 0.12203694971923357, "grad_norm": 0.259765625, "learning_rate": 9.910204843843026e-05, "loss": 0.0909, "step": 949 }, { "epoch": 0.12216554502979125, "grad_norm": 0.25390625, "learning_rate": 9.910014061074926e-05, "loss": 0.0863, "step": 950 }, { "epoch": 0.12229414034034893, "grad_norm": 0.240234375, "learning_rate": 9.909823077689392e-05, "loss": 0.08, "step": 951 }, { "epoch": 0.1224227356509066, "grad_norm": 0.265625, "learning_rate": 9.909631893694229e-05, "loss": 0.0902, "step": 952 }, { "epoch": 0.12255133096146427, "grad_norm": 0.23828125, "learning_rate": 9.909440509097248e-05, "loss": 0.0783, "step": 953 }, { "epoch": 0.12267992627202194, "grad_norm": 0.275390625, "learning_rate": 9.909248923906267e-05, "loss": 0.0814, "step": 954 }, { "epoch": 0.12280852158257963, "grad_norm": 0.2412109375, "learning_rate": 9.909057138129115e-05, "loss": 0.0734, "step": 955 }, { "epoch": 0.1229371168931373, "grad_norm": 0.2392578125, "learning_rate": 9.908865151773627e-05, "loss": 0.073, "step": 956 }, { "epoch": 0.12306571220369497, "grad_norm": 0.2158203125, "learning_rate": 9.908672964847649e-05, "loss": 0.0672, "step": 957 }, { "epoch": 0.12319430751425264, "grad_norm": 0.224609375, "learning_rate": 9.908480577359034e-05, "loss": 0.0699, "step": 958 }, { "epoch": 0.12332290282481032, "grad_norm": 0.263671875, "learning_rate": 9.908287989315641e-05, "loss": 0.076, "step": 959 }, { "epoch": 0.123451498135368, "grad_norm": 0.2265625, "learning_rate": 9.90809520072534e-05, "loss": 0.0671, "step": 960 }, { "epoch": 0.12358009344592567, "grad_norm": 0.2265625, "learning_rate": 9.907902211596007e-05, "loss": 0.0616, "step": 961 }, { "epoch": 0.12370868875648335, "grad_norm": 0.2255859375, "learning_rate": 9.907709021935529e-05, "loss": 0.0732, "step": 962 }, { "epoch": 0.12383728406704102, "grad_norm": 0.259765625, "learning_rate": 9.907515631751798e-05, "loss": 0.0808, "step": 963 }, { "epoch": 0.1239658793775987, "grad_norm": 0.2265625, "learning_rate": 9.907322041052718e-05, "loss": 0.0515, "step": 964 }, { "epoch": 0.12409447468815638, "grad_norm": 0.251953125, "learning_rate": 9.907128249846195e-05, "loss": 0.0775, "step": 965 }, { "epoch": 0.12422306999871405, "grad_norm": 0.298828125, "learning_rate": 9.906934258140149e-05, "loss": 0.0764, "step": 966 }, { "epoch": 0.12435166530927172, "grad_norm": 0.2373046875, "learning_rate": 9.906740065942508e-05, "loss": 0.0578, "step": 967 }, { "epoch": 0.12448026061982939, "grad_norm": 0.265625, "learning_rate": 9.906545673261203e-05, "loss": 0.0715, "step": 968 }, { "epoch": 0.12460885593038708, "grad_norm": 0.296875, "learning_rate": 9.906351080104181e-05, "loss": 0.0775, "step": 969 }, { "epoch": 0.12473745124094475, "grad_norm": 0.212890625, "learning_rate": 9.906156286479388e-05, "loss": 0.0546, "step": 970 }, { "epoch": 0.12486604655150242, "grad_norm": 0.244140625, "learning_rate": 9.905961292394787e-05, "loss": 0.0704, "step": 971 }, { "epoch": 0.1249946418620601, "grad_norm": 0.236328125, "learning_rate": 9.905766097858344e-05, "loss": 0.0711, "step": 972 }, { "epoch": 0.12512323717261778, "grad_norm": 0.259765625, "learning_rate": 9.905570702878034e-05, "loss": 0.087, "step": 973 }, { "epoch": 0.12525183248317545, "grad_norm": 0.251953125, "learning_rate": 9.905375107461841e-05, "loss": 0.0726, "step": 974 }, { "epoch": 0.12538042779373312, "grad_norm": 0.2392578125, "learning_rate": 9.905179311617756e-05, "loss": 0.0663, "step": 975 }, { "epoch": 0.1255090231042908, "grad_norm": 0.2470703125, "learning_rate": 9.90498331535378e-05, "loss": 0.0741, "step": 976 }, { "epoch": 0.12563761841484847, "grad_norm": 0.2431640625, "learning_rate": 9.904787118677921e-05, "loss": 0.0733, "step": 977 }, { "epoch": 0.12576621372540614, "grad_norm": 0.23828125, "learning_rate": 9.904590721598194e-05, "loss": 0.0672, "step": 978 }, { "epoch": 0.1258948090359638, "grad_norm": 0.259765625, "learning_rate": 9.904394124122624e-05, "loss": 0.0736, "step": 979 }, { "epoch": 0.12602340434652148, "grad_norm": 0.263671875, "learning_rate": 9.904197326259246e-05, "loss": 0.0752, "step": 980 }, { "epoch": 0.12615199965707918, "grad_norm": 0.240234375, "learning_rate": 9.904000328016099e-05, "loss": 0.0696, "step": 981 }, { "epoch": 0.12628059496763686, "grad_norm": 0.232421875, "learning_rate": 9.903803129401233e-05, "loss": 0.0679, "step": 982 }, { "epoch": 0.12640919027819453, "grad_norm": 0.240234375, "learning_rate": 9.903605730422704e-05, "loss": 0.0733, "step": 983 }, { "epoch": 0.1265377855887522, "grad_norm": 0.2333984375, "learning_rate": 9.903408131088579e-05, "loss": 0.0719, "step": 984 }, { "epoch": 0.12666638089930987, "grad_norm": 0.2265625, "learning_rate": 9.903210331406929e-05, "loss": 0.0683, "step": 985 }, { "epoch": 0.12679497620986754, "grad_norm": 0.2421875, "learning_rate": 9.90301233138584e-05, "loss": 0.064, "step": 986 }, { "epoch": 0.12692357152042522, "grad_norm": 0.259765625, "learning_rate": 9.902814131033399e-05, "loss": 0.0621, "step": 987 }, { "epoch": 0.1270521668309829, "grad_norm": 0.25, "learning_rate": 9.902615730357705e-05, "loss": 0.0746, "step": 988 }, { "epoch": 0.12718076214154056, "grad_norm": 0.2275390625, "learning_rate": 9.902417129366864e-05, "loss": 0.0662, "step": 989 }, { "epoch": 0.12730935745209826, "grad_norm": 0.275390625, "learning_rate": 9.902218328068992e-05, "loss": 0.0785, "step": 990 }, { "epoch": 0.12743795276265593, "grad_norm": 0.26953125, "learning_rate": 9.902019326472211e-05, "loss": 0.0794, "step": 991 }, { "epoch": 0.1275665480732136, "grad_norm": 0.2373046875, "learning_rate": 9.90182012458465e-05, "loss": 0.0593, "step": 992 }, { "epoch": 0.12769514338377128, "grad_norm": 0.279296875, "learning_rate": 9.901620722414452e-05, "loss": 0.0752, "step": 993 }, { "epoch": 0.12782373869432895, "grad_norm": 0.279296875, "learning_rate": 9.901421119969762e-05, "loss": 0.0794, "step": 994 }, { "epoch": 0.12795233400488662, "grad_norm": 0.2255859375, "learning_rate": 9.901221317258735e-05, "loss": 0.0721, "step": 995 }, { "epoch": 0.1280809293154443, "grad_norm": 0.2353515625, "learning_rate": 9.901021314289537e-05, "loss": 0.0726, "step": 996 }, { "epoch": 0.12820952462600196, "grad_norm": 0.23046875, "learning_rate": 9.900821111070338e-05, "loss": 0.0708, "step": 997 }, { "epoch": 0.12833811993655964, "grad_norm": 0.25, "learning_rate": 9.900620707609318e-05, "loss": 0.0696, "step": 998 }, { "epoch": 0.1284667152471173, "grad_norm": 0.23046875, "learning_rate": 9.900420103914665e-05, "loss": 0.0659, "step": 999 }, { "epoch": 0.128595310557675, "grad_norm": 0.23046875, "learning_rate": 9.900219299994578e-05, "loss": 0.0682, "step": 1000 }, { "epoch": 0.128595310557675, "eval_loss": 0.0710105374455452, "eval_runtime": 1046.874, "eval_samples_per_second": 93.828, "eval_steps_per_second": 1.173, "step": 1000 }, { "epoch": 0.12872390586823268, "grad_norm": 0.2197265625, "learning_rate": 9.900018295857259e-05, "loss": 0.0628, "step": 1001 }, { "epoch": 0.12885250117879035, "grad_norm": 0.228515625, "learning_rate": 9.899817091510922e-05, "loss": 0.069, "step": 1002 }, { "epoch": 0.12898109648934802, "grad_norm": 0.251953125, "learning_rate": 9.899615686963787e-05, "loss": 0.078, "step": 1003 }, { "epoch": 0.1291096917999057, "grad_norm": 0.2392578125, "learning_rate": 9.899414082224084e-05, "loss": 0.077, "step": 1004 }, { "epoch": 0.12923828711046337, "grad_norm": 0.2255859375, "learning_rate": 9.89921227730005e-05, "loss": 0.0584, "step": 1005 }, { "epoch": 0.12936688242102104, "grad_norm": 0.23828125, "learning_rate": 9.899010272199931e-05, "loss": 0.0777, "step": 1006 }, { "epoch": 0.1294954777315787, "grad_norm": 0.2060546875, "learning_rate": 9.89880806693198e-05, "loss": 0.07, "step": 1007 }, { "epoch": 0.12962407304213638, "grad_norm": 0.2197265625, "learning_rate": 9.898605661504459e-05, "loss": 0.073, "step": 1008 }, { "epoch": 0.12975266835269408, "grad_norm": 0.2138671875, "learning_rate": 9.898403055925638e-05, "loss": 0.0607, "step": 1009 }, { "epoch": 0.12988126366325176, "grad_norm": 0.2353515625, "learning_rate": 9.898200250203797e-05, "loss": 0.0735, "step": 1010 }, { "epoch": 0.13000985897380943, "grad_norm": 0.25, "learning_rate": 9.897997244347218e-05, "loss": 0.0697, "step": 1011 }, { "epoch": 0.1301384542843671, "grad_norm": 0.2412109375, "learning_rate": 9.897794038364201e-05, "loss": 0.0811, "step": 1012 }, { "epoch": 0.13026704959492477, "grad_norm": 0.21875, "learning_rate": 9.897590632263045e-05, "loss": 0.0747, "step": 1013 }, { "epoch": 0.13039564490548244, "grad_norm": 0.2275390625, "learning_rate": 9.897387026052063e-05, "loss": 0.062, "step": 1014 }, { "epoch": 0.13052424021604012, "grad_norm": 0.2177734375, "learning_rate": 9.897183219739574e-05, "loss": 0.0615, "step": 1015 }, { "epoch": 0.1306528355265978, "grad_norm": 0.2421875, "learning_rate": 9.896979213333902e-05, "loss": 0.0763, "step": 1016 }, { "epoch": 0.13078143083715546, "grad_norm": 0.2392578125, "learning_rate": 9.896775006843387e-05, "loss": 0.0672, "step": 1017 }, { "epoch": 0.13091002614771316, "grad_norm": 0.2294921875, "learning_rate": 9.89657060027637e-05, "loss": 0.0726, "step": 1018 }, { "epoch": 0.13103862145827083, "grad_norm": 0.2275390625, "learning_rate": 9.896365993641203e-05, "loss": 0.074, "step": 1019 }, { "epoch": 0.1311672167688285, "grad_norm": 0.279296875, "learning_rate": 9.896161186946246e-05, "loss": 0.072, "step": 1020 }, { "epoch": 0.13129581207938618, "grad_norm": 0.220703125, "learning_rate": 9.895956180199869e-05, "loss": 0.0611, "step": 1021 }, { "epoch": 0.13142440738994385, "grad_norm": 0.2412109375, "learning_rate": 9.895750973410448e-05, "loss": 0.0602, "step": 1022 }, { "epoch": 0.13155300270050152, "grad_norm": 0.220703125, "learning_rate": 9.895545566586363e-05, "loss": 0.0643, "step": 1023 }, { "epoch": 0.1316815980110592, "grad_norm": 0.26953125, "learning_rate": 9.895339959736012e-05, "loss": 0.0812, "step": 1024 }, { "epoch": 0.13181019332161686, "grad_norm": 0.26171875, "learning_rate": 9.895134152867793e-05, "loss": 0.0823, "step": 1025 }, { "epoch": 0.13193878863217454, "grad_norm": 0.216796875, "learning_rate": 9.894928145990118e-05, "loss": 0.0605, "step": 1026 }, { "epoch": 0.1320673839427322, "grad_norm": 0.2265625, "learning_rate": 9.894721939111401e-05, "loss": 0.0629, "step": 1027 }, { "epoch": 0.1321959792532899, "grad_norm": 0.26953125, "learning_rate": 9.894515532240069e-05, "loss": 0.0787, "step": 1028 }, { "epoch": 0.13232457456384758, "grad_norm": 0.24609375, "learning_rate": 9.894308925384554e-05, "loss": 0.0735, "step": 1029 }, { "epoch": 0.13245316987440525, "grad_norm": 0.25390625, "learning_rate": 9.894102118553298e-05, "loss": 0.0774, "step": 1030 }, { "epoch": 0.13258176518496292, "grad_norm": 0.275390625, "learning_rate": 9.893895111754755e-05, "loss": 0.0704, "step": 1031 }, { "epoch": 0.1327103604955206, "grad_norm": 0.236328125, "learning_rate": 9.893687904997378e-05, "loss": 0.0694, "step": 1032 }, { "epoch": 0.13283895580607827, "grad_norm": 0.2265625, "learning_rate": 9.893480498289635e-05, "loss": 0.0665, "step": 1033 }, { "epoch": 0.13296755111663594, "grad_norm": 0.259765625, "learning_rate": 9.893272891640001e-05, "loss": 0.0732, "step": 1034 }, { "epoch": 0.1330961464271936, "grad_norm": 0.4375, "learning_rate": 9.893065085056957e-05, "loss": 0.0778, "step": 1035 }, { "epoch": 0.13322474173775128, "grad_norm": 0.2392578125, "learning_rate": 9.892857078548994e-05, "loss": 0.0695, "step": 1036 }, { "epoch": 0.13335333704830898, "grad_norm": 0.251953125, "learning_rate": 9.892648872124615e-05, "loss": 0.0684, "step": 1037 }, { "epoch": 0.13348193235886666, "grad_norm": 0.232421875, "learning_rate": 9.892440465792321e-05, "loss": 0.0726, "step": 1038 }, { "epoch": 0.13361052766942433, "grad_norm": 0.2431640625, "learning_rate": 9.89223185956063e-05, "loss": 0.0673, "step": 1039 }, { "epoch": 0.133739122979982, "grad_norm": 0.2578125, "learning_rate": 9.892023053438066e-05, "loss": 0.0816, "step": 1040 }, { "epoch": 0.13386771829053967, "grad_norm": 0.2060546875, "learning_rate": 9.891814047433159e-05, "loss": 0.0591, "step": 1041 }, { "epoch": 0.13399631360109734, "grad_norm": 0.232421875, "learning_rate": 9.89160484155445e-05, "loss": 0.0771, "step": 1042 }, { "epoch": 0.13412490891165502, "grad_norm": 0.267578125, "learning_rate": 9.891395435810487e-05, "loss": 0.0855, "step": 1043 }, { "epoch": 0.1342535042222127, "grad_norm": 0.236328125, "learning_rate": 9.891185830209825e-05, "loss": 0.0699, "step": 1044 }, { "epoch": 0.13438209953277036, "grad_norm": 0.259765625, "learning_rate": 9.890976024761029e-05, "loss": 0.0807, "step": 1045 }, { "epoch": 0.13451069484332806, "grad_norm": 0.275390625, "learning_rate": 9.890766019472672e-05, "loss": 0.0838, "step": 1046 }, { "epoch": 0.13463929015388573, "grad_norm": 0.2734375, "learning_rate": 9.890555814353333e-05, "loss": 0.0889, "step": 1047 }, { "epoch": 0.1347678854644434, "grad_norm": 0.2236328125, "learning_rate": 9.890345409411601e-05, "loss": 0.0634, "step": 1048 }, { "epoch": 0.13489648077500108, "grad_norm": 0.248046875, "learning_rate": 9.890134804656075e-05, "loss": 0.0765, "step": 1049 }, { "epoch": 0.13502507608555875, "grad_norm": 0.224609375, "learning_rate": 9.889924000095358e-05, "loss": 0.0629, "step": 1050 }, { "epoch": 0.13515367139611642, "grad_norm": 0.23046875, "learning_rate": 9.889712995738063e-05, "loss": 0.0722, "step": 1051 }, { "epoch": 0.1352822667066741, "grad_norm": 0.224609375, "learning_rate": 9.889501791592812e-05, "loss": 0.0718, "step": 1052 }, { "epoch": 0.13541086201723176, "grad_norm": 0.2255859375, "learning_rate": 9.889290387668235e-05, "loss": 0.072, "step": 1053 }, { "epoch": 0.13553945732778944, "grad_norm": 0.23828125, "learning_rate": 9.88907878397297e-05, "loss": 0.0646, "step": 1054 }, { "epoch": 0.1356680526383471, "grad_norm": 0.21875, "learning_rate": 9.888866980515663e-05, "loss": 0.0648, "step": 1055 }, { "epoch": 0.1357966479489048, "grad_norm": 0.25, "learning_rate": 9.888654977304965e-05, "loss": 0.0734, "step": 1056 }, { "epoch": 0.13592524325946248, "grad_norm": 0.23828125, "learning_rate": 9.888442774349542e-05, "loss": 0.0573, "step": 1057 }, { "epoch": 0.13605383857002015, "grad_norm": 0.2177734375, "learning_rate": 9.888230371658063e-05, "loss": 0.062, "step": 1058 }, { "epoch": 0.13618243388057782, "grad_norm": 0.2275390625, "learning_rate": 9.888017769239208e-05, "loss": 0.0633, "step": 1059 }, { "epoch": 0.1363110291911355, "grad_norm": 0.2431640625, "learning_rate": 9.887804967101659e-05, "loss": 0.075, "step": 1060 }, { "epoch": 0.13643962450169317, "grad_norm": 0.2314453125, "learning_rate": 9.887591965254116e-05, "loss": 0.0743, "step": 1061 }, { "epoch": 0.13656821981225084, "grad_norm": 0.25390625, "learning_rate": 9.88737876370528e-05, "loss": 0.0785, "step": 1062 }, { "epoch": 0.1366968151228085, "grad_norm": 0.2216796875, "learning_rate": 9.887165362463861e-05, "loss": 0.0577, "step": 1063 }, { "epoch": 0.13682541043336618, "grad_norm": 0.2119140625, "learning_rate": 9.886951761538579e-05, "loss": 0.0569, "step": 1064 }, { "epoch": 0.13695400574392388, "grad_norm": 0.2099609375, "learning_rate": 9.886737960938164e-05, "loss": 0.0672, "step": 1065 }, { "epoch": 0.13708260105448156, "grad_norm": 0.20703125, "learning_rate": 9.886523960671348e-05, "loss": 0.0614, "step": 1066 }, { "epoch": 0.13721119636503923, "grad_norm": 0.23046875, "learning_rate": 9.886309760746877e-05, "loss": 0.0628, "step": 1067 }, { "epoch": 0.1373397916755969, "grad_norm": 0.21875, "learning_rate": 9.886095361173501e-05, "loss": 0.0534, "step": 1068 }, { "epoch": 0.13746838698615457, "grad_norm": 0.2431640625, "learning_rate": 9.885880761959984e-05, "loss": 0.0782, "step": 1069 }, { "epoch": 0.13759698229671224, "grad_norm": 0.224609375, "learning_rate": 9.88566596311509e-05, "loss": 0.0747, "step": 1070 }, { "epoch": 0.13772557760726992, "grad_norm": 0.244140625, "learning_rate": 9.885450964647598e-05, "loss": 0.0701, "step": 1071 }, { "epoch": 0.1378541729178276, "grad_norm": 0.220703125, "learning_rate": 9.885235766566291e-05, "loss": 0.0653, "step": 1072 }, { "epoch": 0.13798276822838526, "grad_norm": 0.2275390625, "learning_rate": 9.885020368879962e-05, "loss": 0.0637, "step": 1073 }, { "epoch": 0.13811136353894293, "grad_norm": 0.232421875, "learning_rate": 9.884804771597413e-05, "loss": 0.0735, "step": 1074 }, { "epoch": 0.13823995884950063, "grad_norm": 0.2236328125, "learning_rate": 9.884588974727453e-05, "loss": 0.0677, "step": 1075 }, { "epoch": 0.1383685541600583, "grad_norm": 0.224609375, "learning_rate": 9.8843729782789e-05, "loss": 0.0647, "step": 1076 }, { "epoch": 0.13849714947061598, "grad_norm": 0.263671875, "learning_rate": 9.884156782260576e-05, "loss": 0.0833, "step": 1077 }, { "epoch": 0.13862574478117365, "grad_norm": 0.28125, "learning_rate": 9.883940386681317e-05, "loss": 0.0604, "step": 1078 }, { "epoch": 0.13875434009173132, "grad_norm": 0.259765625, "learning_rate": 9.883723791549964e-05, "loss": 0.0702, "step": 1079 }, { "epoch": 0.138882935402289, "grad_norm": 0.2412109375, "learning_rate": 9.883506996875366e-05, "loss": 0.0739, "step": 1080 }, { "epoch": 0.13901153071284666, "grad_norm": 0.2373046875, "learning_rate": 9.883290002666384e-05, "loss": 0.0642, "step": 1081 }, { "epoch": 0.13914012602340434, "grad_norm": 0.271484375, "learning_rate": 9.883072808931882e-05, "loss": 0.0814, "step": 1082 }, { "epoch": 0.139268721333962, "grad_norm": 0.2294921875, "learning_rate": 9.882855415680733e-05, "loss": 0.0619, "step": 1083 }, { "epoch": 0.1393973166445197, "grad_norm": 0.2412109375, "learning_rate": 9.882637822921821e-05, "loss": 0.0665, "step": 1084 }, { "epoch": 0.13952591195507738, "grad_norm": 0.251953125, "learning_rate": 9.882420030664038e-05, "loss": 0.0824, "step": 1085 }, { "epoch": 0.13965450726563505, "grad_norm": 0.2392578125, "learning_rate": 9.88220203891628e-05, "loss": 0.0769, "step": 1086 }, { "epoch": 0.13978310257619272, "grad_norm": 0.220703125, "learning_rate": 9.881983847687455e-05, "loss": 0.0682, "step": 1087 }, { "epoch": 0.1399116978867504, "grad_norm": 0.2421875, "learning_rate": 9.881765456986478e-05, "loss": 0.0725, "step": 1088 }, { "epoch": 0.14004029319730807, "grad_norm": 0.2353515625, "learning_rate": 9.881546866822272e-05, "loss": 0.0717, "step": 1089 }, { "epoch": 0.14016888850786574, "grad_norm": 0.224609375, "learning_rate": 9.881328077203769e-05, "loss": 0.0677, "step": 1090 }, { "epoch": 0.1402974838184234, "grad_norm": 0.234375, "learning_rate": 9.881109088139909e-05, "loss": 0.0718, "step": 1091 }, { "epoch": 0.14042607912898109, "grad_norm": 0.2431640625, "learning_rate": 9.880889899639638e-05, "loss": 0.0714, "step": 1092 }, { "epoch": 0.14055467443953878, "grad_norm": 0.228515625, "learning_rate": 9.880670511711913e-05, "loss": 0.0708, "step": 1093 }, { "epoch": 0.14068326975009646, "grad_norm": 0.234375, "learning_rate": 9.880450924365697e-05, "loss": 0.0662, "step": 1094 }, { "epoch": 0.14081186506065413, "grad_norm": 0.232421875, "learning_rate": 9.880231137609962e-05, "loss": 0.0672, "step": 1095 }, { "epoch": 0.1409404603712118, "grad_norm": 0.24609375, "learning_rate": 9.880011151453689e-05, "loss": 0.06, "step": 1096 }, { "epoch": 0.14106905568176947, "grad_norm": 0.2138671875, "learning_rate": 9.879790965905866e-05, "loss": 0.0642, "step": 1097 }, { "epoch": 0.14119765099232715, "grad_norm": 0.2734375, "learning_rate": 9.879570580975491e-05, "loss": 0.0698, "step": 1098 }, { "epoch": 0.14132624630288482, "grad_norm": 0.2138671875, "learning_rate": 9.879349996671568e-05, "loss": 0.0594, "step": 1099 }, { "epoch": 0.1414548416134425, "grad_norm": 0.2138671875, "learning_rate": 9.879129213003107e-05, "loss": 0.0662, "step": 1100 }, { "epoch": 0.14158343692400016, "grad_norm": 0.228515625, "learning_rate": 9.878908229979131e-05, "loss": 0.0659, "step": 1101 }, { "epoch": 0.14171203223455783, "grad_norm": 0.224609375, "learning_rate": 9.87868704760867e-05, "loss": 0.0762, "step": 1102 }, { "epoch": 0.14184062754511553, "grad_norm": 0.236328125, "learning_rate": 9.878465665900761e-05, "loss": 0.0711, "step": 1103 }, { "epoch": 0.1419692228556732, "grad_norm": 0.25390625, "learning_rate": 9.878244084864449e-05, "loss": 0.0823, "step": 1104 }, { "epoch": 0.14209781816623088, "grad_norm": 0.21875, "learning_rate": 9.878022304508788e-05, "loss": 0.0666, "step": 1105 }, { "epoch": 0.14222641347678855, "grad_norm": 0.25390625, "learning_rate": 9.877800324842836e-05, "loss": 0.0716, "step": 1106 }, { "epoch": 0.14235500878734622, "grad_norm": 0.25390625, "learning_rate": 9.877578145875669e-05, "loss": 0.0825, "step": 1107 }, { "epoch": 0.1424836040979039, "grad_norm": 0.220703125, "learning_rate": 9.877355767616361e-05, "loss": 0.0646, "step": 1108 }, { "epoch": 0.14261219940846157, "grad_norm": 0.2294921875, "learning_rate": 9.877133190073998e-05, "loss": 0.0662, "step": 1109 }, { "epoch": 0.14274079471901924, "grad_norm": 0.240234375, "learning_rate": 9.876910413257676e-05, "loss": 0.0709, "step": 1110 }, { "epoch": 0.1428693900295769, "grad_norm": 0.2275390625, "learning_rate": 9.876687437176497e-05, "loss": 0.0621, "step": 1111 }, { "epoch": 0.1429979853401346, "grad_norm": 0.220703125, "learning_rate": 9.876464261839572e-05, "loss": 0.0693, "step": 1112 }, { "epoch": 0.14312658065069228, "grad_norm": 0.2109375, "learning_rate": 9.876240887256019e-05, "loss": 0.065, "step": 1113 }, { "epoch": 0.14325517596124995, "grad_norm": 0.26171875, "learning_rate": 9.876017313434963e-05, "loss": 0.0815, "step": 1114 }, { "epoch": 0.14338377127180763, "grad_norm": 0.2275390625, "learning_rate": 9.875793540385542e-05, "loss": 0.0599, "step": 1115 }, { "epoch": 0.1435123665823653, "grad_norm": 0.25, "learning_rate": 9.875569568116897e-05, "loss": 0.0718, "step": 1116 }, { "epoch": 0.14364096189292297, "grad_norm": 0.265625, "learning_rate": 9.87534539663818e-05, "loss": 0.0603, "step": 1117 }, { "epoch": 0.14376955720348064, "grad_norm": 0.2099609375, "learning_rate": 9.875121025958552e-05, "loss": 0.0633, "step": 1118 }, { "epoch": 0.1438981525140383, "grad_norm": 0.2578125, "learning_rate": 9.874896456087176e-05, "loss": 0.0763, "step": 1119 }, { "epoch": 0.14402674782459599, "grad_norm": 0.271484375, "learning_rate": 9.874671687033235e-05, "loss": 0.0952, "step": 1120 }, { "epoch": 0.14415534313515369, "grad_norm": 0.232421875, "learning_rate": 9.874446718805905e-05, "loss": 0.0625, "step": 1121 }, { "epoch": 0.14428393844571136, "grad_norm": 0.2294921875, "learning_rate": 9.874221551414383e-05, "loss": 0.0688, "step": 1122 }, { "epoch": 0.14441253375626903, "grad_norm": 0.2412109375, "learning_rate": 9.873996184867866e-05, "loss": 0.0794, "step": 1123 }, { "epoch": 0.1445411290668267, "grad_norm": 0.251953125, "learning_rate": 9.873770619175564e-05, "loss": 0.0749, "step": 1124 }, { "epoch": 0.14466972437738437, "grad_norm": 0.216796875, "learning_rate": 9.873544854346693e-05, "loss": 0.0599, "step": 1125 }, { "epoch": 0.14479831968794205, "grad_norm": 0.224609375, "learning_rate": 9.87331889039048e-05, "loss": 0.071, "step": 1126 }, { "epoch": 0.14492691499849972, "grad_norm": 0.24609375, "learning_rate": 9.87309272731615e-05, "loss": 0.0787, "step": 1127 }, { "epoch": 0.1450555103090574, "grad_norm": 0.20703125, "learning_rate": 9.872866365132954e-05, "loss": 0.0571, "step": 1128 }, { "epoch": 0.14518410561961506, "grad_norm": 0.2265625, "learning_rate": 9.872639803850133e-05, "loss": 0.0658, "step": 1129 }, { "epoch": 0.14531270093017273, "grad_norm": 0.2275390625, "learning_rate": 9.872413043476948e-05, "loss": 0.0682, "step": 1130 }, { "epoch": 0.14544129624073043, "grad_norm": 0.25, "learning_rate": 9.872186084022662e-05, "loss": 0.065, "step": 1131 }, { "epoch": 0.1455698915512881, "grad_norm": 0.255859375, "learning_rate": 9.871958925496549e-05, "loss": 0.0633, "step": 1132 }, { "epoch": 0.14569848686184578, "grad_norm": 0.228515625, "learning_rate": 9.871731567907893e-05, "loss": 0.0655, "step": 1133 }, { "epoch": 0.14582708217240345, "grad_norm": 0.22265625, "learning_rate": 9.87150401126598e-05, "loss": 0.0556, "step": 1134 }, { "epoch": 0.14595567748296112, "grad_norm": 0.2578125, "learning_rate": 9.871276255580108e-05, "loss": 0.0671, "step": 1135 }, { "epoch": 0.1460842727935188, "grad_norm": 0.24609375, "learning_rate": 9.871048300859584e-05, "loss": 0.0798, "step": 1136 }, { "epoch": 0.14621286810407647, "grad_norm": 0.28125, "learning_rate": 9.870820147113722e-05, "loss": 0.0744, "step": 1137 }, { "epoch": 0.14634146341463414, "grad_norm": 0.20703125, "learning_rate": 9.870591794351845e-05, "loss": 0.0608, "step": 1138 }, { "epoch": 0.1464700587251918, "grad_norm": 0.2099609375, "learning_rate": 9.87036324258328e-05, "loss": 0.0576, "step": 1139 }, { "epoch": 0.1465986540357495, "grad_norm": 0.25, "learning_rate": 9.870134491817368e-05, "loss": 0.0607, "step": 1140 }, { "epoch": 0.14672724934630718, "grad_norm": 0.2412109375, "learning_rate": 9.869905542063456e-05, "loss": 0.0743, "step": 1141 }, { "epoch": 0.14685584465686485, "grad_norm": 0.2578125, "learning_rate": 9.869676393330898e-05, "loss": 0.0662, "step": 1142 }, { "epoch": 0.14698443996742253, "grad_norm": 0.21484375, "learning_rate": 9.869447045629056e-05, "loss": 0.0638, "step": 1143 }, { "epoch": 0.1471130352779802, "grad_norm": 0.216796875, "learning_rate": 9.8692174989673e-05, "loss": 0.0581, "step": 1144 }, { "epoch": 0.14724163058853787, "grad_norm": 0.2236328125, "learning_rate": 9.86898775335501e-05, "loss": 0.0743, "step": 1145 }, { "epoch": 0.14737022589909554, "grad_norm": 0.2119140625, "learning_rate": 9.868757808801574e-05, "loss": 0.0627, "step": 1146 }, { "epoch": 0.1474988212096532, "grad_norm": 0.23828125, "learning_rate": 9.868527665316387e-05, "loss": 0.0808, "step": 1147 }, { "epoch": 0.14762741652021089, "grad_norm": 0.2216796875, "learning_rate": 9.868297322908852e-05, "loss": 0.0696, "step": 1148 }, { "epoch": 0.14775601183076856, "grad_norm": 0.197265625, "learning_rate": 9.86806678158838e-05, "loss": 0.0501, "step": 1149 }, { "epoch": 0.14788460714132626, "grad_norm": 0.2412109375, "learning_rate": 9.867836041364391e-05, "loss": 0.0776, "step": 1150 }, { "epoch": 0.14801320245188393, "grad_norm": 0.240234375, "learning_rate": 9.867605102246314e-05, "loss": 0.0763, "step": 1151 }, { "epoch": 0.1481417977624416, "grad_norm": 0.2294921875, "learning_rate": 9.867373964243582e-05, "loss": 0.0688, "step": 1152 }, { "epoch": 0.14827039307299927, "grad_norm": 0.25390625, "learning_rate": 9.867142627365642e-05, "loss": 0.0715, "step": 1153 }, { "epoch": 0.14839898838355695, "grad_norm": 0.2109375, "learning_rate": 9.866911091621945e-05, "loss": 0.0675, "step": 1154 }, { "epoch": 0.14852758369411462, "grad_norm": 0.21484375, "learning_rate": 9.866679357021951e-05, "loss": 0.0584, "step": 1155 }, { "epoch": 0.1486561790046723, "grad_norm": 0.2294921875, "learning_rate": 9.86644742357513e-05, "loss": 0.0728, "step": 1156 }, { "epoch": 0.14878477431522996, "grad_norm": 0.2412109375, "learning_rate": 9.866215291290955e-05, "loss": 0.0665, "step": 1157 }, { "epoch": 0.14891336962578763, "grad_norm": 0.2314453125, "learning_rate": 9.865982960178914e-05, "loss": 0.0671, "step": 1158 }, { "epoch": 0.14904196493634533, "grad_norm": 0.220703125, "learning_rate": 9.865750430248499e-05, "loss": 0.0701, "step": 1159 }, { "epoch": 0.149170560246903, "grad_norm": 0.2109375, "learning_rate": 9.865517701509209e-05, "loss": 0.0658, "step": 1160 }, { "epoch": 0.14929915555746068, "grad_norm": 0.2578125, "learning_rate": 9.865284773970556e-05, "loss": 0.0675, "step": 1161 }, { "epoch": 0.14942775086801835, "grad_norm": 0.23046875, "learning_rate": 9.865051647642055e-05, "loss": 0.0749, "step": 1162 }, { "epoch": 0.14955634617857602, "grad_norm": 0.224609375, "learning_rate": 9.864818322533233e-05, "loss": 0.0708, "step": 1163 }, { "epoch": 0.1496849414891337, "grad_norm": 0.251953125, "learning_rate": 9.864584798653622e-05, "loss": 0.0777, "step": 1164 }, { "epoch": 0.14981353679969137, "grad_norm": 0.263671875, "learning_rate": 9.864351076012763e-05, "loss": 0.0773, "step": 1165 }, { "epoch": 0.14994213211024904, "grad_norm": 0.255859375, "learning_rate": 9.864117154620207e-05, "loss": 0.0644, "step": 1166 }, { "epoch": 0.1500707274208067, "grad_norm": 0.228515625, "learning_rate": 9.863883034485513e-05, "loss": 0.0546, "step": 1167 }, { "epoch": 0.1501993227313644, "grad_norm": 0.2265625, "learning_rate": 9.863648715618243e-05, "loss": 0.0629, "step": 1168 }, { "epoch": 0.15032791804192208, "grad_norm": 0.25, "learning_rate": 9.863414198027974e-05, "loss": 0.0721, "step": 1169 }, { "epoch": 0.15045651335247975, "grad_norm": 0.2109375, "learning_rate": 9.863179481724288e-05, "loss": 0.0662, "step": 1170 }, { "epoch": 0.15058510866303743, "grad_norm": 0.240234375, "learning_rate": 9.862944566716771e-05, "loss": 0.0761, "step": 1171 }, { "epoch": 0.1507137039735951, "grad_norm": 0.240234375, "learning_rate": 9.862709453015028e-05, "loss": 0.0677, "step": 1172 }, { "epoch": 0.15084229928415277, "grad_norm": 0.25390625, "learning_rate": 9.862474140628662e-05, "loss": 0.076, "step": 1173 }, { "epoch": 0.15097089459471044, "grad_norm": 0.2392578125, "learning_rate": 9.862238629567289e-05, "loss": 0.0704, "step": 1174 }, { "epoch": 0.15109948990526811, "grad_norm": 0.22265625, "learning_rate": 9.862002919840528e-05, "loss": 0.0682, "step": 1175 }, { "epoch": 0.15122808521582579, "grad_norm": 0.22265625, "learning_rate": 9.861767011458014e-05, "loss": 0.0669, "step": 1176 }, { "epoch": 0.15135668052638346, "grad_norm": 0.2275390625, "learning_rate": 9.861530904429385e-05, "loss": 0.0654, "step": 1177 }, { "epoch": 0.15148527583694116, "grad_norm": 0.2353515625, "learning_rate": 9.861294598764286e-05, "loss": 0.0709, "step": 1178 }, { "epoch": 0.15161387114749883, "grad_norm": 0.248046875, "learning_rate": 9.861058094472374e-05, "loss": 0.0628, "step": 1179 }, { "epoch": 0.1517424664580565, "grad_norm": 0.28125, "learning_rate": 9.860821391563313e-05, "loss": 0.0704, "step": 1180 }, { "epoch": 0.15187106176861417, "grad_norm": 0.251953125, "learning_rate": 9.860584490046771e-05, "loss": 0.0794, "step": 1181 }, { "epoch": 0.15199965707917185, "grad_norm": 0.244140625, "learning_rate": 9.860347389932431e-05, "loss": 0.0642, "step": 1182 }, { "epoch": 0.15212825238972952, "grad_norm": 0.2294921875, "learning_rate": 9.860110091229981e-05, "loss": 0.0669, "step": 1183 }, { "epoch": 0.1522568477002872, "grad_norm": 0.26953125, "learning_rate": 9.859872593949113e-05, "loss": 0.0633, "step": 1184 }, { "epoch": 0.15238544301084486, "grad_norm": 0.2392578125, "learning_rate": 9.859634898099535e-05, "loss": 0.0714, "step": 1185 }, { "epoch": 0.15251403832140253, "grad_norm": 0.2236328125, "learning_rate": 9.859397003690956e-05, "loss": 0.0704, "step": 1186 }, { "epoch": 0.15264263363196023, "grad_norm": 0.2294921875, "learning_rate": 9.859158910733097e-05, "loss": 0.0712, "step": 1187 }, { "epoch": 0.1527712289425179, "grad_norm": 0.189453125, "learning_rate": 9.858920619235688e-05, "loss": 0.0493, "step": 1188 }, { "epoch": 0.15289982425307558, "grad_norm": 0.2353515625, "learning_rate": 9.858682129208461e-05, "loss": 0.0664, "step": 1189 }, { "epoch": 0.15302841956363325, "grad_norm": 0.21875, "learning_rate": 9.858443440661165e-05, "loss": 0.0611, "step": 1190 }, { "epoch": 0.15315701487419092, "grad_norm": 0.240234375, "learning_rate": 9.85820455360355e-05, "loss": 0.0803, "step": 1191 }, { "epoch": 0.1532856101847486, "grad_norm": 0.2177734375, "learning_rate": 9.857965468045376e-05, "loss": 0.0599, "step": 1192 }, { "epoch": 0.15341420549530627, "grad_norm": 0.2333984375, "learning_rate": 9.857726183996414e-05, "loss": 0.0621, "step": 1193 }, { "epoch": 0.15354280080586394, "grad_norm": 0.263671875, "learning_rate": 9.85748670146644e-05, "loss": 0.0741, "step": 1194 }, { "epoch": 0.1536713961164216, "grad_norm": 0.23828125, "learning_rate": 9.857247020465239e-05, "loss": 0.0559, "step": 1195 }, { "epoch": 0.1537999914269793, "grad_norm": 0.244140625, "learning_rate": 9.857007141002603e-05, "loss": 0.0692, "step": 1196 }, { "epoch": 0.15392858673753698, "grad_norm": 0.2451171875, "learning_rate": 9.856767063088335e-05, "loss": 0.0796, "step": 1197 }, { "epoch": 0.15405718204809465, "grad_norm": 0.2158203125, "learning_rate": 9.856526786732244e-05, "loss": 0.0614, "step": 1198 }, { "epoch": 0.15418577735865233, "grad_norm": 0.232421875, "learning_rate": 9.856286311944146e-05, "loss": 0.0758, "step": 1199 }, { "epoch": 0.15431437266921, "grad_norm": 0.26953125, "learning_rate": 9.856045638733868e-05, "loss": 0.0749, "step": 1200 }, { "epoch": 0.15444296797976767, "grad_norm": 0.25390625, "learning_rate": 9.855804767111243e-05, "loss": 0.0597, "step": 1201 }, { "epoch": 0.15457156329032534, "grad_norm": 0.251953125, "learning_rate": 9.855563697086111e-05, "loss": 0.0618, "step": 1202 }, { "epoch": 0.15470015860088301, "grad_norm": 0.2265625, "learning_rate": 9.855322428668325e-05, "loss": 0.0686, "step": 1203 }, { "epoch": 0.1548287539114407, "grad_norm": 0.26171875, "learning_rate": 9.855080961867742e-05, "loss": 0.0794, "step": 1204 }, { "epoch": 0.15495734922199836, "grad_norm": 0.240234375, "learning_rate": 9.854839296694228e-05, "loss": 0.0686, "step": 1205 }, { "epoch": 0.15508594453255606, "grad_norm": 0.224609375, "learning_rate": 9.854597433157655e-05, "loss": 0.0657, "step": 1206 }, { "epoch": 0.15521453984311373, "grad_norm": 0.283203125, "learning_rate": 9.854355371267907e-05, "loss": 0.0738, "step": 1207 }, { "epoch": 0.1553431351536714, "grad_norm": 0.2392578125, "learning_rate": 9.854113111034875e-05, "loss": 0.0737, "step": 1208 }, { "epoch": 0.15547173046422907, "grad_norm": 0.244140625, "learning_rate": 9.853870652468457e-05, "loss": 0.0635, "step": 1209 }, { "epoch": 0.15560032577478675, "grad_norm": 0.24609375, "learning_rate": 9.853627995578558e-05, "loss": 0.0806, "step": 1210 }, { "epoch": 0.15572892108534442, "grad_norm": 0.251953125, "learning_rate": 9.853385140375095e-05, "loss": 0.0647, "step": 1211 }, { "epoch": 0.1558575163959021, "grad_norm": 0.265625, "learning_rate": 9.853142086867991e-05, "loss": 0.0643, "step": 1212 }, { "epoch": 0.15598611170645976, "grad_norm": 0.22265625, "learning_rate": 9.852898835067175e-05, "loss": 0.0608, "step": 1213 }, { "epoch": 0.15611470701701743, "grad_norm": 0.208984375, "learning_rate": 9.852655384982585e-05, "loss": 0.0605, "step": 1214 }, { "epoch": 0.15624330232757513, "grad_norm": 0.232421875, "learning_rate": 9.852411736624172e-05, "loss": 0.0675, "step": 1215 }, { "epoch": 0.1563718976381328, "grad_norm": 0.23046875, "learning_rate": 9.852167890001889e-05, "loss": 0.0686, "step": 1216 }, { "epoch": 0.15650049294869048, "grad_norm": 0.2197265625, "learning_rate": 9.851923845125697e-05, "loss": 0.072, "step": 1217 }, { "epoch": 0.15662908825924815, "grad_norm": 0.224609375, "learning_rate": 9.851679602005573e-05, "loss": 0.0629, "step": 1218 }, { "epoch": 0.15675768356980582, "grad_norm": 0.205078125, "learning_rate": 9.85143516065149e-05, "loss": 0.053, "step": 1219 }, { "epoch": 0.1568862788803635, "grad_norm": 0.2490234375, "learning_rate": 9.85119052107344e-05, "loss": 0.064, "step": 1220 }, { "epoch": 0.15701487419092117, "grad_norm": 0.2392578125, "learning_rate": 9.850945683281419e-05, "loss": 0.0749, "step": 1221 }, { "epoch": 0.15714346950147884, "grad_norm": 0.2353515625, "learning_rate": 9.850700647285427e-05, "loss": 0.0772, "step": 1222 }, { "epoch": 0.1572720648120365, "grad_norm": 0.244140625, "learning_rate": 9.850455413095479e-05, "loss": 0.0697, "step": 1223 }, { "epoch": 0.15740066012259418, "grad_norm": 0.220703125, "learning_rate": 9.850209980721593e-05, "loss": 0.071, "step": 1224 }, { "epoch": 0.15752925543315188, "grad_norm": 0.2255859375, "learning_rate": 9.849964350173799e-05, "loss": 0.0601, "step": 1225 }, { "epoch": 0.15765785074370955, "grad_norm": 0.255859375, "learning_rate": 9.849718521462133e-05, "loss": 0.0748, "step": 1226 }, { "epoch": 0.15778644605426723, "grad_norm": 0.224609375, "learning_rate": 9.849472494596637e-05, "loss": 0.0725, "step": 1227 }, { "epoch": 0.1579150413648249, "grad_norm": 0.236328125, "learning_rate": 9.849226269587366e-05, "loss": 0.0684, "step": 1228 }, { "epoch": 0.15804363667538257, "grad_norm": 0.2294921875, "learning_rate": 9.84897984644438e-05, "loss": 0.0655, "step": 1229 }, { "epoch": 0.15817223198594024, "grad_norm": 0.2109375, "learning_rate": 9.848733225177745e-05, "loss": 0.0662, "step": 1230 }, { "epoch": 0.15830082729649791, "grad_norm": 0.259765625, "learning_rate": 9.848486405797541e-05, "loss": 0.0611, "step": 1231 }, { "epoch": 0.1584294226070556, "grad_norm": 0.251953125, "learning_rate": 9.848239388313852e-05, "loss": 0.0675, "step": 1232 }, { "epoch": 0.15855801791761326, "grad_norm": 0.24609375, "learning_rate": 9.847992172736769e-05, "loss": 0.0742, "step": 1233 }, { "epoch": 0.15868661322817096, "grad_norm": 0.30859375, "learning_rate": 9.847744759076396e-05, "loss": 0.0678, "step": 1234 }, { "epoch": 0.15881520853872863, "grad_norm": 0.2001953125, "learning_rate": 9.847497147342839e-05, "loss": 0.0525, "step": 1235 }, { "epoch": 0.1589438038492863, "grad_norm": 0.2294921875, "learning_rate": 9.847249337546216e-05, "loss": 0.0636, "step": 1236 }, { "epoch": 0.15907239915984397, "grad_norm": 0.20703125, "learning_rate": 9.847001329696653e-05, "loss": 0.0577, "step": 1237 }, { "epoch": 0.15920099447040165, "grad_norm": 0.23046875, "learning_rate": 9.846753123804282e-05, "loss": 0.0662, "step": 1238 }, { "epoch": 0.15932958978095932, "grad_norm": 0.2421875, "learning_rate": 9.846504719879245e-05, "loss": 0.0704, "step": 1239 }, { "epoch": 0.159458185091517, "grad_norm": 0.2255859375, "learning_rate": 9.846256117931693e-05, "loss": 0.063, "step": 1240 }, { "epoch": 0.15958678040207466, "grad_norm": 0.240234375, "learning_rate": 9.846007317971784e-05, "loss": 0.0692, "step": 1241 }, { "epoch": 0.15971537571263233, "grad_norm": 0.2314453125, "learning_rate": 9.845758320009679e-05, "loss": 0.0742, "step": 1242 }, { "epoch": 0.15984397102319003, "grad_norm": 0.21875, "learning_rate": 9.845509124055556e-05, "loss": 0.0675, "step": 1243 }, { "epoch": 0.1599725663337477, "grad_norm": 0.28125, "learning_rate": 9.845259730119597e-05, "loss": 0.0671, "step": 1244 }, { "epoch": 0.16010116164430538, "grad_norm": 0.2158203125, "learning_rate": 9.84501013821199e-05, "loss": 0.061, "step": 1245 }, { "epoch": 0.16022975695486305, "grad_norm": 0.2431640625, "learning_rate": 9.844760348342931e-05, "loss": 0.0727, "step": 1246 }, { "epoch": 0.16035835226542072, "grad_norm": 0.2333984375, "learning_rate": 9.844510360522632e-05, "loss": 0.0698, "step": 1247 }, { "epoch": 0.1604869475759784, "grad_norm": 0.2373046875, "learning_rate": 9.844260174761303e-05, "loss": 0.0652, "step": 1248 }, { "epoch": 0.16061554288653607, "grad_norm": 0.2255859375, "learning_rate": 9.844009791069167e-05, "loss": 0.0669, "step": 1249 }, { "epoch": 0.16074413819709374, "grad_norm": 0.251953125, "learning_rate": 9.843759209456453e-05, "loss": 0.07, "step": 1250 }, { "epoch": 0.1608727335076514, "grad_norm": 0.236328125, "learning_rate": 9.843508429933404e-05, "loss": 0.0567, "step": 1251 }, { "epoch": 0.16100132881820908, "grad_norm": 0.224609375, "learning_rate": 9.843257452510262e-05, "loss": 0.0621, "step": 1252 }, { "epoch": 0.16112992412876678, "grad_norm": 0.2275390625, "learning_rate": 9.843006277197285e-05, "loss": 0.0699, "step": 1253 }, { "epoch": 0.16125851943932445, "grad_norm": 0.228515625, "learning_rate": 9.842754904004731e-05, "loss": 0.0638, "step": 1254 }, { "epoch": 0.16138711474988213, "grad_norm": 0.2490234375, "learning_rate": 9.842503332942876e-05, "loss": 0.0661, "step": 1255 }, { "epoch": 0.1615157100604398, "grad_norm": 0.25390625, "learning_rate": 9.842251564021998e-05, "loss": 0.0724, "step": 1256 }, { "epoch": 0.16164430537099747, "grad_norm": 0.275390625, "learning_rate": 9.841999597252381e-05, "loss": 0.0682, "step": 1257 }, { "epoch": 0.16177290068155514, "grad_norm": 0.2333984375, "learning_rate": 9.841747432644321e-05, "loss": 0.078, "step": 1258 }, { "epoch": 0.16190149599211281, "grad_norm": 0.2470703125, "learning_rate": 9.841495070208124e-05, "loss": 0.0635, "step": 1259 }, { "epoch": 0.1620300913026705, "grad_norm": 0.22265625, "learning_rate": 9.841242509954098e-05, "loss": 0.0569, "step": 1260 }, { "epoch": 0.16215868661322816, "grad_norm": 0.2392578125, "learning_rate": 9.840989751892563e-05, "loss": 0.0683, "step": 1261 }, { "epoch": 0.16228728192378586, "grad_norm": 0.232421875, "learning_rate": 9.840736796033848e-05, "loss": 0.0679, "step": 1262 }, { "epoch": 0.16241587723434353, "grad_norm": 0.271484375, "learning_rate": 9.840483642388287e-05, "loss": 0.0794, "step": 1263 }, { "epoch": 0.1625444725449012, "grad_norm": 0.2041015625, "learning_rate": 9.840230290966223e-05, "loss": 0.0589, "step": 1264 }, { "epoch": 0.16267306785545887, "grad_norm": 0.212890625, "learning_rate": 9.839976741778011e-05, "loss": 0.0593, "step": 1265 }, { "epoch": 0.16280166316601655, "grad_norm": 0.2470703125, "learning_rate": 9.839722994834006e-05, "loss": 0.0716, "step": 1266 }, { "epoch": 0.16293025847657422, "grad_norm": 0.224609375, "learning_rate": 9.83946905014458e-05, "loss": 0.072, "step": 1267 }, { "epoch": 0.1630588537871319, "grad_norm": 0.2294921875, "learning_rate": 9.839214907720105e-05, "loss": 0.0614, "step": 1268 }, { "epoch": 0.16318744909768956, "grad_norm": 0.263671875, "learning_rate": 9.83896056757097e-05, "loss": 0.0759, "step": 1269 }, { "epoch": 0.16331604440824724, "grad_norm": 0.2412109375, "learning_rate": 9.83870602970756e-05, "loss": 0.068, "step": 1270 }, { "epoch": 0.1634446397188049, "grad_norm": 0.1962890625, "learning_rate": 9.838451294140283e-05, "loss": 0.0475, "step": 1271 }, { "epoch": 0.1635732350293626, "grad_norm": 0.2255859375, "learning_rate": 9.838196360879541e-05, "loss": 0.0647, "step": 1272 }, { "epoch": 0.16370183033992028, "grad_norm": 0.232421875, "learning_rate": 9.837941229935753e-05, "loss": 0.0708, "step": 1273 }, { "epoch": 0.16383042565047795, "grad_norm": 0.267578125, "learning_rate": 9.837685901319342e-05, "loss": 0.0712, "step": 1274 }, { "epoch": 0.16395902096103562, "grad_norm": 0.2412109375, "learning_rate": 9.837430375040744e-05, "loss": 0.0742, "step": 1275 }, { "epoch": 0.1640876162715933, "grad_norm": 0.259765625, "learning_rate": 9.837174651110395e-05, "loss": 0.0687, "step": 1276 }, { "epoch": 0.16421621158215097, "grad_norm": 0.2294921875, "learning_rate": 9.836918729538747e-05, "loss": 0.0657, "step": 1277 }, { "epoch": 0.16434480689270864, "grad_norm": 0.240234375, "learning_rate": 9.836662610336254e-05, "loss": 0.0674, "step": 1278 }, { "epoch": 0.1644734022032663, "grad_norm": 0.259765625, "learning_rate": 9.836406293513381e-05, "loss": 0.0754, "step": 1279 }, { "epoch": 0.16460199751382398, "grad_norm": 0.2119140625, "learning_rate": 9.836149779080603e-05, "loss": 0.0551, "step": 1280 }, { "epoch": 0.16473059282438168, "grad_norm": 0.2431640625, "learning_rate": 9.835893067048398e-05, "loss": 0.0772, "step": 1281 }, { "epoch": 0.16485918813493936, "grad_norm": 0.224609375, "learning_rate": 9.835636157427258e-05, "loss": 0.0649, "step": 1282 }, { "epoch": 0.16498778344549703, "grad_norm": 0.216796875, "learning_rate": 9.835379050227678e-05, "loss": 0.0632, "step": 1283 }, { "epoch": 0.1651163787560547, "grad_norm": 0.232421875, "learning_rate": 9.835121745460166e-05, "loss": 0.0671, "step": 1284 }, { "epoch": 0.16524497406661237, "grad_norm": 0.228515625, "learning_rate": 9.834864243135228e-05, "loss": 0.0641, "step": 1285 }, { "epoch": 0.16537356937717004, "grad_norm": 0.2255859375, "learning_rate": 9.834606543263394e-05, "loss": 0.0672, "step": 1286 }, { "epoch": 0.16550216468772772, "grad_norm": 0.22265625, "learning_rate": 9.834348645855187e-05, "loss": 0.0753, "step": 1287 }, { "epoch": 0.1656307599982854, "grad_norm": 0.24609375, "learning_rate": 9.834090550921149e-05, "loss": 0.0696, "step": 1288 }, { "epoch": 0.16575935530884306, "grad_norm": 0.240234375, "learning_rate": 9.833832258471821e-05, "loss": 0.0636, "step": 1289 }, { "epoch": 0.16588795061940076, "grad_norm": 0.22265625, "learning_rate": 9.83357376851776e-05, "loss": 0.0635, "step": 1290 }, { "epoch": 0.16601654592995843, "grad_norm": 0.2255859375, "learning_rate": 9.833315081069527e-05, "loss": 0.0743, "step": 1291 }, { "epoch": 0.1661451412405161, "grad_norm": 0.228515625, "learning_rate": 9.833056196137689e-05, "loss": 0.0708, "step": 1292 }, { "epoch": 0.16627373655107378, "grad_norm": 0.240234375, "learning_rate": 9.832797113732826e-05, "loss": 0.0797, "step": 1293 }, { "epoch": 0.16640233186163145, "grad_norm": 0.216796875, "learning_rate": 9.832537833865525e-05, "loss": 0.0641, "step": 1294 }, { "epoch": 0.16653092717218912, "grad_norm": 0.2080078125, "learning_rate": 9.832278356546379e-05, "loss": 0.0594, "step": 1295 }, { "epoch": 0.1666595224827468, "grad_norm": 0.2109375, "learning_rate": 9.832018681785987e-05, "loss": 0.0646, "step": 1296 }, { "epoch": 0.16678811779330446, "grad_norm": 0.228515625, "learning_rate": 9.831758809594962e-05, "loss": 0.0766, "step": 1297 }, { "epoch": 0.16691671310386214, "grad_norm": 0.25, "learning_rate": 9.831498739983921e-05, "loss": 0.0817, "step": 1298 }, { "epoch": 0.1670453084144198, "grad_norm": 0.21484375, "learning_rate": 9.831238472963493e-05, "loss": 0.0618, "step": 1299 }, { "epoch": 0.1671739037249775, "grad_norm": 0.2353515625, "learning_rate": 9.830978008544308e-05, "loss": 0.0572, "step": 1300 }, { "epoch": 0.16730249903553518, "grad_norm": 0.2353515625, "learning_rate": 9.830717346737011e-05, "loss": 0.0654, "step": 1301 }, { "epoch": 0.16743109434609285, "grad_norm": 0.22265625, "learning_rate": 9.83045648755225e-05, "loss": 0.0683, "step": 1302 }, { "epoch": 0.16755968965665052, "grad_norm": 0.1982421875, "learning_rate": 9.830195431000686e-05, "loss": 0.0487, "step": 1303 }, { "epoch": 0.1676882849672082, "grad_norm": 0.2216796875, "learning_rate": 9.829934177092984e-05, "loss": 0.0666, "step": 1304 }, { "epoch": 0.16781688027776587, "grad_norm": 0.2158203125, "learning_rate": 9.829672725839818e-05, "loss": 0.0604, "step": 1305 }, { "epoch": 0.16794547558832354, "grad_norm": 0.208984375, "learning_rate": 9.829411077251871e-05, "loss": 0.0606, "step": 1306 }, { "epoch": 0.1680740708988812, "grad_norm": 0.2001953125, "learning_rate": 9.829149231339835e-05, "loss": 0.0538, "step": 1307 }, { "epoch": 0.16820266620943888, "grad_norm": 0.216796875, "learning_rate": 9.828887188114408e-05, "loss": 0.0615, "step": 1308 }, { "epoch": 0.16833126151999658, "grad_norm": 0.2353515625, "learning_rate": 9.828624947586296e-05, "loss": 0.0643, "step": 1309 }, { "epoch": 0.16845985683055426, "grad_norm": 0.26953125, "learning_rate": 9.828362509766213e-05, "loss": 0.0644, "step": 1310 }, { "epoch": 0.16858845214111193, "grad_norm": 0.25, "learning_rate": 9.828099874664886e-05, "loss": 0.0633, "step": 1311 }, { "epoch": 0.1687170474516696, "grad_norm": 0.2109375, "learning_rate": 9.827837042293042e-05, "loss": 0.0638, "step": 1312 }, { "epoch": 0.16884564276222727, "grad_norm": 0.2158203125, "learning_rate": 9.827574012661421e-05, "loss": 0.0528, "step": 1313 }, { "epoch": 0.16897423807278494, "grad_norm": 0.25390625, "learning_rate": 9.827310785780771e-05, "loss": 0.0665, "step": 1314 }, { "epoch": 0.16910283338334262, "grad_norm": 0.22265625, "learning_rate": 9.827047361661845e-05, "loss": 0.0655, "step": 1315 }, { "epoch": 0.1692314286939003, "grad_norm": 0.251953125, "learning_rate": 9.826783740315409e-05, "loss": 0.0802, "step": 1316 }, { "epoch": 0.16936002400445796, "grad_norm": 0.21875, "learning_rate": 9.826519921752233e-05, "loss": 0.0672, "step": 1317 }, { "epoch": 0.16948861931501566, "grad_norm": 0.279296875, "learning_rate": 9.826255905983097e-05, "loss": 0.065, "step": 1318 }, { "epoch": 0.16961721462557333, "grad_norm": 0.228515625, "learning_rate": 9.825991693018786e-05, "loss": 0.0678, "step": 1319 }, { "epoch": 0.169745809936131, "grad_norm": 0.2177734375, "learning_rate": 9.825727282870097e-05, "loss": 0.0625, "step": 1320 }, { "epoch": 0.16987440524668868, "grad_norm": 0.224609375, "learning_rate": 9.825462675547835e-05, "loss": 0.0638, "step": 1321 }, { "epoch": 0.17000300055724635, "grad_norm": 0.26171875, "learning_rate": 9.825197871062809e-05, "loss": 0.0781, "step": 1322 }, { "epoch": 0.17013159586780402, "grad_norm": 0.255859375, "learning_rate": 9.82493286942584e-05, "loss": 0.0745, "step": 1323 }, { "epoch": 0.1702601911783617, "grad_norm": 0.255859375, "learning_rate": 9.824667670647756e-05, "loss": 0.0741, "step": 1324 }, { "epoch": 0.17038878648891936, "grad_norm": 0.232421875, "learning_rate": 9.824402274739391e-05, "loss": 0.0694, "step": 1325 }, { "epoch": 0.17051738179947704, "grad_norm": 0.2314453125, "learning_rate": 9.82413668171159e-05, "loss": 0.062, "step": 1326 }, { "epoch": 0.1706459771100347, "grad_norm": 0.2255859375, "learning_rate": 9.823870891575206e-05, "loss": 0.0695, "step": 1327 }, { "epoch": 0.1707745724205924, "grad_norm": 0.2109375, "learning_rate": 9.823604904341096e-05, "loss": 0.062, "step": 1328 }, { "epoch": 0.17090316773115008, "grad_norm": 0.19921875, "learning_rate": 9.82333872002013e-05, "loss": 0.0606, "step": 1329 }, { "epoch": 0.17103176304170775, "grad_norm": 0.228515625, "learning_rate": 9.823072338623184e-05, "loss": 0.0624, "step": 1330 }, { "epoch": 0.17116035835226542, "grad_norm": 0.2333984375, "learning_rate": 9.82280576016114e-05, "loss": 0.0684, "step": 1331 }, { "epoch": 0.1712889536628231, "grad_norm": 0.263671875, "learning_rate": 9.822538984644893e-05, "loss": 0.0674, "step": 1332 }, { "epoch": 0.17141754897338077, "grad_norm": 0.2421875, "learning_rate": 9.82227201208534e-05, "loss": 0.0688, "step": 1333 }, { "epoch": 0.17154614428393844, "grad_norm": 0.2412109375, "learning_rate": 9.822004842493394e-05, "loss": 0.0653, "step": 1334 }, { "epoch": 0.1716747395944961, "grad_norm": 0.21875, "learning_rate": 9.821737475879965e-05, "loss": 0.0678, "step": 1335 }, { "epoch": 0.17180333490505378, "grad_norm": 0.2158203125, "learning_rate": 9.821469912255982e-05, "loss": 0.0533, "step": 1336 }, { "epoch": 0.17193193021561148, "grad_norm": 0.234375, "learning_rate": 9.821202151632376e-05, "loss": 0.0657, "step": 1337 }, { "epoch": 0.17206052552616916, "grad_norm": 0.2138671875, "learning_rate": 9.820934194020087e-05, "loss": 0.0579, "step": 1338 }, { "epoch": 0.17218912083672683, "grad_norm": 0.2490234375, "learning_rate": 9.820666039430062e-05, "loss": 0.0767, "step": 1339 }, { "epoch": 0.1723177161472845, "grad_norm": 0.244140625, "learning_rate": 9.820397687873259e-05, "loss": 0.0787, "step": 1340 }, { "epoch": 0.17244631145784217, "grad_norm": 0.203125, "learning_rate": 9.820129139360646e-05, "loss": 0.0504, "step": 1341 }, { "epoch": 0.17257490676839984, "grad_norm": 0.236328125, "learning_rate": 9.819860393903188e-05, "loss": 0.0704, "step": 1342 }, { "epoch": 0.17270350207895752, "grad_norm": 0.22265625, "learning_rate": 9.819591451511872e-05, "loss": 0.0603, "step": 1343 }, { "epoch": 0.1728320973895152, "grad_norm": 0.2197265625, "learning_rate": 9.819322312197684e-05, "loss": 0.0676, "step": 1344 }, { "epoch": 0.17296069270007286, "grad_norm": 0.232421875, "learning_rate": 9.81905297597162e-05, "loss": 0.061, "step": 1345 }, { "epoch": 0.17308928801063053, "grad_norm": 0.19921875, "learning_rate": 9.818783442844688e-05, "loss": 0.0488, "step": 1346 }, { "epoch": 0.17321788332118823, "grad_norm": 0.2216796875, "learning_rate": 9.8185137128279e-05, "loss": 0.0564, "step": 1347 }, { "epoch": 0.1733464786317459, "grad_norm": 0.220703125, "learning_rate": 9.818243785932272e-05, "loss": 0.0602, "step": 1348 }, { "epoch": 0.17347507394230358, "grad_norm": 0.2451171875, "learning_rate": 9.817973662168838e-05, "loss": 0.0785, "step": 1349 }, { "epoch": 0.17360366925286125, "grad_norm": 0.2060546875, "learning_rate": 9.817703341548634e-05, "loss": 0.0471, "step": 1350 }, { "epoch": 0.17373226456341892, "grad_norm": 0.2041015625, "learning_rate": 9.817432824082704e-05, "loss": 0.0627, "step": 1351 }, { "epoch": 0.1738608598739766, "grad_norm": 0.2177734375, "learning_rate": 9.817162109782101e-05, "loss": 0.0598, "step": 1352 }, { "epoch": 0.17398945518453426, "grad_norm": 0.2236328125, "learning_rate": 9.816891198657887e-05, "loss": 0.0605, "step": 1353 }, { "epoch": 0.17411805049509194, "grad_norm": 0.23046875, "learning_rate": 9.816620090721129e-05, "loss": 0.062, "step": 1354 }, { "epoch": 0.1742466458056496, "grad_norm": 0.2177734375, "learning_rate": 9.816348785982907e-05, "loss": 0.0721, "step": 1355 }, { "epoch": 0.1743752411162073, "grad_norm": 0.2158203125, "learning_rate": 9.816077284454306e-05, "loss": 0.0658, "step": 1356 }, { "epoch": 0.17450383642676498, "grad_norm": 0.193359375, "learning_rate": 9.815805586146418e-05, "loss": 0.0577, "step": 1357 }, { "epoch": 0.17463243173732265, "grad_norm": 0.2001953125, "learning_rate": 9.815533691070345e-05, "loss": 0.0634, "step": 1358 }, { "epoch": 0.17476102704788032, "grad_norm": 0.2392578125, "learning_rate": 9.815261599237193e-05, "loss": 0.0689, "step": 1359 }, { "epoch": 0.174889622358438, "grad_norm": 0.2109375, "learning_rate": 9.814989310658085e-05, "loss": 0.0669, "step": 1360 }, { "epoch": 0.17501821766899567, "grad_norm": 0.2265625, "learning_rate": 9.81471682534414e-05, "loss": 0.0649, "step": 1361 }, { "epoch": 0.17514681297955334, "grad_norm": 0.22265625, "learning_rate": 9.814444143306499e-05, "loss": 0.0625, "step": 1362 }, { "epoch": 0.175275408290111, "grad_norm": 0.2392578125, "learning_rate": 9.814171264556297e-05, "loss": 0.0613, "step": 1363 }, { "epoch": 0.17540400360066868, "grad_norm": 0.1982421875, "learning_rate": 9.813898189104688e-05, "loss": 0.0594, "step": 1364 }, { "epoch": 0.17553259891122638, "grad_norm": 0.23046875, "learning_rate": 9.813624916962827e-05, "loss": 0.0664, "step": 1365 }, { "epoch": 0.17566119422178406, "grad_norm": 0.2041015625, "learning_rate": 9.813351448141879e-05, "loss": 0.0607, "step": 1366 }, { "epoch": 0.17578978953234173, "grad_norm": 0.228515625, "learning_rate": 9.813077782653019e-05, "loss": 0.0678, "step": 1367 }, { "epoch": 0.1759183848428994, "grad_norm": 0.2236328125, "learning_rate": 9.812803920507428e-05, "loss": 0.0593, "step": 1368 }, { "epoch": 0.17604698015345707, "grad_norm": 0.263671875, "learning_rate": 9.812529861716297e-05, "loss": 0.0702, "step": 1369 }, { "epoch": 0.17617557546401474, "grad_norm": 0.21875, "learning_rate": 9.812255606290822e-05, "loss": 0.0547, "step": 1370 }, { "epoch": 0.17630417077457242, "grad_norm": 0.2294921875, "learning_rate": 9.811981154242209e-05, "loss": 0.0638, "step": 1371 }, { "epoch": 0.1764327660851301, "grad_norm": 0.2392578125, "learning_rate": 9.811706505581674e-05, "loss": 0.0585, "step": 1372 }, { "epoch": 0.17656136139568776, "grad_norm": 0.2177734375, "learning_rate": 9.811431660320436e-05, "loss": 0.0526, "step": 1373 }, { "epoch": 0.17668995670624543, "grad_norm": 0.22265625, "learning_rate": 9.811156618469725e-05, "loss": 0.0625, "step": 1374 }, { "epoch": 0.17681855201680313, "grad_norm": 0.2060546875, "learning_rate": 9.810881380040782e-05, "loss": 0.0651, "step": 1375 }, { "epoch": 0.1769471473273608, "grad_norm": 0.2177734375, "learning_rate": 9.810605945044851e-05, "loss": 0.0674, "step": 1376 }, { "epoch": 0.17707574263791848, "grad_norm": 0.25390625, "learning_rate": 9.810330313493183e-05, "loss": 0.074, "step": 1377 }, { "epoch": 0.17720433794847615, "grad_norm": 0.2470703125, "learning_rate": 9.810054485397044e-05, "loss": 0.0649, "step": 1378 }, { "epoch": 0.17733293325903382, "grad_norm": 0.2451171875, "learning_rate": 9.809778460767704e-05, "loss": 0.0789, "step": 1379 }, { "epoch": 0.1774615285695915, "grad_norm": 0.271484375, "learning_rate": 9.809502239616439e-05, "loss": 0.0747, "step": 1380 }, { "epoch": 0.17759012388014916, "grad_norm": 0.2177734375, "learning_rate": 9.809225821954535e-05, "loss": 0.0657, "step": 1381 }, { "epoch": 0.17771871919070684, "grad_norm": 0.2412109375, "learning_rate": 9.808949207793287e-05, "loss": 0.0678, "step": 1382 }, { "epoch": 0.1778473145012645, "grad_norm": 0.28125, "learning_rate": 9.808672397143999e-05, "loss": 0.0784, "step": 1383 }, { "epoch": 0.1779759098118222, "grad_norm": 0.2333984375, "learning_rate": 9.808395390017977e-05, "loss": 0.0656, "step": 1384 }, { "epoch": 0.17810450512237988, "grad_norm": 0.2119140625, "learning_rate": 9.808118186426543e-05, "loss": 0.0713, "step": 1385 }, { "epoch": 0.17823310043293755, "grad_norm": 0.216796875, "learning_rate": 9.807840786381022e-05, "loss": 0.0623, "step": 1386 }, { "epoch": 0.17836169574349522, "grad_norm": 0.26953125, "learning_rate": 9.807563189892746e-05, "loss": 0.075, "step": 1387 }, { "epoch": 0.1784902910540529, "grad_norm": 0.224609375, "learning_rate": 9.80728539697306e-05, "loss": 0.0599, "step": 1388 }, { "epoch": 0.17861888636461057, "grad_norm": 0.22265625, "learning_rate": 9.807007407633314e-05, "loss": 0.0627, "step": 1389 }, { "epoch": 0.17874748167516824, "grad_norm": 0.2265625, "learning_rate": 9.806729221884865e-05, "loss": 0.059, "step": 1390 }, { "epoch": 0.1788760769857259, "grad_norm": 0.228515625, "learning_rate": 9.806450839739081e-05, "loss": 0.0654, "step": 1391 }, { "epoch": 0.17900467229628358, "grad_norm": 0.2265625, "learning_rate": 9.806172261207335e-05, "loss": 0.0741, "step": 1392 }, { "epoch": 0.17913326760684128, "grad_norm": 0.2216796875, "learning_rate": 9.80589348630101e-05, "loss": 0.0555, "step": 1393 }, { "epoch": 0.17926186291739896, "grad_norm": 0.2021484375, "learning_rate": 9.805614515031496e-05, "loss": 0.0573, "step": 1394 }, { "epoch": 0.17939045822795663, "grad_norm": 0.228515625, "learning_rate": 9.805335347410192e-05, "loss": 0.0616, "step": 1395 }, { "epoch": 0.1795190535385143, "grad_norm": 0.392578125, "learning_rate": 9.805055983448503e-05, "loss": 0.0676, "step": 1396 }, { "epoch": 0.17964764884907197, "grad_norm": 0.220703125, "learning_rate": 9.804776423157847e-05, "loss": 0.0695, "step": 1397 }, { "epoch": 0.17977624415962964, "grad_norm": 0.2578125, "learning_rate": 9.804496666549643e-05, "loss": 0.0673, "step": 1398 }, { "epoch": 0.17990483947018732, "grad_norm": 0.205078125, "learning_rate": 9.804216713635321e-05, "loss": 0.0598, "step": 1399 }, { "epoch": 0.180033434780745, "grad_norm": 0.220703125, "learning_rate": 9.803936564426324e-05, "loss": 0.0623, "step": 1400 }, { "epoch": 0.18016203009130266, "grad_norm": 0.208984375, "learning_rate": 9.803656218934095e-05, "loss": 0.0619, "step": 1401 }, { "epoch": 0.18029062540186033, "grad_norm": 0.228515625, "learning_rate": 9.803375677170089e-05, "loss": 0.0553, "step": 1402 }, { "epoch": 0.18041922071241803, "grad_norm": 0.22265625, "learning_rate": 9.80309493914577e-05, "loss": 0.0621, "step": 1403 }, { "epoch": 0.1805478160229757, "grad_norm": 0.232421875, "learning_rate": 9.802814004872607e-05, "loss": 0.064, "step": 1404 }, { "epoch": 0.18067641133353338, "grad_norm": 0.255859375, "learning_rate": 9.802532874362078e-05, "loss": 0.0712, "step": 1405 }, { "epoch": 0.18080500664409105, "grad_norm": 0.2333984375, "learning_rate": 9.802251547625672e-05, "loss": 0.0659, "step": 1406 }, { "epoch": 0.18093360195464872, "grad_norm": 0.23828125, "learning_rate": 9.801970024674884e-05, "loss": 0.0632, "step": 1407 }, { "epoch": 0.1810621972652064, "grad_norm": 0.2197265625, "learning_rate": 9.801688305521213e-05, "loss": 0.0616, "step": 1408 }, { "epoch": 0.18119079257576406, "grad_norm": 0.2109375, "learning_rate": 9.801406390176173e-05, "loss": 0.0598, "step": 1409 }, { "epoch": 0.18131938788632174, "grad_norm": 0.2138671875, "learning_rate": 9.801124278651284e-05, "loss": 0.0654, "step": 1410 }, { "epoch": 0.1814479831968794, "grad_norm": 0.265625, "learning_rate": 9.800841970958068e-05, "loss": 0.0716, "step": 1411 }, { "epoch": 0.1815765785074371, "grad_norm": 0.2373046875, "learning_rate": 9.800559467108063e-05, "loss": 0.068, "step": 1412 }, { "epoch": 0.18170517381799478, "grad_norm": 0.224609375, "learning_rate": 9.800276767112811e-05, "loss": 0.0735, "step": 1413 }, { "epoch": 0.18183376912855245, "grad_norm": 0.1884765625, "learning_rate": 9.799993870983863e-05, "loss": 0.0507, "step": 1414 }, { "epoch": 0.18196236443911012, "grad_norm": 0.20703125, "learning_rate": 9.799710778732777e-05, "loss": 0.0565, "step": 1415 }, { "epoch": 0.1820909597496678, "grad_norm": 0.2119140625, "learning_rate": 9.79942749037112e-05, "loss": 0.0634, "step": 1416 }, { "epoch": 0.18221955506022547, "grad_norm": 0.2255859375, "learning_rate": 9.79914400591047e-05, "loss": 0.0611, "step": 1417 }, { "epoch": 0.18234815037078314, "grad_norm": 0.22265625, "learning_rate": 9.798860325362404e-05, "loss": 0.0709, "step": 1418 }, { "epoch": 0.1824767456813408, "grad_norm": 0.2294921875, "learning_rate": 9.798576448738518e-05, "loss": 0.0717, "step": 1419 }, { "epoch": 0.18260534099189848, "grad_norm": 0.234375, "learning_rate": 9.798292376050408e-05, "loss": 0.0733, "step": 1420 }, { "epoch": 0.18273393630245616, "grad_norm": 0.228515625, "learning_rate": 9.798008107309682e-05, "loss": 0.0697, "step": 1421 }, { "epoch": 0.18286253161301386, "grad_norm": 0.2177734375, "learning_rate": 9.797723642527956e-05, "loss": 0.0552, "step": 1422 }, { "epoch": 0.18299112692357153, "grad_norm": 0.205078125, "learning_rate": 9.797438981716852e-05, "loss": 0.064, "step": 1423 }, { "epoch": 0.1831197222341292, "grad_norm": 0.224609375, "learning_rate": 9.797154124887997e-05, "loss": 0.0712, "step": 1424 }, { "epoch": 0.18324831754468687, "grad_norm": 0.201171875, "learning_rate": 9.796869072053037e-05, "loss": 0.0625, "step": 1425 }, { "epoch": 0.18337691285524454, "grad_norm": 0.1923828125, "learning_rate": 9.796583823223613e-05, "loss": 0.0561, "step": 1426 }, { "epoch": 0.18350550816580222, "grad_norm": 0.1845703125, "learning_rate": 9.796298378411383e-05, "loss": 0.0489, "step": 1427 }, { "epoch": 0.1836341034763599, "grad_norm": 0.2294921875, "learning_rate": 9.796012737628009e-05, "loss": 0.0643, "step": 1428 }, { "epoch": 0.18376269878691756, "grad_norm": 0.1982421875, "learning_rate": 9.795726900885161e-05, "loss": 0.0574, "step": 1429 }, { "epoch": 0.18389129409747523, "grad_norm": 0.2353515625, "learning_rate": 9.795440868194521e-05, "loss": 0.0629, "step": 1430 }, { "epoch": 0.18401988940803293, "grad_norm": 0.1982421875, "learning_rate": 9.795154639567774e-05, "loss": 0.0666, "step": 1431 }, { "epoch": 0.1841484847185906, "grad_norm": 0.24609375, "learning_rate": 9.794868215016614e-05, "loss": 0.0676, "step": 1432 }, { "epoch": 0.18427708002914828, "grad_norm": 0.2001953125, "learning_rate": 9.794581594552745e-05, "loss": 0.0611, "step": 1433 }, { "epoch": 0.18440567533970595, "grad_norm": 0.212890625, "learning_rate": 9.794294778187878e-05, "loss": 0.0588, "step": 1434 }, { "epoch": 0.18453427065026362, "grad_norm": 0.2216796875, "learning_rate": 9.794007765933732e-05, "loss": 0.0607, "step": 1435 }, { "epoch": 0.1846628659608213, "grad_norm": 0.248046875, "learning_rate": 9.793720557802035e-05, "loss": 0.0723, "step": 1436 }, { "epoch": 0.18479146127137897, "grad_norm": 0.240234375, "learning_rate": 9.793433153804521e-05, "loss": 0.0558, "step": 1437 }, { "epoch": 0.18492005658193664, "grad_norm": 0.228515625, "learning_rate": 9.793145553952931e-05, "loss": 0.0644, "step": 1438 }, { "epoch": 0.1850486518924943, "grad_norm": 0.2431640625, "learning_rate": 9.79285775825902e-05, "loss": 0.0613, "step": 1439 }, { "epoch": 0.185177247203052, "grad_norm": 0.244140625, "learning_rate": 9.792569766734543e-05, "loss": 0.0645, "step": 1440 }, { "epoch": 0.18530584251360968, "grad_norm": 0.2197265625, "learning_rate": 9.79228157939127e-05, "loss": 0.0681, "step": 1441 }, { "epoch": 0.18543443782416735, "grad_norm": 0.228515625, "learning_rate": 9.791993196240975e-05, "loss": 0.0675, "step": 1442 }, { "epoch": 0.18556303313472503, "grad_norm": 0.203125, "learning_rate": 9.791704617295441e-05, "loss": 0.0523, "step": 1443 }, { "epoch": 0.1856916284452827, "grad_norm": 0.2041015625, "learning_rate": 9.791415842566458e-05, "loss": 0.0486, "step": 1444 }, { "epoch": 0.18582022375584037, "grad_norm": 0.26171875, "learning_rate": 9.791126872065825e-05, "loss": 0.0785, "step": 1445 }, { "epoch": 0.18594881906639804, "grad_norm": 0.2041015625, "learning_rate": 9.790837705805352e-05, "loss": 0.0594, "step": 1446 }, { "epoch": 0.1860774143769557, "grad_norm": 0.212890625, "learning_rate": 9.790548343796851e-05, "loss": 0.0678, "step": 1447 }, { "epoch": 0.18620600968751339, "grad_norm": 0.2177734375, "learning_rate": 9.790258786052146e-05, "loss": 0.0581, "step": 1448 }, { "epoch": 0.18633460499807106, "grad_norm": 0.23046875, "learning_rate": 9.789969032583068e-05, "loss": 0.0667, "step": 1449 }, { "epoch": 0.18646320030862876, "grad_norm": 0.216796875, "learning_rate": 9.789679083401457e-05, "loss": 0.0709, "step": 1450 }, { "epoch": 0.18659179561918643, "grad_norm": 0.22265625, "learning_rate": 9.789388938519157e-05, "loss": 0.0637, "step": 1451 }, { "epoch": 0.1867203909297441, "grad_norm": 0.2001953125, "learning_rate": 9.789098597948026e-05, "loss": 0.0568, "step": 1452 }, { "epoch": 0.18684898624030177, "grad_norm": 0.2412109375, "learning_rate": 9.788808061699925e-05, "loss": 0.0569, "step": 1453 }, { "epoch": 0.18697758155085945, "grad_norm": 0.22265625, "learning_rate": 9.788517329786726e-05, "loss": 0.0622, "step": 1454 }, { "epoch": 0.18710617686141712, "grad_norm": 0.2265625, "learning_rate": 9.788226402220308e-05, "loss": 0.0665, "step": 1455 }, { "epoch": 0.1872347721719748, "grad_norm": 0.2353515625, "learning_rate": 9.787935279012557e-05, "loss": 0.0801, "step": 1456 }, { "epoch": 0.18736336748253246, "grad_norm": 0.203125, "learning_rate": 9.787643960175369e-05, "loss": 0.0621, "step": 1457 }, { "epoch": 0.18749196279309013, "grad_norm": 0.20703125, "learning_rate": 9.787352445720647e-05, "loss": 0.0642, "step": 1458 }, { "epoch": 0.18762055810364783, "grad_norm": 0.220703125, "learning_rate": 9.787060735660301e-05, "loss": 0.0674, "step": 1459 }, { "epoch": 0.1877491534142055, "grad_norm": 0.2109375, "learning_rate": 9.78676883000625e-05, "loss": 0.0535, "step": 1460 }, { "epoch": 0.18787774872476318, "grad_norm": 0.2197265625, "learning_rate": 9.786476728770422e-05, "loss": 0.0637, "step": 1461 }, { "epoch": 0.18800634403532085, "grad_norm": 0.208984375, "learning_rate": 9.786184431964751e-05, "loss": 0.0617, "step": 1462 }, { "epoch": 0.18813493934587852, "grad_norm": 0.2236328125, "learning_rate": 9.785891939601182e-05, "loss": 0.0645, "step": 1463 }, { "epoch": 0.1882635346564362, "grad_norm": 0.216796875, "learning_rate": 9.785599251691661e-05, "loss": 0.0619, "step": 1464 }, { "epoch": 0.18839212996699387, "grad_norm": 0.24609375, "learning_rate": 9.785306368248153e-05, "loss": 0.065, "step": 1465 }, { "epoch": 0.18852072527755154, "grad_norm": 0.205078125, "learning_rate": 9.785013289282621e-05, "loss": 0.0532, "step": 1466 }, { "epoch": 0.1886493205881092, "grad_norm": 0.2236328125, "learning_rate": 9.78472001480704e-05, "loss": 0.0644, "step": 1467 }, { "epoch": 0.18877791589866688, "grad_norm": 0.2138671875, "learning_rate": 9.784426544833393e-05, "loss": 0.0532, "step": 1468 }, { "epoch": 0.18890651120922458, "grad_norm": 0.2138671875, "learning_rate": 9.784132879373673e-05, "loss": 0.0606, "step": 1469 }, { "epoch": 0.18903510651978225, "grad_norm": 0.2333984375, "learning_rate": 9.783839018439877e-05, "loss": 0.0686, "step": 1470 }, { "epoch": 0.18916370183033993, "grad_norm": 0.2021484375, "learning_rate": 9.783544962044011e-05, "loss": 0.0606, "step": 1471 }, { "epoch": 0.1892922971408976, "grad_norm": 0.2265625, "learning_rate": 9.783250710198091e-05, "loss": 0.0623, "step": 1472 }, { "epoch": 0.18942089245145527, "grad_norm": 0.224609375, "learning_rate": 9.78295626291414e-05, "loss": 0.0676, "step": 1473 }, { "epoch": 0.18954948776201294, "grad_norm": 0.203125, "learning_rate": 9.78266162020419e-05, "loss": 0.0543, "step": 1474 }, { "epoch": 0.1896780830725706, "grad_norm": 0.2080078125, "learning_rate": 9.782366782080277e-05, "loss": 0.0574, "step": 1475 }, { "epoch": 0.18980667838312829, "grad_norm": 0.2431640625, "learning_rate": 9.782071748554448e-05, "loss": 0.0752, "step": 1476 }, { "epoch": 0.18993527369368596, "grad_norm": 0.2294921875, "learning_rate": 9.781776519638759e-05, "loss": 0.0673, "step": 1477 }, { "epoch": 0.19006386900424366, "grad_norm": 0.2041015625, "learning_rate": 9.781481095345273e-05, "loss": 0.0561, "step": 1478 }, { "epoch": 0.19019246431480133, "grad_norm": 0.23046875, "learning_rate": 9.781185475686059e-05, "loss": 0.0632, "step": 1479 }, { "epoch": 0.190321059625359, "grad_norm": 0.259765625, "learning_rate": 9.780889660673198e-05, "loss": 0.072, "step": 1480 }, { "epoch": 0.19044965493591667, "grad_norm": 0.216796875, "learning_rate": 9.780593650318775e-05, "loss": 0.0633, "step": 1481 }, { "epoch": 0.19057825024647435, "grad_norm": 0.2119140625, "learning_rate": 9.780297444634884e-05, "loss": 0.0657, "step": 1482 }, { "epoch": 0.19070684555703202, "grad_norm": 0.2275390625, "learning_rate": 9.78000104363363e-05, "loss": 0.0666, "step": 1483 }, { "epoch": 0.1908354408675897, "grad_norm": 0.2578125, "learning_rate": 9.779704447327123e-05, "loss": 0.0654, "step": 1484 }, { "epoch": 0.19096403617814736, "grad_norm": 0.2294921875, "learning_rate": 9.779407655727478e-05, "loss": 0.072, "step": 1485 }, { "epoch": 0.19109263148870503, "grad_norm": 0.25390625, "learning_rate": 9.779110668846827e-05, "loss": 0.0695, "step": 1486 }, { "epoch": 0.19122122679926273, "grad_norm": 0.25390625, "learning_rate": 9.7788134866973e-05, "loss": 0.0626, "step": 1487 }, { "epoch": 0.1913498221098204, "grad_norm": 0.25, "learning_rate": 9.778516109291043e-05, "loss": 0.0719, "step": 1488 }, { "epoch": 0.19147841742037808, "grad_norm": 0.236328125, "learning_rate": 9.778218536640203e-05, "loss": 0.0627, "step": 1489 }, { "epoch": 0.19160701273093575, "grad_norm": 0.2314453125, "learning_rate": 9.777920768756943e-05, "loss": 0.0638, "step": 1490 }, { "epoch": 0.19173560804149342, "grad_norm": 0.228515625, "learning_rate": 9.777622805653425e-05, "loss": 0.0588, "step": 1491 }, { "epoch": 0.1918642033520511, "grad_norm": 0.28125, "learning_rate": 9.777324647341825e-05, "loss": 0.0831, "step": 1492 }, { "epoch": 0.19199279866260877, "grad_norm": 0.2275390625, "learning_rate": 9.777026293834327e-05, "loss": 0.0722, "step": 1493 }, { "epoch": 0.19212139397316644, "grad_norm": 0.25, "learning_rate": 9.776727745143119e-05, "loss": 0.0674, "step": 1494 }, { "epoch": 0.1922499892837241, "grad_norm": 0.23046875, "learning_rate": 9.776429001280401e-05, "loss": 0.0542, "step": 1495 }, { "epoch": 0.19237858459428178, "grad_norm": 0.2294921875, "learning_rate": 9.776130062258377e-05, "loss": 0.0654, "step": 1496 }, { "epoch": 0.19250717990483948, "grad_norm": 0.19921875, "learning_rate": 9.775830928089265e-05, "loss": 0.0552, "step": 1497 }, { "epoch": 0.19263577521539715, "grad_norm": 0.23046875, "learning_rate": 9.775531598785284e-05, "loss": 0.0676, "step": 1498 }, { "epoch": 0.19276437052595483, "grad_norm": 0.2333984375, "learning_rate": 9.775232074358666e-05, "loss": 0.0652, "step": 1499 }, { "epoch": 0.1928929658365125, "grad_norm": 0.1962890625, "learning_rate": 9.77493235482165e-05, "loss": 0.0493, "step": 1500 }, { "epoch": 0.1928929658365125, "eval_loss": 0.06209211051464081, "eval_runtime": 1046.0294, "eval_samples_per_second": 93.904, "eval_steps_per_second": 1.174, "step": 1500 }, { "epoch": 0.19302156114707017, "grad_norm": 0.2255859375, "learning_rate": 9.77463244018648e-05, "loss": 0.0622, "step": 1501 }, { "epoch": 0.19315015645762784, "grad_norm": 0.2197265625, "learning_rate": 9.77433233046541e-05, "loss": 0.0574, "step": 1502 }, { "epoch": 0.1932787517681855, "grad_norm": 0.216796875, "learning_rate": 9.774032025670705e-05, "loss": 0.0593, "step": 1503 }, { "epoch": 0.19340734707874319, "grad_norm": 0.21875, "learning_rate": 9.773731525814632e-05, "loss": 0.0569, "step": 1504 }, { "epoch": 0.19353594238930086, "grad_norm": 0.2060546875, "learning_rate": 9.773430830909471e-05, "loss": 0.0642, "step": 1505 }, { "epoch": 0.19366453769985856, "grad_norm": 0.21484375, "learning_rate": 9.773129940967508e-05, "loss": 0.0525, "step": 1506 }, { "epoch": 0.19379313301041623, "grad_norm": 0.2158203125, "learning_rate": 9.772828856001036e-05, "loss": 0.0492, "step": 1507 }, { "epoch": 0.1939217283209739, "grad_norm": 0.2197265625, "learning_rate": 9.772527576022357e-05, "loss": 0.0552, "step": 1508 }, { "epoch": 0.19405032363153157, "grad_norm": 0.21484375, "learning_rate": 9.772226101043782e-05, "loss": 0.0613, "step": 1509 }, { "epoch": 0.19417891894208925, "grad_norm": 0.24609375, "learning_rate": 9.771924431077627e-05, "loss": 0.062, "step": 1510 }, { "epoch": 0.19430751425264692, "grad_norm": 0.2333984375, "learning_rate": 9.771622566136221e-05, "loss": 0.0642, "step": 1511 }, { "epoch": 0.1944361095632046, "grad_norm": 0.20703125, "learning_rate": 9.771320506231894e-05, "loss": 0.0609, "step": 1512 }, { "epoch": 0.19456470487376226, "grad_norm": 0.234375, "learning_rate": 9.771018251376991e-05, "loss": 0.0734, "step": 1513 }, { "epoch": 0.19469330018431993, "grad_norm": 0.2294921875, "learning_rate": 9.77071580158386e-05, "loss": 0.0636, "step": 1514 }, { "epoch": 0.19482189549487763, "grad_norm": 0.220703125, "learning_rate": 9.77041315686486e-05, "loss": 0.0599, "step": 1515 }, { "epoch": 0.1949504908054353, "grad_norm": 0.2060546875, "learning_rate": 9.770110317232357e-05, "loss": 0.0638, "step": 1516 }, { "epoch": 0.19507908611599298, "grad_norm": 0.2041015625, "learning_rate": 9.769807282698722e-05, "loss": 0.0538, "step": 1517 }, { "epoch": 0.19520768142655065, "grad_norm": 0.208984375, "learning_rate": 9.769504053276339e-05, "loss": 0.0683, "step": 1518 }, { "epoch": 0.19533627673710832, "grad_norm": 0.1953125, "learning_rate": 9.769200628977598e-05, "loss": 0.0563, "step": 1519 }, { "epoch": 0.195464872047666, "grad_norm": 0.22265625, "learning_rate": 9.768897009814895e-05, "loss": 0.063, "step": 1520 }, { "epoch": 0.19559346735822367, "grad_norm": 0.216796875, "learning_rate": 9.768593195800634e-05, "loss": 0.0604, "step": 1521 }, { "epoch": 0.19572206266878134, "grad_norm": 0.189453125, "learning_rate": 9.768289186947232e-05, "loss": 0.0502, "step": 1522 }, { "epoch": 0.195850657979339, "grad_norm": 0.2080078125, "learning_rate": 9.767984983267111e-05, "loss": 0.0614, "step": 1523 }, { "epoch": 0.19597925328989668, "grad_norm": 0.1953125, "learning_rate": 9.767680584772696e-05, "loss": 0.0637, "step": 1524 }, { "epoch": 0.19610784860045438, "grad_norm": 0.193359375, "learning_rate": 9.767375991476429e-05, "loss": 0.0467, "step": 1525 }, { "epoch": 0.19623644391101205, "grad_norm": 0.232421875, "learning_rate": 9.76707120339075e-05, "loss": 0.0841, "step": 1526 }, { "epoch": 0.19636503922156973, "grad_norm": 0.2197265625, "learning_rate": 9.766766220528117e-05, "loss": 0.0662, "step": 1527 }, { "epoch": 0.1964936345321274, "grad_norm": 0.2197265625, "learning_rate": 9.766461042900989e-05, "loss": 0.0686, "step": 1528 }, { "epoch": 0.19662222984268507, "grad_norm": 0.2275390625, "learning_rate": 9.766155670521837e-05, "loss": 0.0697, "step": 1529 }, { "epoch": 0.19675082515324274, "grad_norm": 0.216796875, "learning_rate": 9.765850103403137e-05, "loss": 0.0582, "step": 1530 }, { "epoch": 0.19687942046380041, "grad_norm": 0.2197265625, "learning_rate": 9.765544341557374e-05, "loss": 0.0566, "step": 1531 }, { "epoch": 0.1970080157743581, "grad_norm": 0.1904296875, "learning_rate": 9.76523838499704e-05, "loss": 0.0515, "step": 1532 }, { "epoch": 0.19713661108491576, "grad_norm": 0.2197265625, "learning_rate": 9.764932233734639e-05, "loss": 0.0631, "step": 1533 }, { "epoch": 0.19726520639547346, "grad_norm": 0.232421875, "learning_rate": 9.764625887782676e-05, "loss": 0.066, "step": 1534 }, { "epoch": 0.19739380170603113, "grad_norm": 0.2099609375, "learning_rate": 9.764319347153671e-05, "loss": 0.0629, "step": 1535 }, { "epoch": 0.1975223970165888, "grad_norm": 0.2119140625, "learning_rate": 9.764012611860148e-05, "loss": 0.0587, "step": 1536 }, { "epoch": 0.19765099232714647, "grad_norm": 0.2236328125, "learning_rate": 9.76370568191464e-05, "loss": 0.0552, "step": 1537 }, { "epoch": 0.19777958763770415, "grad_norm": 0.232421875, "learning_rate": 9.763398557329688e-05, "loss": 0.087, "step": 1538 }, { "epoch": 0.19790818294826182, "grad_norm": 0.2265625, "learning_rate": 9.76309123811784e-05, "loss": 0.0663, "step": 1539 }, { "epoch": 0.1980367782588195, "grad_norm": 0.2177734375, "learning_rate": 9.762783724291654e-05, "loss": 0.0698, "step": 1540 }, { "epoch": 0.19816537356937716, "grad_norm": 0.2109375, "learning_rate": 9.762476015863694e-05, "loss": 0.0607, "step": 1541 }, { "epoch": 0.19829396887993483, "grad_norm": 0.2255859375, "learning_rate": 9.762168112846531e-05, "loss": 0.0497, "step": 1542 }, { "epoch": 0.1984225641904925, "grad_norm": 0.205078125, "learning_rate": 9.761860015252746e-05, "loss": 0.0601, "step": 1543 }, { "epoch": 0.1985511595010502, "grad_norm": 0.20703125, "learning_rate": 9.761551723094931e-05, "loss": 0.057, "step": 1544 }, { "epoch": 0.19867975481160788, "grad_norm": 0.2119140625, "learning_rate": 9.761243236385678e-05, "loss": 0.066, "step": 1545 }, { "epoch": 0.19880835012216555, "grad_norm": 0.208984375, "learning_rate": 9.760934555137593e-05, "loss": 0.0658, "step": 1546 }, { "epoch": 0.19893694543272322, "grad_norm": 0.248046875, "learning_rate": 9.76062567936329e-05, "loss": 0.0681, "step": 1547 }, { "epoch": 0.1990655407432809, "grad_norm": 0.2138671875, "learning_rate": 9.760316609075388e-05, "loss": 0.0758, "step": 1548 }, { "epoch": 0.19919413605383857, "grad_norm": 0.2109375, "learning_rate": 9.760007344286515e-05, "loss": 0.0602, "step": 1549 }, { "epoch": 0.19932273136439624, "grad_norm": 0.2138671875, "learning_rate": 9.759697885009307e-05, "loss": 0.0619, "step": 1550 }, { "epoch": 0.1994513266749539, "grad_norm": 0.2255859375, "learning_rate": 9.759388231256407e-05, "loss": 0.0749, "step": 1551 }, { "epoch": 0.19957992198551158, "grad_norm": 0.189453125, "learning_rate": 9.759078383040472e-05, "loss": 0.0491, "step": 1552 }, { "epoch": 0.19970851729606928, "grad_norm": 0.2236328125, "learning_rate": 9.758768340374157e-05, "loss": 0.0758, "step": 1553 }, { "epoch": 0.19983711260662695, "grad_norm": 0.251953125, "learning_rate": 9.758458103270132e-05, "loss": 0.0594, "step": 1554 }, { "epoch": 0.19996570791718463, "grad_norm": 0.1943359375, "learning_rate": 9.758147671741072e-05, "loss": 0.0552, "step": 1555 }, { "epoch": 0.2000943032277423, "grad_norm": 0.1953125, "learning_rate": 9.757837045799662e-05, "loss": 0.0599, "step": 1556 }, { "epoch": 0.20022289853829997, "grad_norm": 0.2060546875, "learning_rate": 9.757526225458593e-05, "loss": 0.0604, "step": 1557 }, { "epoch": 0.20035149384885764, "grad_norm": 0.2421875, "learning_rate": 9.757215210730565e-05, "loss": 0.0862, "step": 1558 }, { "epoch": 0.20048008915941531, "grad_norm": 0.205078125, "learning_rate": 9.756904001628285e-05, "loss": 0.0604, "step": 1559 }, { "epoch": 0.200608684469973, "grad_norm": 0.2158203125, "learning_rate": 9.75659259816447e-05, "loss": 0.0646, "step": 1560 }, { "epoch": 0.20073727978053066, "grad_norm": 0.212890625, "learning_rate": 9.756281000351844e-05, "loss": 0.0665, "step": 1561 }, { "epoch": 0.20086587509108836, "grad_norm": 0.19140625, "learning_rate": 9.755969208203137e-05, "loss": 0.0557, "step": 1562 }, { "epoch": 0.20099447040164603, "grad_norm": 0.208984375, "learning_rate": 9.755657221731087e-05, "loss": 0.0665, "step": 1563 }, { "epoch": 0.2011230657122037, "grad_norm": 0.2197265625, "learning_rate": 9.755345040948445e-05, "loss": 0.0557, "step": 1564 }, { "epoch": 0.20125166102276137, "grad_norm": 0.2197265625, "learning_rate": 9.755032665867965e-05, "loss": 0.0631, "step": 1565 }, { "epoch": 0.20138025633331905, "grad_norm": 0.216796875, "learning_rate": 9.75472009650241e-05, "loss": 0.0606, "step": 1566 }, { "epoch": 0.20150885164387672, "grad_norm": 0.25390625, "learning_rate": 9.75440733286455e-05, "loss": 0.0582, "step": 1567 }, { "epoch": 0.2016374469544344, "grad_norm": 0.2265625, "learning_rate": 9.754094374967166e-05, "loss": 0.0708, "step": 1568 }, { "epoch": 0.20176604226499206, "grad_norm": 0.203125, "learning_rate": 9.753781222823045e-05, "loss": 0.0577, "step": 1569 }, { "epoch": 0.20189463757554973, "grad_norm": 0.2177734375, "learning_rate": 9.75346787644498e-05, "loss": 0.0595, "step": 1570 }, { "epoch": 0.2020232328861074, "grad_norm": 0.2236328125, "learning_rate": 9.753154335845777e-05, "loss": 0.0669, "step": 1571 }, { "epoch": 0.2021518281966651, "grad_norm": 0.1943359375, "learning_rate": 9.752840601038245e-05, "loss": 0.0512, "step": 1572 }, { "epoch": 0.20228042350722278, "grad_norm": 0.212890625, "learning_rate": 9.752526672035202e-05, "loss": 0.0649, "step": 1573 }, { "epoch": 0.20240901881778045, "grad_norm": 0.2265625, "learning_rate": 9.752212548849476e-05, "loss": 0.0681, "step": 1574 }, { "epoch": 0.20253761412833812, "grad_norm": 0.2158203125, "learning_rate": 9.751898231493904e-05, "loss": 0.0622, "step": 1575 }, { "epoch": 0.2026662094388958, "grad_norm": 0.212890625, "learning_rate": 9.751583719981324e-05, "loss": 0.0516, "step": 1576 }, { "epoch": 0.20279480474945347, "grad_norm": 0.2294921875, "learning_rate": 9.75126901432459e-05, "loss": 0.0647, "step": 1577 }, { "epoch": 0.20292340006001114, "grad_norm": 0.212890625, "learning_rate": 9.750954114536558e-05, "loss": 0.0626, "step": 1578 }, { "epoch": 0.2030519953705688, "grad_norm": 0.2314453125, "learning_rate": 9.750639020630097e-05, "loss": 0.0702, "step": 1579 }, { "epoch": 0.20318059068112648, "grad_norm": 0.220703125, "learning_rate": 9.750323732618078e-05, "loss": 0.0574, "step": 1580 }, { "epoch": 0.20330918599168418, "grad_norm": 0.2197265625, "learning_rate": 9.750008250513388e-05, "loss": 0.0627, "step": 1581 }, { "epoch": 0.20343778130224185, "grad_norm": 0.212890625, "learning_rate": 9.749692574328914e-05, "loss": 0.0613, "step": 1582 }, { "epoch": 0.20356637661279953, "grad_norm": 0.1962890625, "learning_rate": 9.749376704077555e-05, "loss": 0.0441, "step": 1583 }, { "epoch": 0.2036949719233572, "grad_norm": 0.212890625, "learning_rate": 9.749060639772216e-05, "loss": 0.0578, "step": 1584 }, { "epoch": 0.20382356723391487, "grad_norm": 0.2197265625, "learning_rate": 9.748744381425813e-05, "loss": 0.0664, "step": 1585 }, { "epoch": 0.20395216254447254, "grad_norm": 0.208984375, "learning_rate": 9.748427929051266e-05, "loss": 0.0537, "step": 1586 }, { "epoch": 0.20408075785503021, "grad_norm": 0.20703125, "learning_rate": 9.748111282661507e-05, "loss": 0.0607, "step": 1587 }, { "epoch": 0.2042093531655879, "grad_norm": 0.244140625, "learning_rate": 9.747794442269471e-05, "loss": 0.0634, "step": 1588 }, { "epoch": 0.20433794847614556, "grad_norm": 0.22265625, "learning_rate": 9.747477407888107e-05, "loss": 0.0768, "step": 1589 }, { "epoch": 0.20446654378670326, "grad_norm": 0.2333984375, "learning_rate": 9.747160179530366e-05, "loss": 0.0654, "step": 1590 }, { "epoch": 0.20459513909726093, "grad_norm": 0.2392578125, "learning_rate": 9.746842757209211e-05, "loss": 0.0631, "step": 1591 }, { "epoch": 0.2047237344078186, "grad_norm": 0.2216796875, "learning_rate": 9.74652514093761e-05, "loss": 0.0503, "step": 1592 }, { "epoch": 0.20485232971837627, "grad_norm": 0.2109375, "learning_rate": 9.746207330728542e-05, "loss": 0.0541, "step": 1593 }, { "epoch": 0.20498092502893395, "grad_norm": 0.2109375, "learning_rate": 9.745889326594992e-05, "loss": 0.0518, "step": 1594 }, { "epoch": 0.20510952033949162, "grad_norm": 0.2021484375, "learning_rate": 9.745571128549954e-05, "loss": 0.0582, "step": 1595 }, { "epoch": 0.2052381156500493, "grad_norm": 0.20703125, "learning_rate": 9.745252736606427e-05, "loss": 0.0546, "step": 1596 }, { "epoch": 0.20536671096060696, "grad_norm": 0.1796875, "learning_rate": 9.744934150777423e-05, "loss": 0.0447, "step": 1597 }, { "epoch": 0.20549530627116463, "grad_norm": 0.2265625, "learning_rate": 9.744615371075956e-05, "loss": 0.0578, "step": 1598 }, { "epoch": 0.2056239015817223, "grad_norm": 0.1806640625, "learning_rate": 9.744296397515052e-05, "loss": 0.0488, "step": 1599 }, { "epoch": 0.20575249689228, "grad_norm": 0.1982421875, "learning_rate": 9.743977230107746e-05, "loss": 0.0593, "step": 1600 }, { "epoch": 0.20588109220283768, "grad_norm": 0.2431640625, "learning_rate": 9.743657868867076e-05, "loss": 0.0597, "step": 1601 }, { "epoch": 0.20600968751339535, "grad_norm": 0.240234375, "learning_rate": 9.743338313806092e-05, "loss": 0.0646, "step": 1602 }, { "epoch": 0.20613828282395302, "grad_norm": 0.212890625, "learning_rate": 9.74301856493785e-05, "loss": 0.0618, "step": 1603 }, { "epoch": 0.2062668781345107, "grad_norm": 0.205078125, "learning_rate": 9.742698622275416e-05, "loss": 0.0668, "step": 1604 }, { "epoch": 0.20639547344506837, "grad_norm": 0.2119140625, "learning_rate": 9.742378485831861e-05, "loss": 0.0505, "step": 1605 }, { "epoch": 0.20652406875562604, "grad_norm": 0.203125, "learning_rate": 9.742058155620266e-05, "loss": 0.0648, "step": 1606 }, { "epoch": 0.2066526640661837, "grad_norm": 0.2265625, "learning_rate": 9.74173763165372e-05, "loss": 0.0622, "step": 1607 }, { "epoch": 0.20678125937674138, "grad_norm": 0.2265625, "learning_rate": 9.741416913945318e-05, "loss": 0.0627, "step": 1608 }, { "epoch": 0.20690985468729908, "grad_norm": 0.248046875, "learning_rate": 9.741096002508164e-05, "loss": 0.0758, "step": 1609 }, { "epoch": 0.20703844999785675, "grad_norm": 0.2138671875, "learning_rate": 9.740774897355372e-05, "loss": 0.0616, "step": 1610 }, { "epoch": 0.20716704530841443, "grad_norm": 0.197265625, "learning_rate": 9.74045359850006e-05, "loss": 0.0598, "step": 1611 }, { "epoch": 0.2072956406189721, "grad_norm": 0.203125, "learning_rate": 9.740132105955356e-05, "loss": 0.0561, "step": 1612 }, { "epoch": 0.20742423592952977, "grad_norm": 0.2216796875, "learning_rate": 9.739810419734398e-05, "loss": 0.0632, "step": 1613 }, { "epoch": 0.20755283124008744, "grad_norm": 0.201171875, "learning_rate": 9.739488539850326e-05, "loss": 0.0601, "step": 1614 }, { "epoch": 0.20768142655064512, "grad_norm": 0.2099609375, "learning_rate": 9.739166466316296e-05, "loss": 0.0606, "step": 1615 }, { "epoch": 0.2078100218612028, "grad_norm": 0.2109375, "learning_rate": 9.738844199145464e-05, "loss": 0.0636, "step": 1616 }, { "epoch": 0.20793861717176046, "grad_norm": 0.2060546875, "learning_rate": 9.738521738350999e-05, "loss": 0.0528, "step": 1617 }, { "epoch": 0.20806721248231813, "grad_norm": 0.2021484375, "learning_rate": 9.738199083946075e-05, "loss": 0.0626, "step": 1618 }, { "epoch": 0.20819580779287583, "grad_norm": 0.236328125, "learning_rate": 9.737876235943879e-05, "loss": 0.0582, "step": 1619 }, { "epoch": 0.2083244031034335, "grad_norm": 0.201171875, "learning_rate": 9.737553194357599e-05, "loss": 0.0597, "step": 1620 }, { "epoch": 0.20845299841399118, "grad_norm": 0.1923828125, "learning_rate": 9.737229959200434e-05, "loss": 0.0562, "step": 1621 }, { "epoch": 0.20858159372454885, "grad_norm": 0.228515625, "learning_rate": 9.73690653048559e-05, "loss": 0.0665, "step": 1622 }, { "epoch": 0.20871018903510652, "grad_norm": 0.203125, "learning_rate": 9.736582908226286e-05, "loss": 0.0591, "step": 1623 }, { "epoch": 0.2088387843456642, "grad_norm": 0.208984375, "learning_rate": 9.736259092435743e-05, "loss": 0.0686, "step": 1624 }, { "epoch": 0.20896737965622186, "grad_norm": 0.2080078125, "learning_rate": 9.735935083127188e-05, "loss": 0.0557, "step": 1625 }, { "epoch": 0.20909597496677954, "grad_norm": 0.1962890625, "learning_rate": 9.735610880313865e-05, "loss": 0.057, "step": 1626 }, { "epoch": 0.2092245702773372, "grad_norm": 0.1953125, "learning_rate": 9.735286484009018e-05, "loss": 0.0499, "step": 1627 }, { "epoch": 0.2093531655878949, "grad_norm": 0.21484375, "learning_rate": 9.734961894225904e-05, "loss": 0.0677, "step": 1628 }, { "epoch": 0.20948176089845258, "grad_norm": 0.2041015625, "learning_rate": 9.73463711097778e-05, "loss": 0.0596, "step": 1629 }, { "epoch": 0.20961035620901025, "grad_norm": 0.201171875, "learning_rate": 9.734312134277921e-05, "loss": 0.0541, "step": 1630 }, { "epoch": 0.20973895151956792, "grad_norm": 0.2060546875, "learning_rate": 9.733986964139604e-05, "loss": 0.0552, "step": 1631 }, { "epoch": 0.2098675468301256, "grad_norm": 0.216796875, "learning_rate": 9.733661600576113e-05, "loss": 0.0586, "step": 1632 }, { "epoch": 0.20999614214068327, "grad_norm": 0.2294921875, "learning_rate": 9.733336043600745e-05, "loss": 0.05, "step": 1633 }, { "epoch": 0.21012473745124094, "grad_norm": 0.21484375, "learning_rate": 9.733010293226802e-05, "loss": 0.0645, "step": 1634 }, { "epoch": 0.2102533327617986, "grad_norm": 0.1767578125, "learning_rate": 9.732684349467592e-05, "loss": 0.0555, "step": 1635 }, { "epoch": 0.21038192807235628, "grad_norm": 0.203125, "learning_rate": 9.73235821233643e-05, "loss": 0.0532, "step": 1636 }, { "epoch": 0.21051052338291398, "grad_norm": 0.2353515625, "learning_rate": 9.732031881846646e-05, "loss": 0.0748, "step": 1637 }, { "epoch": 0.21063911869347166, "grad_norm": 0.2216796875, "learning_rate": 9.731705358011573e-05, "loss": 0.0586, "step": 1638 }, { "epoch": 0.21076771400402933, "grad_norm": 0.2138671875, "learning_rate": 9.73137864084455e-05, "loss": 0.0605, "step": 1639 }, { "epoch": 0.210896309314587, "grad_norm": 0.2109375, "learning_rate": 9.731051730358932e-05, "loss": 0.0633, "step": 1640 }, { "epoch": 0.21102490462514467, "grad_norm": 0.1982421875, "learning_rate": 9.730724626568067e-05, "loss": 0.0572, "step": 1641 }, { "epoch": 0.21115349993570234, "grad_norm": 0.2138671875, "learning_rate": 9.730397329485327e-05, "loss": 0.0651, "step": 1642 }, { "epoch": 0.21128209524626002, "grad_norm": 0.2216796875, "learning_rate": 9.730069839124083e-05, "loss": 0.0688, "step": 1643 }, { "epoch": 0.2114106905568177, "grad_norm": 0.19921875, "learning_rate": 9.729742155497715e-05, "loss": 0.0551, "step": 1644 }, { "epoch": 0.21153928586737536, "grad_norm": 0.240234375, "learning_rate": 9.729414278619614e-05, "loss": 0.0567, "step": 1645 }, { "epoch": 0.21166788117793303, "grad_norm": 0.2060546875, "learning_rate": 9.729086208503174e-05, "loss": 0.0521, "step": 1646 }, { "epoch": 0.21179647648849073, "grad_norm": 0.201171875, "learning_rate": 9.728757945161801e-05, "loss": 0.0564, "step": 1647 }, { "epoch": 0.2119250717990484, "grad_norm": 0.212890625, "learning_rate": 9.728429488608908e-05, "loss": 0.0602, "step": 1648 }, { "epoch": 0.21205366710960608, "grad_norm": 0.2021484375, "learning_rate": 9.728100838857914e-05, "loss": 0.0617, "step": 1649 }, { "epoch": 0.21218226242016375, "grad_norm": 0.22265625, "learning_rate": 9.727771995922249e-05, "loss": 0.0591, "step": 1650 }, { "epoch": 0.21231085773072142, "grad_norm": 0.1806640625, "learning_rate": 9.727442959815347e-05, "loss": 0.0492, "step": 1651 }, { "epoch": 0.2124394530412791, "grad_norm": 0.1865234375, "learning_rate": 9.727113730550655e-05, "loss": 0.0521, "step": 1652 }, { "epoch": 0.21256804835183676, "grad_norm": 0.23046875, "learning_rate": 9.72678430814162e-05, "loss": 0.0678, "step": 1653 }, { "epoch": 0.21269664366239444, "grad_norm": 0.259765625, "learning_rate": 9.726454692601706e-05, "loss": 0.073, "step": 1654 }, { "epoch": 0.2128252389729521, "grad_norm": 0.201171875, "learning_rate": 9.72612488394438e-05, "loss": 0.0608, "step": 1655 }, { "epoch": 0.2129538342835098, "grad_norm": 0.220703125, "learning_rate": 9.725794882183117e-05, "loss": 0.0585, "step": 1656 }, { "epoch": 0.21308242959406748, "grad_norm": 0.21484375, "learning_rate": 9.7254646873314e-05, "loss": 0.0567, "step": 1657 }, { "epoch": 0.21321102490462515, "grad_norm": 0.22265625, "learning_rate": 9.725134299402723e-05, "loss": 0.064, "step": 1658 }, { "epoch": 0.21333962021518282, "grad_norm": 0.1904296875, "learning_rate": 9.724803718410581e-05, "loss": 0.0518, "step": 1659 }, { "epoch": 0.2134682155257405, "grad_norm": 0.2197265625, "learning_rate": 9.724472944368484e-05, "loss": 0.0684, "step": 1660 }, { "epoch": 0.21359681083629817, "grad_norm": 0.2236328125, "learning_rate": 9.724141977289946e-05, "loss": 0.0726, "step": 1661 }, { "epoch": 0.21372540614685584, "grad_norm": 0.2109375, "learning_rate": 9.723810817188492e-05, "loss": 0.0601, "step": 1662 }, { "epoch": 0.2138540014574135, "grad_norm": 0.193359375, "learning_rate": 9.723479464077649e-05, "loss": 0.0563, "step": 1663 }, { "epoch": 0.21398259676797118, "grad_norm": 0.2197265625, "learning_rate": 9.72314791797096e-05, "loss": 0.07, "step": 1664 }, { "epoch": 0.21411119207852886, "grad_norm": 0.1875, "learning_rate": 9.722816178881968e-05, "loss": 0.0545, "step": 1665 }, { "epoch": 0.21423978738908656, "grad_norm": 0.21875, "learning_rate": 9.72248424682423e-05, "loss": 0.0634, "step": 1666 }, { "epoch": 0.21436838269964423, "grad_norm": 0.259765625, "learning_rate": 9.722152121811307e-05, "loss": 0.0617, "step": 1667 }, { "epoch": 0.2144969780102019, "grad_norm": 0.2177734375, "learning_rate": 9.721819803856769e-05, "loss": 0.055, "step": 1668 }, { "epoch": 0.21462557332075957, "grad_norm": 0.203125, "learning_rate": 9.721487292974195e-05, "loss": 0.0551, "step": 1669 }, { "epoch": 0.21475416863131724, "grad_norm": 0.248046875, "learning_rate": 9.72115458917717e-05, "loss": 0.0698, "step": 1670 }, { "epoch": 0.21488276394187492, "grad_norm": 0.2119140625, "learning_rate": 9.720821692479289e-05, "loss": 0.0566, "step": 1671 }, { "epoch": 0.2150113592524326, "grad_norm": 0.1982421875, "learning_rate": 9.720488602894154e-05, "loss": 0.0505, "step": 1672 }, { "epoch": 0.21513995456299026, "grad_norm": 0.19921875, "learning_rate": 9.720155320435374e-05, "loss": 0.063, "step": 1673 }, { "epoch": 0.21526854987354793, "grad_norm": 0.212890625, "learning_rate": 9.719821845116566e-05, "loss": 0.0598, "step": 1674 }, { "epoch": 0.21539714518410563, "grad_norm": 0.21875, "learning_rate": 9.719488176951357e-05, "loss": 0.063, "step": 1675 }, { "epoch": 0.2155257404946633, "grad_norm": 0.2119140625, "learning_rate": 9.719154315953379e-05, "loss": 0.064, "step": 1676 }, { "epoch": 0.21565433580522098, "grad_norm": 0.1953125, "learning_rate": 9.71882026213627e-05, "loss": 0.0602, "step": 1677 }, { "epoch": 0.21578293111577865, "grad_norm": 0.1845703125, "learning_rate": 9.718486015513687e-05, "loss": 0.0513, "step": 1678 }, { "epoch": 0.21591152642633632, "grad_norm": 0.212890625, "learning_rate": 9.71815157609928e-05, "loss": 0.061, "step": 1679 }, { "epoch": 0.216040121736894, "grad_norm": 0.2119140625, "learning_rate": 9.717816943906716e-05, "loss": 0.0631, "step": 1680 }, { "epoch": 0.21616871704745166, "grad_norm": 0.19921875, "learning_rate": 9.717482118949668e-05, "loss": 0.0567, "step": 1681 }, { "epoch": 0.21629731235800934, "grad_norm": 0.208984375, "learning_rate": 9.717147101241816e-05, "loss": 0.065, "step": 1682 }, { "epoch": 0.216425907668567, "grad_norm": 0.22265625, "learning_rate": 9.71681189079685e-05, "loss": 0.0685, "step": 1683 }, { "epoch": 0.2165545029791247, "grad_norm": 0.220703125, "learning_rate": 9.716476487628466e-05, "loss": 0.0585, "step": 1684 }, { "epoch": 0.21668309828968238, "grad_norm": 0.1982421875, "learning_rate": 9.716140891750365e-05, "loss": 0.0603, "step": 1685 }, { "epoch": 0.21681169360024005, "grad_norm": 0.2197265625, "learning_rate": 9.715805103176262e-05, "loss": 0.0643, "step": 1686 }, { "epoch": 0.21694028891079772, "grad_norm": 0.2265625, "learning_rate": 9.715469121919877e-05, "loss": 0.0725, "step": 1687 }, { "epoch": 0.2170688842213554, "grad_norm": 0.2451171875, "learning_rate": 9.715132947994936e-05, "loss": 0.0695, "step": 1688 }, { "epoch": 0.21719747953191307, "grad_norm": 0.20703125, "learning_rate": 9.714796581415178e-05, "loss": 0.0516, "step": 1689 }, { "epoch": 0.21732607484247074, "grad_norm": 0.21875, "learning_rate": 9.714460022194343e-05, "loss": 0.0661, "step": 1690 }, { "epoch": 0.2174546701530284, "grad_norm": 0.2294921875, "learning_rate": 9.714123270346183e-05, "loss": 0.0726, "step": 1691 }, { "epoch": 0.21758326546358608, "grad_norm": 0.21484375, "learning_rate": 9.713786325884458e-05, "loss": 0.0598, "step": 1692 }, { "epoch": 0.21771186077414376, "grad_norm": 0.21875, "learning_rate": 9.713449188822936e-05, "loss": 0.0648, "step": 1693 }, { "epoch": 0.21784045608470146, "grad_norm": 0.2021484375, "learning_rate": 9.71311185917539e-05, "loss": 0.054, "step": 1694 }, { "epoch": 0.21796905139525913, "grad_norm": 0.2314453125, "learning_rate": 9.712774336955606e-05, "loss": 0.067, "step": 1695 }, { "epoch": 0.2180976467058168, "grad_norm": 0.2119140625, "learning_rate": 9.712436622177372e-05, "loss": 0.0659, "step": 1696 }, { "epoch": 0.21822624201637447, "grad_norm": 0.2265625, "learning_rate": 9.712098714854488e-05, "loss": 0.0617, "step": 1697 }, { "epoch": 0.21835483732693214, "grad_norm": 0.2255859375, "learning_rate": 9.711760615000759e-05, "loss": 0.0616, "step": 1698 }, { "epoch": 0.21848343263748982, "grad_norm": 0.23046875, "learning_rate": 9.71142232263e-05, "loss": 0.0622, "step": 1699 }, { "epoch": 0.2186120279480475, "grad_norm": 0.220703125, "learning_rate": 9.711083837756035e-05, "loss": 0.0672, "step": 1700 }, { "epoch": 0.21874062325860516, "grad_norm": 0.1962890625, "learning_rate": 9.710745160392692e-05, "loss": 0.0553, "step": 1701 }, { "epoch": 0.21886921856916283, "grad_norm": 0.212890625, "learning_rate": 9.71040629055381e-05, "loss": 0.0569, "step": 1702 }, { "epoch": 0.21899781387972053, "grad_norm": 0.208984375, "learning_rate": 9.710067228253234e-05, "loss": 0.0554, "step": 1703 }, { "epoch": 0.2191264091902782, "grad_norm": 0.228515625, "learning_rate": 9.709727973504819e-05, "loss": 0.059, "step": 1704 }, { "epoch": 0.21925500450083588, "grad_norm": 0.205078125, "learning_rate": 9.709388526322424e-05, "loss": 0.0615, "step": 1705 }, { "epoch": 0.21938359981139355, "grad_norm": 0.1904296875, "learning_rate": 9.709048886719922e-05, "loss": 0.0456, "step": 1706 }, { "epoch": 0.21951219512195122, "grad_norm": 0.2451171875, "learning_rate": 9.708709054711187e-05, "loss": 0.0673, "step": 1707 }, { "epoch": 0.2196407904325089, "grad_norm": 0.2109375, "learning_rate": 9.708369030310108e-05, "loss": 0.0749, "step": 1708 }, { "epoch": 0.21976938574306656, "grad_norm": 0.2333984375, "learning_rate": 9.708028813530572e-05, "loss": 0.0603, "step": 1709 }, { "epoch": 0.21989798105362424, "grad_norm": 0.21875, "learning_rate": 9.707688404386486e-05, "loss": 0.0588, "step": 1710 }, { "epoch": 0.2200265763641819, "grad_norm": 0.25390625, "learning_rate": 9.707347802891754e-05, "loss": 0.06, "step": 1711 }, { "epoch": 0.2201551716747396, "grad_norm": 0.251953125, "learning_rate": 9.707007009060296e-05, "loss": 0.081, "step": 1712 }, { "epoch": 0.22028376698529728, "grad_norm": 0.2109375, "learning_rate": 9.706666022906034e-05, "loss": 0.0559, "step": 1713 }, { "epoch": 0.22041236229585495, "grad_norm": 0.2197265625, "learning_rate": 9.706324844442904e-05, "loss": 0.0618, "step": 1714 }, { "epoch": 0.22054095760641262, "grad_norm": 0.2392578125, "learning_rate": 9.705983473684842e-05, "loss": 0.0638, "step": 1715 }, { "epoch": 0.2206695529169703, "grad_norm": 0.189453125, "learning_rate": 9.705641910645798e-05, "loss": 0.0577, "step": 1716 }, { "epoch": 0.22079814822752797, "grad_norm": 0.23046875, "learning_rate": 9.705300155339727e-05, "loss": 0.0587, "step": 1717 }, { "epoch": 0.22092674353808564, "grad_norm": 0.2255859375, "learning_rate": 9.704958207780592e-05, "loss": 0.0554, "step": 1718 }, { "epoch": 0.2210553388486433, "grad_norm": 0.2158203125, "learning_rate": 9.704616067982369e-05, "loss": 0.0666, "step": 1719 }, { "epoch": 0.22118393415920098, "grad_norm": 0.2138671875, "learning_rate": 9.704273735959031e-05, "loss": 0.058, "step": 1720 }, { "epoch": 0.22131252946975866, "grad_norm": 0.24609375, "learning_rate": 9.703931211724571e-05, "loss": 0.0638, "step": 1721 }, { "epoch": 0.22144112478031636, "grad_norm": 0.2099609375, "learning_rate": 9.703588495292979e-05, "loss": 0.0559, "step": 1722 }, { "epoch": 0.22156972009087403, "grad_norm": 0.2021484375, "learning_rate": 9.703245586678262e-05, "loss": 0.055, "step": 1723 }, { "epoch": 0.2216983154014317, "grad_norm": 0.2041015625, "learning_rate": 9.70290248589443e-05, "loss": 0.0606, "step": 1724 }, { "epoch": 0.22182691071198937, "grad_norm": 0.2255859375, "learning_rate": 9.7025591929555e-05, "loss": 0.0666, "step": 1725 }, { "epoch": 0.22195550602254704, "grad_norm": 0.2099609375, "learning_rate": 9.7022157078755e-05, "loss": 0.0587, "step": 1726 }, { "epoch": 0.22208410133310472, "grad_norm": 0.2041015625, "learning_rate": 9.701872030668466e-05, "loss": 0.0577, "step": 1727 }, { "epoch": 0.2222126966436624, "grad_norm": 0.224609375, "learning_rate": 9.701528161348437e-05, "loss": 0.0576, "step": 1728 }, { "epoch": 0.22234129195422006, "grad_norm": 0.2392578125, "learning_rate": 9.701184099929463e-05, "loss": 0.0721, "step": 1729 }, { "epoch": 0.22246988726477773, "grad_norm": 0.23046875, "learning_rate": 9.700839846425605e-05, "loss": 0.0701, "step": 1730 }, { "epoch": 0.22259848257533543, "grad_norm": 0.2041015625, "learning_rate": 9.700495400850926e-05, "loss": 0.0542, "step": 1731 }, { "epoch": 0.2227270778858931, "grad_norm": 0.193359375, "learning_rate": 9.700150763219502e-05, "loss": 0.0589, "step": 1732 }, { "epoch": 0.22285567319645078, "grad_norm": 0.18359375, "learning_rate": 9.699805933545413e-05, "loss": 0.053, "step": 1733 }, { "epoch": 0.22298426850700845, "grad_norm": 0.220703125, "learning_rate": 9.699460911842748e-05, "loss": 0.0645, "step": 1734 }, { "epoch": 0.22311286381756612, "grad_norm": 0.2333984375, "learning_rate": 9.699115698125606e-05, "loss": 0.0665, "step": 1735 }, { "epoch": 0.2232414591281238, "grad_norm": 0.203125, "learning_rate": 9.69877029240809e-05, "loss": 0.0584, "step": 1736 }, { "epoch": 0.22337005443868146, "grad_norm": 0.1904296875, "learning_rate": 9.698424694704314e-05, "loss": 0.0542, "step": 1737 }, { "epoch": 0.22349864974923914, "grad_norm": 0.2001953125, "learning_rate": 9.698078905028398e-05, "loss": 0.0619, "step": 1738 }, { "epoch": 0.2236272450597968, "grad_norm": 0.234375, "learning_rate": 9.697732923394473e-05, "loss": 0.058, "step": 1739 }, { "epoch": 0.22375584037035448, "grad_norm": 0.2392578125, "learning_rate": 9.697386749816672e-05, "loss": 0.0736, "step": 1740 }, { "epoch": 0.22388443568091218, "grad_norm": 0.2099609375, "learning_rate": 9.697040384309139e-05, "loss": 0.057, "step": 1741 }, { "epoch": 0.22401303099146985, "grad_norm": 0.2197265625, "learning_rate": 9.69669382688603e-05, "loss": 0.0605, "step": 1742 }, { "epoch": 0.22414162630202752, "grad_norm": 0.2236328125, "learning_rate": 9.696347077561502e-05, "loss": 0.058, "step": 1743 }, { "epoch": 0.2242702216125852, "grad_norm": 0.2392578125, "learning_rate": 9.696000136349724e-05, "loss": 0.0647, "step": 1744 }, { "epoch": 0.22439881692314287, "grad_norm": 0.2265625, "learning_rate": 9.695653003264871e-05, "loss": 0.0615, "step": 1745 }, { "epoch": 0.22452741223370054, "grad_norm": 0.2041015625, "learning_rate": 9.695305678321128e-05, "loss": 0.0601, "step": 1746 }, { "epoch": 0.2246560075442582, "grad_norm": 0.2216796875, "learning_rate": 9.69495816153268e-05, "loss": 0.0708, "step": 1747 }, { "epoch": 0.22478460285481588, "grad_norm": 0.193359375, "learning_rate": 9.694610452913734e-05, "loss": 0.0467, "step": 1748 }, { "epoch": 0.22491319816537356, "grad_norm": 0.2060546875, "learning_rate": 9.694262552478496e-05, "loss": 0.0611, "step": 1749 }, { "epoch": 0.22504179347593126, "grad_norm": 0.2373046875, "learning_rate": 9.693914460241175e-05, "loss": 0.0663, "step": 1750 }, { "epoch": 0.22517038878648893, "grad_norm": 0.2412109375, "learning_rate": 9.693566176216e-05, "loss": 0.0644, "step": 1751 }, { "epoch": 0.2252989840970466, "grad_norm": 0.1962890625, "learning_rate": 9.6932177004172e-05, "loss": 0.0628, "step": 1752 }, { "epoch": 0.22542757940760427, "grad_norm": 0.2255859375, "learning_rate": 9.692869032859009e-05, "loss": 0.0619, "step": 1753 }, { "epoch": 0.22555617471816194, "grad_norm": 0.212890625, "learning_rate": 9.692520173555676e-05, "loss": 0.0584, "step": 1754 }, { "epoch": 0.22568477002871962, "grad_norm": 0.255859375, "learning_rate": 9.692171122521459e-05, "loss": 0.0588, "step": 1755 }, { "epoch": 0.2258133653392773, "grad_norm": 0.1806640625, "learning_rate": 9.691821879770613e-05, "loss": 0.0421, "step": 1756 }, { "epoch": 0.22594196064983496, "grad_norm": 0.1982421875, "learning_rate": 9.691472445317411e-05, "loss": 0.0526, "step": 1757 }, { "epoch": 0.22607055596039263, "grad_norm": 0.1982421875, "learning_rate": 9.691122819176131e-05, "loss": 0.0548, "step": 1758 }, { "epoch": 0.22619915127095033, "grad_norm": 0.2255859375, "learning_rate": 9.690773001361058e-05, "loss": 0.061, "step": 1759 }, { "epoch": 0.226327746581508, "grad_norm": 0.2060546875, "learning_rate": 9.690422991886484e-05, "loss": 0.0539, "step": 1760 }, { "epoch": 0.22645634189206568, "grad_norm": 0.185546875, "learning_rate": 9.690072790766711e-05, "loss": 0.052, "step": 1761 }, { "epoch": 0.22658493720262335, "grad_norm": 0.216796875, "learning_rate": 9.689722398016047e-05, "loss": 0.0701, "step": 1762 }, { "epoch": 0.22671353251318102, "grad_norm": 0.1943359375, "learning_rate": 9.689371813648808e-05, "loss": 0.0552, "step": 1763 }, { "epoch": 0.2268421278237387, "grad_norm": 0.189453125, "learning_rate": 9.689021037679322e-05, "loss": 0.0494, "step": 1764 }, { "epoch": 0.22697072313429636, "grad_norm": 0.185546875, "learning_rate": 9.68867007012192e-05, "loss": 0.0487, "step": 1765 }, { "epoch": 0.22709931844485404, "grad_norm": 0.2041015625, "learning_rate": 9.688318910990937e-05, "loss": 0.0546, "step": 1766 }, { "epoch": 0.2272279137554117, "grad_norm": 0.2099609375, "learning_rate": 9.687967560300727e-05, "loss": 0.0585, "step": 1767 }, { "epoch": 0.22735650906596938, "grad_norm": 0.208984375, "learning_rate": 9.687616018065644e-05, "loss": 0.0597, "step": 1768 }, { "epoch": 0.22748510437652708, "grad_norm": 0.2275390625, "learning_rate": 9.68726428430005e-05, "loss": 0.0519, "step": 1769 }, { "epoch": 0.22761369968708475, "grad_norm": 0.267578125, "learning_rate": 9.68691235901832e-05, "loss": 0.0658, "step": 1770 }, { "epoch": 0.22774229499764242, "grad_norm": 0.2109375, "learning_rate": 9.68656024223483e-05, "loss": 0.0629, "step": 1771 }, { "epoch": 0.2278708903082001, "grad_norm": 0.216796875, "learning_rate": 9.686207933963967e-05, "loss": 0.0577, "step": 1772 }, { "epoch": 0.22799948561875777, "grad_norm": 0.19921875, "learning_rate": 9.685855434220128e-05, "loss": 0.0556, "step": 1773 }, { "epoch": 0.22812808092931544, "grad_norm": 0.2373046875, "learning_rate": 9.685502743017716e-05, "loss": 0.0563, "step": 1774 }, { "epoch": 0.2282566762398731, "grad_norm": 0.2119140625, "learning_rate": 9.685149860371139e-05, "loss": 0.0583, "step": 1775 }, { "epoch": 0.22838527155043079, "grad_norm": 0.19921875, "learning_rate": 9.684796786294817e-05, "loss": 0.0604, "step": 1776 }, { "epoch": 0.22851386686098846, "grad_norm": 0.2109375, "learning_rate": 9.684443520803175e-05, "loss": 0.0549, "step": 1777 }, { "epoch": 0.22864246217154616, "grad_norm": 0.2197265625, "learning_rate": 9.68409006391065e-05, "loss": 0.0578, "step": 1778 }, { "epoch": 0.22877105748210383, "grad_norm": 0.2109375, "learning_rate": 9.683736415631679e-05, "loss": 0.0633, "step": 1779 }, { "epoch": 0.2288996527926615, "grad_norm": 0.212890625, "learning_rate": 9.683382575980716e-05, "loss": 0.0582, "step": 1780 }, { "epoch": 0.22902824810321917, "grad_norm": 0.2001953125, "learning_rate": 9.683028544972216e-05, "loss": 0.0531, "step": 1781 }, { "epoch": 0.22915684341377685, "grad_norm": 0.2578125, "learning_rate": 9.682674322620646e-05, "loss": 0.0751, "step": 1782 }, { "epoch": 0.22928543872433452, "grad_norm": 0.1962890625, "learning_rate": 9.682319908940478e-05, "loss": 0.0534, "step": 1783 }, { "epoch": 0.2294140340348922, "grad_norm": 0.2119140625, "learning_rate": 9.681965303946193e-05, "loss": 0.0617, "step": 1784 }, { "epoch": 0.22954262934544986, "grad_norm": 0.2119140625, "learning_rate": 9.681610507652279e-05, "loss": 0.0538, "step": 1785 }, { "epoch": 0.22967122465600753, "grad_norm": 0.2080078125, "learning_rate": 9.681255520073236e-05, "loss": 0.0591, "step": 1786 }, { "epoch": 0.22979981996656523, "grad_norm": 0.193359375, "learning_rate": 9.680900341223564e-05, "loss": 0.0514, "step": 1787 }, { "epoch": 0.2299284152771229, "grad_norm": 0.1865234375, "learning_rate": 9.680544971117778e-05, "loss": 0.0546, "step": 1788 }, { "epoch": 0.23005701058768058, "grad_norm": 0.1962890625, "learning_rate": 9.680189409770395e-05, "loss": 0.0535, "step": 1789 }, { "epoch": 0.23018560589823825, "grad_norm": 0.24609375, "learning_rate": 9.679833657195948e-05, "loss": 0.0665, "step": 1790 }, { "epoch": 0.23031420120879592, "grad_norm": 0.1748046875, "learning_rate": 9.679477713408966e-05, "loss": 0.0485, "step": 1791 }, { "epoch": 0.2304427965193536, "grad_norm": 0.2060546875, "learning_rate": 9.679121578423998e-05, "loss": 0.06, "step": 1792 }, { "epoch": 0.23057139182991127, "grad_norm": 0.2216796875, "learning_rate": 9.678765252255593e-05, "loss": 0.0649, "step": 1793 }, { "epoch": 0.23069998714046894, "grad_norm": 0.1953125, "learning_rate": 9.678408734918312e-05, "loss": 0.0569, "step": 1794 }, { "epoch": 0.2308285824510266, "grad_norm": 0.255859375, "learning_rate": 9.678052026426718e-05, "loss": 0.0762, "step": 1795 }, { "epoch": 0.23095717776158428, "grad_norm": 0.21875, "learning_rate": 9.677695126795388e-05, "loss": 0.0654, "step": 1796 }, { "epoch": 0.23108577307214198, "grad_norm": 0.2373046875, "learning_rate": 9.677338036038905e-05, "loss": 0.0654, "step": 1797 }, { "epoch": 0.23121436838269965, "grad_norm": 0.197265625, "learning_rate": 9.676980754171859e-05, "loss": 0.0471, "step": 1798 }, { "epoch": 0.23134296369325733, "grad_norm": 0.2099609375, "learning_rate": 9.676623281208847e-05, "loss": 0.0655, "step": 1799 }, { "epoch": 0.231471559003815, "grad_norm": 0.21875, "learning_rate": 9.676265617164476e-05, "loss": 0.0595, "step": 1800 }, { "epoch": 0.23160015431437267, "grad_norm": 0.2265625, "learning_rate": 9.67590776205336e-05, "loss": 0.0658, "step": 1801 }, { "epoch": 0.23172874962493034, "grad_norm": 0.2421875, "learning_rate": 9.675549715890118e-05, "loss": 0.0641, "step": 1802 }, { "epoch": 0.231857344935488, "grad_norm": 0.2431640625, "learning_rate": 9.675191478689384e-05, "loss": 0.0705, "step": 1803 }, { "epoch": 0.23198594024604569, "grad_norm": 0.1904296875, "learning_rate": 9.674833050465792e-05, "loss": 0.0533, "step": 1804 }, { "epoch": 0.23211453555660336, "grad_norm": 0.1845703125, "learning_rate": 9.674474431233986e-05, "loss": 0.048, "step": 1805 }, { "epoch": 0.23224313086716106, "grad_norm": 0.189453125, "learning_rate": 9.674115621008621e-05, "loss": 0.0538, "step": 1806 }, { "epoch": 0.23237172617771873, "grad_norm": 0.2431640625, "learning_rate": 9.673756619804357e-05, "loss": 0.064, "step": 1807 }, { "epoch": 0.2325003214882764, "grad_norm": 0.201171875, "learning_rate": 9.673397427635862e-05, "loss": 0.0427, "step": 1808 }, { "epoch": 0.23262891679883407, "grad_norm": 0.1943359375, "learning_rate": 9.673038044517813e-05, "loss": 0.0534, "step": 1809 }, { "epoch": 0.23275751210939175, "grad_norm": 0.2021484375, "learning_rate": 9.672678470464892e-05, "loss": 0.0576, "step": 1810 }, { "epoch": 0.23288610741994942, "grad_norm": 0.2197265625, "learning_rate": 9.672318705491793e-05, "loss": 0.0543, "step": 1811 }, { "epoch": 0.2330147027305071, "grad_norm": 0.25, "learning_rate": 9.671958749613215e-05, "loss": 0.0598, "step": 1812 }, { "epoch": 0.23314329804106476, "grad_norm": 0.208984375, "learning_rate": 9.671598602843864e-05, "loss": 0.0521, "step": 1813 }, { "epoch": 0.23327189335162243, "grad_norm": 0.2216796875, "learning_rate": 9.671238265198457e-05, "loss": 0.0611, "step": 1814 }, { "epoch": 0.2334004886621801, "grad_norm": 0.2294921875, "learning_rate": 9.670877736691716e-05, "loss": 0.0621, "step": 1815 }, { "epoch": 0.2335290839727378, "grad_norm": 0.2333984375, "learning_rate": 9.670517017338372e-05, "loss": 0.0684, "step": 1816 }, { "epoch": 0.23365767928329548, "grad_norm": 0.220703125, "learning_rate": 9.670156107153164e-05, "loss": 0.0586, "step": 1817 }, { "epoch": 0.23378627459385315, "grad_norm": 0.20703125, "learning_rate": 9.669795006150837e-05, "loss": 0.0556, "step": 1818 }, { "epoch": 0.23391486990441082, "grad_norm": 0.1962890625, "learning_rate": 9.669433714346146e-05, "loss": 0.0547, "step": 1819 }, { "epoch": 0.2340434652149685, "grad_norm": 0.2109375, "learning_rate": 9.669072231753853e-05, "loss": 0.0578, "step": 1820 }, { "epoch": 0.23417206052552617, "grad_norm": 0.2060546875, "learning_rate": 9.668710558388729e-05, "loss": 0.0646, "step": 1821 }, { "epoch": 0.23430065583608384, "grad_norm": 0.224609375, "learning_rate": 9.668348694265549e-05, "loss": 0.0641, "step": 1822 }, { "epoch": 0.2344292511466415, "grad_norm": 0.26953125, "learning_rate": 9.6679866393991e-05, "loss": 0.0641, "step": 1823 }, { "epoch": 0.23455784645719918, "grad_norm": 0.2412109375, "learning_rate": 9.667624393804176e-05, "loss": 0.0682, "step": 1824 }, { "epoch": 0.23468644176775688, "grad_norm": 0.2294921875, "learning_rate": 9.667261957495575e-05, "loss": 0.0682, "step": 1825 }, { "epoch": 0.23481503707831455, "grad_norm": 0.21484375, "learning_rate": 9.666899330488109e-05, "loss": 0.0667, "step": 1826 }, { "epoch": 0.23494363238887223, "grad_norm": 0.2099609375, "learning_rate": 9.666536512796592e-05, "loss": 0.0607, "step": 1827 }, { "epoch": 0.2350722276994299, "grad_norm": 0.251953125, "learning_rate": 9.66617350443585e-05, "loss": 0.0705, "step": 1828 }, { "epoch": 0.23520082300998757, "grad_norm": 0.251953125, "learning_rate": 9.665810305420713e-05, "loss": 0.0537, "step": 1829 }, { "epoch": 0.23532941832054524, "grad_norm": 0.19140625, "learning_rate": 9.665446915766023e-05, "loss": 0.0537, "step": 1830 }, { "epoch": 0.2354580136311029, "grad_norm": 0.201171875, "learning_rate": 9.665083335486627e-05, "loss": 0.0521, "step": 1831 }, { "epoch": 0.23558660894166059, "grad_norm": 0.2294921875, "learning_rate": 9.66471956459738e-05, "loss": 0.0638, "step": 1832 }, { "epoch": 0.23571520425221826, "grad_norm": 0.1962890625, "learning_rate": 9.664355603113146e-05, "loss": 0.0561, "step": 1833 }, { "epoch": 0.23584379956277596, "grad_norm": 0.2392578125, "learning_rate": 9.663991451048797e-05, "loss": 0.0611, "step": 1834 }, { "epoch": 0.23597239487333363, "grad_norm": 0.2255859375, "learning_rate": 9.663627108419208e-05, "loss": 0.0617, "step": 1835 }, { "epoch": 0.2361009901838913, "grad_norm": 0.197265625, "learning_rate": 9.663262575239271e-05, "loss": 0.0524, "step": 1836 }, { "epoch": 0.23622958549444897, "grad_norm": 0.2080078125, "learning_rate": 9.662897851523875e-05, "loss": 0.0686, "step": 1837 }, { "epoch": 0.23635818080500665, "grad_norm": 0.20703125, "learning_rate": 9.662532937287928e-05, "loss": 0.0509, "step": 1838 }, { "epoch": 0.23648677611556432, "grad_norm": 0.24609375, "learning_rate": 9.662167832546334e-05, "loss": 0.0663, "step": 1839 }, { "epoch": 0.236615371426122, "grad_norm": 0.2421875, "learning_rate": 9.661802537314014e-05, "loss": 0.0584, "step": 1840 }, { "epoch": 0.23674396673667966, "grad_norm": 0.2109375, "learning_rate": 9.661437051605892e-05, "loss": 0.0589, "step": 1841 }, { "epoch": 0.23687256204723733, "grad_norm": 0.1953125, "learning_rate": 9.661071375436906e-05, "loss": 0.0582, "step": 1842 }, { "epoch": 0.237001157357795, "grad_norm": 0.216796875, "learning_rate": 9.66070550882199e-05, "loss": 0.0596, "step": 1843 }, { "epoch": 0.2371297526683527, "grad_norm": 0.1953125, "learning_rate": 9.660339451776097e-05, "loss": 0.0584, "step": 1844 }, { "epoch": 0.23725834797891038, "grad_norm": 0.2236328125, "learning_rate": 9.659973204314183e-05, "loss": 0.0581, "step": 1845 }, { "epoch": 0.23738694328946805, "grad_norm": 0.20703125, "learning_rate": 9.659606766451213e-05, "loss": 0.0558, "step": 1846 }, { "epoch": 0.23751553860002572, "grad_norm": 0.1953125, "learning_rate": 9.659240138202157e-05, "loss": 0.0497, "step": 1847 }, { "epoch": 0.2376441339105834, "grad_norm": 0.1875, "learning_rate": 9.658873319581998e-05, "loss": 0.0512, "step": 1848 }, { "epoch": 0.23777272922114107, "grad_norm": 0.1962890625, "learning_rate": 9.658506310605723e-05, "loss": 0.0466, "step": 1849 }, { "epoch": 0.23790132453169874, "grad_norm": 0.2236328125, "learning_rate": 9.658139111288325e-05, "loss": 0.0605, "step": 1850 }, { "epoch": 0.2380299198422564, "grad_norm": 0.1962890625, "learning_rate": 9.657771721644812e-05, "loss": 0.0521, "step": 1851 }, { "epoch": 0.23815851515281408, "grad_norm": 0.1923828125, "learning_rate": 9.65740414169019e-05, "loss": 0.053, "step": 1852 }, { "epoch": 0.23828711046337178, "grad_norm": 0.224609375, "learning_rate": 9.657036371439482e-05, "loss": 0.0641, "step": 1853 }, { "epoch": 0.23841570577392945, "grad_norm": 0.203125, "learning_rate": 9.656668410907712e-05, "loss": 0.0549, "step": 1854 }, { "epoch": 0.23854430108448713, "grad_norm": 0.2021484375, "learning_rate": 9.656300260109915e-05, "loss": 0.0533, "step": 1855 }, { "epoch": 0.2386728963950448, "grad_norm": 0.203125, "learning_rate": 9.655931919061134e-05, "loss": 0.0522, "step": 1856 }, { "epoch": 0.23880149170560247, "grad_norm": 0.2353515625, "learning_rate": 9.655563387776419e-05, "loss": 0.0698, "step": 1857 }, { "epoch": 0.23893008701616014, "grad_norm": 0.1982421875, "learning_rate": 9.655194666270828e-05, "loss": 0.0598, "step": 1858 }, { "epoch": 0.23905868232671781, "grad_norm": 0.1826171875, "learning_rate": 9.654825754559425e-05, "loss": 0.0533, "step": 1859 }, { "epoch": 0.23918727763727549, "grad_norm": 0.1962890625, "learning_rate": 9.654456652657285e-05, "loss": 0.05, "step": 1860 }, { "epoch": 0.23931587294783316, "grad_norm": 0.1982421875, "learning_rate": 9.654087360579486e-05, "loss": 0.0495, "step": 1861 }, { "epoch": 0.23944446825839083, "grad_norm": 0.2158203125, "learning_rate": 9.653717878341121e-05, "loss": 0.0741, "step": 1862 }, { "epoch": 0.23957306356894853, "grad_norm": 0.2158203125, "learning_rate": 9.653348205957285e-05, "loss": 0.0664, "step": 1863 }, { "epoch": 0.2397016588795062, "grad_norm": 0.2099609375, "learning_rate": 9.65297834344308e-05, "loss": 0.0561, "step": 1864 }, { "epoch": 0.23983025419006387, "grad_norm": 0.2216796875, "learning_rate": 9.652608290813621e-05, "loss": 0.0604, "step": 1865 }, { "epoch": 0.23995884950062155, "grad_norm": 0.185546875, "learning_rate": 9.652238048084028e-05, "loss": 0.0575, "step": 1866 }, { "epoch": 0.24008744481117922, "grad_norm": 0.2119140625, "learning_rate": 9.651867615269427e-05, "loss": 0.0554, "step": 1867 }, { "epoch": 0.2402160401217369, "grad_norm": 0.2041015625, "learning_rate": 9.651496992384956e-05, "loss": 0.051, "step": 1868 }, { "epoch": 0.24034463543229456, "grad_norm": 0.263671875, "learning_rate": 9.651126179445756e-05, "loss": 0.0753, "step": 1869 }, { "epoch": 0.24047323074285223, "grad_norm": 0.189453125, "learning_rate": 9.650755176466978e-05, "loss": 0.0509, "step": 1870 }, { "epoch": 0.2406018260534099, "grad_norm": 0.1787109375, "learning_rate": 9.650383983463781e-05, "loss": 0.0441, "step": 1871 }, { "epoch": 0.2407304213639676, "grad_norm": 0.2177734375, "learning_rate": 9.650012600451333e-05, "loss": 0.0563, "step": 1872 }, { "epoch": 0.24085901667452528, "grad_norm": 0.21875, "learning_rate": 9.649641027444805e-05, "loss": 0.0572, "step": 1873 }, { "epoch": 0.24098761198508295, "grad_norm": 0.208984375, "learning_rate": 9.649269264459382e-05, "loss": 0.0585, "step": 1874 }, { "epoch": 0.24111620729564062, "grad_norm": 0.2099609375, "learning_rate": 9.648897311510252e-05, "loss": 0.0619, "step": 1875 }, { "epoch": 0.2412448026061983, "grad_norm": 0.2265625, "learning_rate": 9.648525168612615e-05, "loss": 0.0658, "step": 1876 }, { "epoch": 0.24137339791675597, "grad_norm": 0.1943359375, "learning_rate": 9.648152835781674e-05, "loss": 0.0536, "step": 1877 }, { "epoch": 0.24150199322731364, "grad_norm": 0.244140625, "learning_rate": 9.647780313032644e-05, "loss": 0.0656, "step": 1878 }, { "epoch": 0.2416305885378713, "grad_norm": 0.1982421875, "learning_rate": 9.647407600380743e-05, "loss": 0.0567, "step": 1879 }, { "epoch": 0.24175918384842898, "grad_norm": 0.2080078125, "learning_rate": 9.647034697841202e-05, "loss": 0.0548, "step": 1880 }, { "epoch": 0.24188777915898668, "grad_norm": 0.2197265625, "learning_rate": 9.646661605429258e-05, "loss": 0.0601, "step": 1881 }, { "epoch": 0.24201637446954435, "grad_norm": 0.208984375, "learning_rate": 9.64628832316015e-05, "loss": 0.0479, "step": 1882 }, { "epoch": 0.24214496978010203, "grad_norm": 0.2060546875, "learning_rate": 9.645914851049137e-05, "loss": 0.0539, "step": 1883 }, { "epoch": 0.2422735650906597, "grad_norm": 0.2060546875, "learning_rate": 9.645541189111475e-05, "loss": 0.0598, "step": 1884 }, { "epoch": 0.24240216040121737, "grad_norm": 0.193359375, "learning_rate": 9.64516733736243e-05, "loss": 0.0494, "step": 1885 }, { "epoch": 0.24253075571177504, "grad_norm": 0.20703125, "learning_rate": 9.64479329581728e-05, "loss": 0.0612, "step": 1886 }, { "epoch": 0.24265935102233271, "grad_norm": 0.2158203125, "learning_rate": 9.644419064491306e-05, "loss": 0.0696, "step": 1887 }, { "epoch": 0.2427879463328904, "grad_norm": 0.1982421875, "learning_rate": 9.644044643399801e-05, "loss": 0.0497, "step": 1888 }, { "epoch": 0.24291654164344806, "grad_norm": 0.25390625, "learning_rate": 9.64367003255806e-05, "loss": 0.051, "step": 1889 }, { "epoch": 0.24304513695400573, "grad_norm": 0.2099609375, "learning_rate": 9.643295231981392e-05, "loss": 0.0577, "step": 1890 }, { "epoch": 0.24317373226456343, "grad_norm": 0.2001953125, "learning_rate": 9.64292024168511e-05, "loss": 0.0484, "step": 1891 }, { "epoch": 0.2433023275751211, "grad_norm": 0.1943359375, "learning_rate": 9.642545061684534e-05, "loss": 0.0497, "step": 1892 }, { "epoch": 0.24343092288567877, "grad_norm": 0.236328125, "learning_rate": 9.642169691994995e-05, "loss": 0.0691, "step": 1893 }, { "epoch": 0.24355951819623645, "grad_norm": 0.1943359375, "learning_rate": 9.641794132631832e-05, "loss": 0.0537, "step": 1894 }, { "epoch": 0.24368811350679412, "grad_norm": 0.19921875, "learning_rate": 9.641418383610388e-05, "loss": 0.0586, "step": 1895 }, { "epoch": 0.2438167088173518, "grad_norm": 0.224609375, "learning_rate": 9.641042444946014e-05, "loss": 0.0606, "step": 1896 }, { "epoch": 0.24394530412790946, "grad_norm": 0.212890625, "learning_rate": 9.640666316654072e-05, "loss": 0.0575, "step": 1897 }, { "epoch": 0.24407389943846713, "grad_norm": 0.193359375, "learning_rate": 9.640289998749932e-05, "loss": 0.0497, "step": 1898 }, { "epoch": 0.2442024947490248, "grad_norm": 0.2158203125, "learning_rate": 9.639913491248968e-05, "loss": 0.0622, "step": 1899 }, { "epoch": 0.2443310900595825, "grad_norm": 0.1982421875, "learning_rate": 9.639536794166562e-05, "loss": 0.0535, "step": 1900 }, { "epoch": 0.24445968537014018, "grad_norm": 0.1953125, "learning_rate": 9.63915990751811e-05, "loss": 0.0544, "step": 1901 }, { "epoch": 0.24458828068069785, "grad_norm": 0.271484375, "learning_rate": 9.638782831319005e-05, "loss": 0.0617, "step": 1902 }, { "epoch": 0.24471687599125552, "grad_norm": 0.1923828125, "learning_rate": 9.63840556558466e-05, "loss": 0.0542, "step": 1903 }, { "epoch": 0.2448454713018132, "grad_norm": 0.20703125, "learning_rate": 9.638028110330485e-05, "loss": 0.0625, "step": 1904 }, { "epoch": 0.24497406661237087, "grad_norm": 0.1943359375, "learning_rate": 9.637650465571906e-05, "loss": 0.0527, "step": 1905 }, { "epoch": 0.24510266192292854, "grad_norm": 0.2138671875, "learning_rate": 9.63727263132435e-05, "loss": 0.0642, "step": 1906 }, { "epoch": 0.2452312572334862, "grad_norm": 0.1923828125, "learning_rate": 9.636894607603257e-05, "loss": 0.0556, "step": 1907 }, { "epoch": 0.24535985254404388, "grad_norm": 0.2138671875, "learning_rate": 9.636516394424071e-05, "loss": 0.0654, "step": 1908 }, { "epoch": 0.24548844785460158, "grad_norm": 0.21875, "learning_rate": 9.636137991802248e-05, "loss": 0.0641, "step": 1909 }, { "epoch": 0.24561704316515925, "grad_norm": 0.2451171875, "learning_rate": 9.635759399753246e-05, "loss": 0.0614, "step": 1910 }, { "epoch": 0.24574563847571693, "grad_norm": 0.212890625, "learning_rate": 9.635380618292535e-05, "loss": 0.0655, "step": 1911 }, { "epoch": 0.2458742337862746, "grad_norm": 0.20703125, "learning_rate": 9.635001647435593e-05, "loss": 0.0591, "step": 1912 }, { "epoch": 0.24600282909683227, "grad_norm": 0.17578125, "learning_rate": 9.634622487197902e-05, "loss": 0.047, "step": 1913 }, { "epoch": 0.24613142440738994, "grad_norm": 0.2041015625, "learning_rate": 9.634243137594955e-05, "loss": 0.0527, "step": 1914 }, { "epoch": 0.24626001971794761, "grad_norm": 0.1904296875, "learning_rate": 9.633863598642253e-05, "loss": 0.0476, "step": 1915 }, { "epoch": 0.2463886150285053, "grad_norm": 0.1875, "learning_rate": 9.633483870355301e-05, "loss": 0.0519, "step": 1916 }, { "epoch": 0.24651721033906296, "grad_norm": 0.2138671875, "learning_rate": 9.633103952749615e-05, "loss": 0.059, "step": 1917 }, { "epoch": 0.24664580564962063, "grad_norm": 0.2119140625, "learning_rate": 9.632723845840721e-05, "loss": 0.0545, "step": 1918 }, { "epoch": 0.24677440096017833, "grad_norm": 0.203125, "learning_rate": 9.632343549644145e-05, "loss": 0.0584, "step": 1919 }, { "epoch": 0.246902996270736, "grad_norm": 0.2099609375, "learning_rate": 9.631963064175428e-05, "loss": 0.0592, "step": 1920 }, { "epoch": 0.24703159158129367, "grad_norm": 0.2158203125, "learning_rate": 9.631582389450118e-05, "loss": 0.0542, "step": 1921 }, { "epoch": 0.24716018689185135, "grad_norm": 0.2021484375, "learning_rate": 9.631201525483766e-05, "loss": 0.0467, "step": 1922 }, { "epoch": 0.24728878220240902, "grad_norm": 0.2216796875, "learning_rate": 9.630820472291935e-05, "loss": 0.0638, "step": 1923 }, { "epoch": 0.2474173775129667, "grad_norm": 0.21875, "learning_rate": 9.630439229890192e-05, "loss": 0.062, "step": 1924 }, { "epoch": 0.24754597282352436, "grad_norm": 0.228515625, "learning_rate": 9.630057798294118e-05, "loss": 0.0584, "step": 1925 }, { "epoch": 0.24767456813408203, "grad_norm": 0.20703125, "learning_rate": 9.629676177519296e-05, "loss": 0.0562, "step": 1926 }, { "epoch": 0.2478031634446397, "grad_norm": 0.2373046875, "learning_rate": 9.629294367581316e-05, "loss": 0.0557, "step": 1927 }, { "epoch": 0.2479317587551974, "grad_norm": 0.19140625, "learning_rate": 9.628912368495784e-05, "loss": 0.05, "step": 1928 }, { "epoch": 0.24806035406575508, "grad_norm": 0.2080078125, "learning_rate": 9.628530180278303e-05, "loss": 0.0589, "step": 1929 }, { "epoch": 0.24818894937631275, "grad_norm": 0.189453125, "learning_rate": 9.628147802944492e-05, "loss": 0.0492, "step": 1930 }, { "epoch": 0.24831754468687042, "grad_norm": 0.224609375, "learning_rate": 9.627765236509971e-05, "loss": 0.0594, "step": 1931 }, { "epoch": 0.2484461399974281, "grad_norm": 0.2021484375, "learning_rate": 9.627382480990374e-05, "loss": 0.0587, "step": 1932 }, { "epoch": 0.24857473530798577, "grad_norm": 0.2021484375, "learning_rate": 9.62699953640134e-05, "loss": 0.0617, "step": 1933 }, { "epoch": 0.24870333061854344, "grad_norm": 0.24609375, "learning_rate": 9.626616402758515e-05, "loss": 0.0733, "step": 1934 }, { "epoch": 0.2488319259291011, "grad_norm": 0.189453125, "learning_rate": 9.626233080077552e-05, "loss": 0.0523, "step": 1935 }, { "epoch": 0.24896052123965878, "grad_norm": 0.22265625, "learning_rate": 9.625849568374116e-05, "loss": 0.0554, "step": 1936 }, { "epoch": 0.24908911655021646, "grad_norm": 0.1943359375, "learning_rate": 9.625465867663875e-05, "loss": 0.0517, "step": 1937 }, { "epoch": 0.24921771186077415, "grad_norm": 0.1845703125, "learning_rate": 9.625081977962505e-05, "loss": 0.0496, "step": 1938 }, { "epoch": 0.24934630717133183, "grad_norm": 0.2158203125, "learning_rate": 9.624697899285696e-05, "loss": 0.0603, "step": 1939 }, { "epoch": 0.2494749024818895, "grad_norm": 0.2373046875, "learning_rate": 9.624313631649136e-05, "loss": 0.0592, "step": 1940 }, { "epoch": 0.24960349779244717, "grad_norm": 0.2041015625, "learning_rate": 9.62392917506853e-05, "loss": 0.0577, "step": 1941 }, { "epoch": 0.24973209310300484, "grad_norm": 0.201171875, "learning_rate": 9.623544529559581e-05, "loss": 0.05, "step": 1942 }, { "epoch": 0.24986068841356252, "grad_norm": 0.2470703125, "learning_rate": 9.623159695138011e-05, "loss": 0.0652, "step": 1943 }, { "epoch": 0.2499892837241202, "grad_norm": 0.19921875, "learning_rate": 9.622774671819541e-05, "loss": 0.058, "step": 1944 }, { "epoch": 0.25011787903467786, "grad_norm": 0.228515625, "learning_rate": 9.622389459619903e-05, "loss": 0.0672, "step": 1945 }, { "epoch": 0.25024647434523556, "grad_norm": 0.236328125, "learning_rate": 9.622004058554835e-05, "loss": 0.0582, "step": 1946 }, { "epoch": 0.2503750696557932, "grad_norm": 0.21875, "learning_rate": 9.621618468640086e-05, "loss": 0.0552, "step": 1947 }, { "epoch": 0.2505036649663509, "grad_norm": 0.1826171875, "learning_rate": 9.621232689891411e-05, "loss": 0.0432, "step": 1948 }, { "epoch": 0.25063226027690855, "grad_norm": 0.19140625, "learning_rate": 9.62084672232457e-05, "loss": 0.0501, "step": 1949 }, { "epoch": 0.25076085558746625, "grad_norm": 0.2490234375, "learning_rate": 9.620460565955336e-05, "loss": 0.0647, "step": 1950 }, { "epoch": 0.2508894508980239, "grad_norm": 0.1982421875, "learning_rate": 9.620074220799485e-05, "loss": 0.0502, "step": 1951 }, { "epoch": 0.2510180462085816, "grad_norm": 0.244140625, "learning_rate": 9.619687686872803e-05, "loss": 0.0562, "step": 1952 }, { "epoch": 0.2511466415191393, "grad_norm": 0.197265625, "learning_rate": 9.619300964191084e-05, "loss": 0.0581, "step": 1953 }, { "epoch": 0.25127523682969694, "grad_norm": 0.216796875, "learning_rate": 9.618914052770129e-05, "loss": 0.0611, "step": 1954 }, { "epoch": 0.25140383214025463, "grad_norm": 0.23046875, "learning_rate": 9.618526952625745e-05, "loss": 0.067, "step": 1955 }, { "epoch": 0.2515324274508123, "grad_norm": 0.1875, "learning_rate": 9.618139663773751e-05, "loss": 0.0518, "step": 1956 }, { "epoch": 0.25166102276137, "grad_norm": 0.2119140625, "learning_rate": 9.61775218622997e-05, "loss": 0.0638, "step": 1957 }, { "epoch": 0.2517896180719276, "grad_norm": 0.2041015625, "learning_rate": 9.617364520010232e-05, "loss": 0.0502, "step": 1958 }, { "epoch": 0.2519182133824853, "grad_norm": 0.232421875, "learning_rate": 9.61697666513038e-05, "loss": 0.055, "step": 1959 }, { "epoch": 0.25204680869304297, "grad_norm": 0.236328125, "learning_rate": 9.61658862160626e-05, "loss": 0.0529, "step": 1960 }, { "epoch": 0.25217540400360067, "grad_norm": 0.2109375, "learning_rate": 9.616200389453727e-05, "loss": 0.0595, "step": 1961 }, { "epoch": 0.25230399931415837, "grad_norm": 0.2216796875, "learning_rate": 9.615811968688642e-05, "loss": 0.0616, "step": 1962 }, { "epoch": 0.252432594624716, "grad_norm": 0.216796875, "learning_rate": 9.615423359326877e-05, "loss": 0.0601, "step": 1963 }, { "epoch": 0.2525611899352737, "grad_norm": 0.171875, "learning_rate": 9.61503456138431e-05, "loss": 0.042, "step": 1964 }, { "epoch": 0.25268978524583136, "grad_norm": 0.2373046875, "learning_rate": 9.614645574876828e-05, "loss": 0.064, "step": 1965 }, { "epoch": 0.25281838055638906, "grad_norm": 0.2578125, "learning_rate": 9.614256399820323e-05, "loss": 0.0642, "step": 1966 }, { "epoch": 0.2529469758669467, "grad_norm": 0.185546875, "learning_rate": 9.613867036230696e-05, "loss": 0.0443, "step": 1967 }, { "epoch": 0.2530755711775044, "grad_norm": 0.1953125, "learning_rate": 9.613477484123856e-05, "loss": 0.0603, "step": 1968 }, { "epoch": 0.25320416648806204, "grad_norm": 0.197265625, "learning_rate": 9.613087743515722e-05, "loss": 0.0501, "step": 1969 }, { "epoch": 0.25333276179861974, "grad_norm": 0.2001953125, "learning_rate": 9.612697814422216e-05, "loss": 0.0554, "step": 1970 }, { "epoch": 0.25346135710917744, "grad_norm": 0.205078125, "learning_rate": 9.61230769685927e-05, "loss": 0.0556, "step": 1971 }, { "epoch": 0.2535899524197351, "grad_norm": 0.20703125, "learning_rate": 9.611917390842825e-05, "loss": 0.0567, "step": 1972 }, { "epoch": 0.2537185477302928, "grad_norm": 0.1953125, "learning_rate": 9.611526896388825e-05, "loss": 0.0563, "step": 1973 }, { "epoch": 0.25384714304085043, "grad_norm": 0.1875, "learning_rate": 9.611136213513231e-05, "loss": 0.0444, "step": 1974 }, { "epoch": 0.25397573835140813, "grad_norm": 0.2216796875, "learning_rate": 9.610745342232001e-05, "loss": 0.0652, "step": 1975 }, { "epoch": 0.2541043336619658, "grad_norm": 0.205078125, "learning_rate": 9.610354282561109e-05, "loss": 0.0644, "step": 1976 }, { "epoch": 0.2542329289725235, "grad_norm": 0.1982421875, "learning_rate": 9.609963034516531e-05, "loss": 0.0477, "step": 1977 }, { "epoch": 0.2543615242830811, "grad_norm": 0.2080078125, "learning_rate": 9.609571598114253e-05, "loss": 0.0546, "step": 1978 }, { "epoch": 0.2544901195936388, "grad_norm": 0.20703125, "learning_rate": 9.609179973370268e-05, "loss": 0.0616, "step": 1979 }, { "epoch": 0.2546187149041965, "grad_norm": 0.2421875, "learning_rate": 9.60878816030058e-05, "loss": 0.0616, "step": 1980 }, { "epoch": 0.25474731021475416, "grad_norm": 0.1806640625, "learning_rate": 9.608396158921195e-05, "loss": 0.0476, "step": 1981 }, { "epoch": 0.25487590552531186, "grad_norm": 0.21484375, "learning_rate": 9.608003969248133e-05, "loss": 0.0594, "step": 1982 }, { "epoch": 0.2550045008358695, "grad_norm": 0.19921875, "learning_rate": 9.607611591297416e-05, "loss": 0.0566, "step": 1983 }, { "epoch": 0.2551330961464272, "grad_norm": 0.2119140625, "learning_rate": 9.607219025085077e-05, "loss": 0.0631, "step": 1984 }, { "epoch": 0.25526169145698485, "grad_norm": 0.2021484375, "learning_rate": 9.606826270627155e-05, "loss": 0.0579, "step": 1985 }, { "epoch": 0.25539028676754255, "grad_norm": 0.212890625, "learning_rate": 9.606433327939698e-05, "loss": 0.0542, "step": 1986 }, { "epoch": 0.2555188820781002, "grad_norm": 0.1845703125, "learning_rate": 9.606040197038761e-05, "loss": 0.0585, "step": 1987 }, { "epoch": 0.2556474773886579, "grad_norm": 0.201171875, "learning_rate": 9.605646877940408e-05, "loss": 0.0561, "step": 1988 }, { "epoch": 0.2557760726992156, "grad_norm": 0.19140625, "learning_rate": 9.605253370660707e-05, "loss": 0.0539, "step": 1989 }, { "epoch": 0.25590466800977324, "grad_norm": 0.1865234375, "learning_rate": 9.604859675215739e-05, "loss": 0.054, "step": 1990 }, { "epoch": 0.25603326332033094, "grad_norm": 0.205078125, "learning_rate": 9.604465791621588e-05, "loss": 0.0569, "step": 1991 }, { "epoch": 0.2561618586308886, "grad_norm": 0.216796875, "learning_rate": 9.60407171989435e-05, "loss": 0.0666, "step": 1992 }, { "epoch": 0.2562904539414463, "grad_norm": 0.2080078125, "learning_rate": 9.603677460050123e-05, "loss": 0.0605, "step": 1993 }, { "epoch": 0.2564190492520039, "grad_norm": 0.1904296875, "learning_rate": 9.603283012105018e-05, "loss": 0.0547, "step": 1994 }, { "epoch": 0.2565476445625616, "grad_norm": 0.1953125, "learning_rate": 9.60288837607515e-05, "loss": 0.063, "step": 1995 }, { "epoch": 0.25667623987311927, "grad_norm": 0.197265625, "learning_rate": 9.602493551976647e-05, "loss": 0.0516, "step": 1996 }, { "epoch": 0.25680483518367697, "grad_norm": 0.2060546875, "learning_rate": 9.602098539825639e-05, "loss": 0.0613, "step": 1997 }, { "epoch": 0.2569334304942346, "grad_norm": 0.1787109375, "learning_rate": 9.601703339638264e-05, "loss": 0.0498, "step": 1998 }, { "epoch": 0.2570620258047923, "grad_norm": 0.2041015625, "learning_rate": 9.601307951430671e-05, "loss": 0.0647, "step": 1999 }, { "epoch": 0.25719062111535, "grad_norm": 0.1865234375, "learning_rate": 9.600912375219016e-05, "loss": 0.045, "step": 2000 }, { "epoch": 0.25719062111535, "eval_loss": 0.05588929355144501, "eval_runtime": 1044.7356, "eval_samples_per_second": 94.02, "eval_steps_per_second": 1.175, "step": 2000 }, { "epoch": 0.25731921642590766, "grad_norm": 0.2177734375, "learning_rate": 9.600516611019461e-05, "loss": 0.0715, "step": 2001 }, { "epoch": 0.25744781173646536, "grad_norm": 0.1953125, "learning_rate": 9.600120658848175e-05, "loss": 0.0457, "step": 2002 }, { "epoch": 0.257576407047023, "grad_norm": 0.185546875, "learning_rate": 9.599724518721338e-05, "loss": 0.0536, "step": 2003 }, { "epoch": 0.2577050023575807, "grad_norm": 0.193359375, "learning_rate": 9.599328190655135e-05, "loss": 0.0548, "step": 2004 }, { "epoch": 0.25783359766813835, "grad_norm": 0.2099609375, "learning_rate": 9.598931674665761e-05, "loss": 0.0512, "step": 2005 }, { "epoch": 0.25796219297869605, "grad_norm": 0.212890625, "learning_rate": 9.598534970769416e-05, "loss": 0.0509, "step": 2006 }, { "epoch": 0.2580907882892537, "grad_norm": 0.2294921875, "learning_rate": 9.598138078982306e-05, "loss": 0.0569, "step": 2007 }, { "epoch": 0.2582193835998114, "grad_norm": 0.2255859375, "learning_rate": 9.597740999320652e-05, "loss": 0.0641, "step": 2008 }, { "epoch": 0.2583479789103691, "grad_norm": 0.2158203125, "learning_rate": 9.597343731800677e-05, "loss": 0.0659, "step": 2009 }, { "epoch": 0.25847657422092674, "grad_norm": 0.2392578125, "learning_rate": 9.596946276438613e-05, "loss": 0.0675, "step": 2010 }, { "epoch": 0.25860516953148444, "grad_norm": 0.193359375, "learning_rate": 9.596548633250699e-05, "loss": 0.051, "step": 2011 }, { "epoch": 0.2587337648420421, "grad_norm": 0.21484375, "learning_rate": 9.596150802253181e-05, "loss": 0.0585, "step": 2012 }, { "epoch": 0.2588623601525998, "grad_norm": 0.20703125, "learning_rate": 9.595752783462315e-05, "loss": 0.0573, "step": 2013 }, { "epoch": 0.2589909554631574, "grad_norm": 0.1953125, "learning_rate": 9.595354576894364e-05, "loss": 0.0611, "step": 2014 }, { "epoch": 0.2591195507737151, "grad_norm": 0.19140625, "learning_rate": 9.594956182565598e-05, "loss": 0.0438, "step": 2015 }, { "epoch": 0.25924814608427277, "grad_norm": 0.193359375, "learning_rate": 9.594557600492295e-05, "loss": 0.0509, "step": 2016 }, { "epoch": 0.25937674139483047, "grad_norm": 0.21484375, "learning_rate": 9.59415883069074e-05, "loss": 0.0645, "step": 2017 }, { "epoch": 0.25950533670538817, "grad_norm": 0.2021484375, "learning_rate": 9.593759873177228e-05, "loss": 0.0579, "step": 2018 }, { "epoch": 0.2596339320159458, "grad_norm": 0.19140625, "learning_rate": 9.59336072796806e-05, "loss": 0.0584, "step": 2019 }, { "epoch": 0.2597625273265035, "grad_norm": 0.1826171875, "learning_rate": 9.59296139507954e-05, "loss": 0.0474, "step": 2020 }, { "epoch": 0.25989112263706116, "grad_norm": 0.1865234375, "learning_rate": 9.59256187452799e-05, "loss": 0.0534, "step": 2021 }, { "epoch": 0.26001971794761886, "grad_norm": 0.2109375, "learning_rate": 9.59216216632973e-05, "loss": 0.0577, "step": 2022 }, { "epoch": 0.2601483132581765, "grad_norm": 0.2041015625, "learning_rate": 9.591762270501092e-05, "loss": 0.0539, "step": 2023 }, { "epoch": 0.2602769085687342, "grad_norm": 0.2119140625, "learning_rate": 9.591362187058419e-05, "loss": 0.0655, "step": 2024 }, { "epoch": 0.26040550387929184, "grad_norm": 0.1923828125, "learning_rate": 9.590961916018056e-05, "loss": 0.0459, "step": 2025 }, { "epoch": 0.26053409918984954, "grad_norm": 0.1962890625, "learning_rate": 9.590561457396355e-05, "loss": 0.0545, "step": 2026 }, { "epoch": 0.26066269450040724, "grad_norm": 0.2392578125, "learning_rate": 9.590160811209679e-05, "loss": 0.0631, "step": 2027 }, { "epoch": 0.2607912898109649, "grad_norm": 0.20703125, "learning_rate": 9.589759977474401e-05, "loss": 0.0574, "step": 2028 }, { "epoch": 0.2609198851215226, "grad_norm": 0.1962890625, "learning_rate": 9.589358956206896e-05, "loss": 0.0589, "step": 2029 }, { "epoch": 0.26104848043208023, "grad_norm": 0.197265625, "learning_rate": 9.588957747423551e-05, "loss": 0.0544, "step": 2030 }, { "epoch": 0.26117707574263793, "grad_norm": 0.19140625, "learning_rate": 9.588556351140757e-05, "loss": 0.0503, "step": 2031 }, { "epoch": 0.2613056710531956, "grad_norm": 0.2001953125, "learning_rate": 9.588154767374916e-05, "loss": 0.0545, "step": 2032 }, { "epoch": 0.2614342663637533, "grad_norm": 0.1904296875, "learning_rate": 9.587752996142436e-05, "loss": 0.0437, "step": 2033 }, { "epoch": 0.2615628616743109, "grad_norm": 0.2177734375, "learning_rate": 9.587351037459733e-05, "loss": 0.0691, "step": 2034 }, { "epoch": 0.2616914569848686, "grad_norm": 0.205078125, "learning_rate": 9.58694889134323e-05, "loss": 0.0578, "step": 2035 }, { "epoch": 0.2618200522954263, "grad_norm": 0.19921875, "learning_rate": 9.586546557809357e-05, "loss": 0.0522, "step": 2036 }, { "epoch": 0.26194864760598396, "grad_norm": 0.2236328125, "learning_rate": 9.586144036874556e-05, "loss": 0.0608, "step": 2037 }, { "epoch": 0.26207724291654166, "grad_norm": 0.1982421875, "learning_rate": 9.585741328555271e-05, "loss": 0.065, "step": 2038 }, { "epoch": 0.2622058382270993, "grad_norm": 0.2314453125, "learning_rate": 9.585338432867958e-05, "loss": 0.0554, "step": 2039 }, { "epoch": 0.262334433537657, "grad_norm": 0.2119140625, "learning_rate": 9.584935349829076e-05, "loss": 0.0553, "step": 2040 }, { "epoch": 0.26246302884821465, "grad_norm": 0.201171875, "learning_rate": 9.584532079455097e-05, "loss": 0.0485, "step": 2041 }, { "epoch": 0.26259162415877235, "grad_norm": 0.1845703125, "learning_rate": 9.584128621762499e-05, "loss": 0.049, "step": 2042 }, { "epoch": 0.26272021946933, "grad_norm": 0.2275390625, "learning_rate": 9.583724976767765e-05, "loss": 0.057, "step": 2043 }, { "epoch": 0.2628488147798877, "grad_norm": 0.2216796875, "learning_rate": 9.583321144487387e-05, "loss": 0.0687, "step": 2044 }, { "epoch": 0.2629774100904454, "grad_norm": 0.21484375, "learning_rate": 9.582917124937867e-05, "loss": 0.0491, "step": 2045 }, { "epoch": 0.26310600540100304, "grad_norm": 0.212890625, "learning_rate": 9.582512918135711e-05, "loss": 0.0594, "step": 2046 }, { "epoch": 0.26323460071156074, "grad_norm": 0.185546875, "learning_rate": 9.582108524097436e-05, "loss": 0.0517, "step": 2047 }, { "epoch": 0.2633631960221184, "grad_norm": 0.1845703125, "learning_rate": 9.581703942839561e-05, "loss": 0.0438, "step": 2048 }, { "epoch": 0.2634917913326761, "grad_norm": 0.23828125, "learning_rate": 9.581299174378624e-05, "loss": 0.0593, "step": 2049 }, { "epoch": 0.26362038664323373, "grad_norm": 0.2138671875, "learning_rate": 9.580894218731158e-05, "loss": 0.0585, "step": 2050 }, { "epoch": 0.26374898195379143, "grad_norm": 0.21875, "learning_rate": 9.580489075913708e-05, "loss": 0.064, "step": 2051 }, { "epoch": 0.2638775772643491, "grad_norm": 0.1982421875, "learning_rate": 9.580083745942834e-05, "loss": 0.05, "step": 2052 }, { "epoch": 0.2640061725749068, "grad_norm": 0.2197265625, "learning_rate": 9.57967822883509e-05, "loss": 0.0583, "step": 2053 }, { "epoch": 0.2641347678854644, "grad_norm": 0.20703125, "learning_rate": 9.579272524607051e-05, "loss": 0.0596, "step": 2054 }, { "epoch": 0.2642633631960221, "grad_norm": 0.19140625, "learning_rate": 9.578866633275288e-05, "loss": 0.0497, "step": 2055 }, { "epoch": 0.2643919585065798, "grad_norm": 0.220703125, "learning_rate": 9.578460554856389e-05, "loss": 0.0629, "step": 2056 }, { "epoch": 0.26452055381713746, "grad_norm": 0.208984375, "learning_rate": 9.578054289366944e-05, "loss": 0.0536, "step": 2057 }, { "epoch": 0.26464914912769516, "grad_norm": 0.2255859375, "learning_rate": 9.577647836823555e-05, "loss": 0.059, "step": 2058 }, { "epoch": 0.2647777444382528, "grad_norm": 0.216796875, "learning_rate": 9.577241197242828e-05, "loss": 0.0704, "step": 2059 }, { "epoch": 0.2649063397488105, "grad_norm": 0.197265625, "learning_rate": 9.576834370641376e-05, "loss": 0.0562, "step": 2060 }, { "epoch": 0.26503493505936815, "grad_norm": 0.2041015625, "learning_rate": 9.576427357035825e-05, "loss": 0.0578, "step": 2061 }, { "epoch": 0.26516353036992585, "grad_norm": 0.2275390625, "learning_rate": 9.576020156442802e-05, "loss": 0.0549, "step": 2062 }, { "epoch": 0.2652921256804835, "grad_norm": 0.2158203125, "learning_rate": 9.575612768878946e-05, "loss": 0.0718, "step": 2063 }, { "epoch": 0.2654207209910412, "grad_norm": 0.2099609375, "learning_rate": 9.575205194360901e-05, "loss": 0.0555, "step": 2064 }, { "epoch": 0.2655493163015989, "grad_norm": 0.18359375, "learning_rate": 9.574797432905322e-05, "loss": 0.0496, "step": 2065 }, { "epoch": 0.26567791161215654, "grad_norm": 0.1923828125, "learning_rate": 9.57438948452887e-05, "loss": 0.0579, "step": 2066 }, { "epoch": 0.26580650692271424, "grad_norm": 0.1953125, "learning_rate": 9.57398134924821e-05, "loss": 0.0536, "step": 2067 }, { "epoch": 0.2659351022332719, "grad_norm": 0.1806640625, "learning_rate": 9.57357302708002e-05, "loss": 0.0436, "step": 2068 }, { "epoch": 0.2660636975438296, "grad_norm": 0.2197265625, "learning_rate": 9.573164518040985e-05, "loss": 0.0644, "step": 2069 }, { "epoch": 0.2661922928543872, "grad_norm": 0.24609375, "learning_rate": 9.572755822147794e-05, "loss": 0.0611, "step": 2070 }, { "epoch": 0.2663208881649449, "grad_norm": 0.2451171875, "learning_rate": 9.572346939417147e-05, "loss": 0.0678, "step": 2071 }, { "epoch": 0.26644948347550257, "grad_norm": 0.2080078125, "learning_rate": 9.571937869865751e-05, "loss": 0.0557, "step": 2072 }, { "epoch": 0.26657807878606027, "grad_norm": 0.2060546875, "learning_rate": 9.571528613510319e-05, "loss": 0.0613, "step": 2073 }, { "epoch": 0.26670667409661797, "grad_norm": 0.212890625, "learning_rate": 9.571119170367571e-05, "loss": 0.0643, "step": 2074 }, { "epoch": 0.2668352694071756, "grad_norm": 0.197265625, "learning_rate": 9.570709540454242e-05, "loss": 0.0553, "step": 2075 }, { "epoch": 0.2669638647177333, "grad_norm": 0.1806640625, "learning_rate": 9.570299723787062e-05, "loss": 0.0554, "step": 2076 }, { "epoch": 0.26709246002829096, "grad_norm": 0.201171875, "learning_rate": 9.569889720382781e-05, "loss": 0.0584, "step": 2077 }, { "epoch": 0.26722105533884866, "grad_norm": 0.22265625, "learning_rate": 9.569479530258148e-05, "loss": 0.0665, "step": 2078 }, { "epoch": 0.2673496506494063, "grad_norm": 0.2255859375, "learning_rate": 9.569069153429924e-05, "loss": 0.0626, "step": 2079 }, { "epoch": 0.267478245959964, "grad_norm": 0.2294921875, "learning_rate": 9.568658589914877e-05, "loss": 0.0515, "step": 2080 }, { "epoch": 0.26760684127052164, "grad_norm": 0.283203125, "learning_rate": 9.568247839729782e-05, "loss": 0.0621, "step": 2081 }, { "epoch": 0.26773543658107934, "grad_norm": 0.2001953125, "learning_rate": 9.567836902891421e-05, "loss": 0.0586, "step": 2082 }, { "epoch": 0.26786403189163704, "grad_norm": 0.2158203125, "learning_rate": 9.567425779416586e-05, "loss": 0.0573, "step": 2083 }, { "epoch": 0.2679926272021947, "grad_norm": 0.1865234375, "learning_rate": 9.567014469322073e-05, "loss": 0.0476, "step": 2084 }, { "epoch": 0.2681212225127524, "grad_norm": 0.2041015625, "learning_rate": 9.56660297262469e-05, "loss": 0.0484, "step": 2085 }, { "epoch": 0.26824981782331003, "grad_norm": 0.23828125, "learning_rate": 9.566191289341247e-05, "loss": 0.0703, "step": 2086 }, { "epoch": 0.26837841313386773, "grad_norm": 0.2314453125, "learning_rate": 9.565779419488567e-05, "loss": 0.0613, "step": 2087 }, { "epoch": 0.2685070084444254, "grad_norm": 0.2060546875, "learning_rate": 9.565367363083479e-05, "loss": 0.0586, "step": 2088 }, { "epoch": 0.2686356037549831, "grad_norm": 0.2099609375, "learning_rate": 9.56495512014282e-05, "loss": 0.063, "step": 2089 }, { "epoch": 0.2687641990655407, "grad_norm": 0.2109375, "learning_rate": 9.56454269068343e-05, "loss": 0.0627, "step": 2090 }, { "epoch": 0.2688927943760984, "grad_norm": 0.189453125, "learning_rate": 9.564130074722164e-05, "loss": 0.0611, "step": 2091 }, { "epoch": 0.2690213896866561, "grad_norm": 0.1806640625, "learning_rate": 9.563717272275878e-05, "loss": 0.0512, "step": 2092 }, { "epoch": 0.26914998499721376, "grad_norm": 0.20703125, "learning_rate": 9.563304283361442e-05, "loss": 0.0578, "step": 2093 }, { "epoch": 0.26927858030777146, "grad_norm": 0.1826171875, "learning_rate": 9.562891107995727e-05, "loss": 0.045, "step": 2094 }, { "epoch": 0.2694071756183291, "grad_norm": 0.2001953125, "learning_rate": 9.562477746195617e-05, "loss": 0.0622, "step": 2095 }, { "epoch": 0.2695357709288868, "grad_norm": 0.271484375, "learning_rate": 9.562064197978001e-05, "loss": 0.0592, "step": 2096 }, { "epoch": 0.26966436623944445, "grad_norm": 0.2041015625, "learning_rate": 9.561650463359777e-05, "loss": 0.0572, "step": 2097 }, { "epoch": 0.26979296155000215, "grad_norm": 0.2177734375, "learning_rate": 9.561236542357846e-05, "loss": 0.0563, "step": 2098 }, { "epoch": 0.2699215568605598, "grad_norm": 0.2080078125, "learning_rate": 9.560822434989125e-05, "loss": 0.0616, "step": 2099 }, { "epoch": 0.2700501521711175, "grad_norm": 0.23046875, "learning_rate": 9.560408141270531e-05, "loss": 0.0657, "step": 2100 }, { "epoch": 0.27017874748167514, "grad_norm": 0.25390625, "learning_rate": 9.559993661218994e-05, "loss": 0.0606, "step": 2101 }, { "epoch": 0.27030734279223284, "grad_norm": 0.203125, "learning_rate": 9.559578994851446e-05, "loss": 0.0562, "step": 2102 }, { "epoch": 0.27043593810279054, "grad_norm": 0.2255859375, "learning_rate": 9.559164142184831e-05, "loss": 0.0694, "step": 2103 }, { "epoch": 0.2705645334133482, "grad_norm": 0.2294921875, "learning_rate": 9.5587491032361e-05, "loss": 0.0546, "step": 2104 }, { "epoch": 0.2706931287239059, "grad_norm": 0.25, "learning_rate": 9.558333878022212e-05, "loss": 0.0604, "step": 2105 }, { "epoch": 0.27082172403446353, "grad_norm": 0.2021484375, "learning_rate": 9.557918466560132e-05, "loss": 0.0548, "step": 2106 }, { "epoch": 0.27095031934502123, "grad_norm": 0.1953125, "learning_rate": 9.557502868866832e-05, "loss": 0.0502, "step": 2107 }, { "epoch": 0.2710789146555789, "grad_norm": 0.212890625, "learning_rate": 9.557087084959293e-05, "loss": 0.0533, "step": 2108 }, { "epoch": 0.2712075099661366, "grad_norm": 0.220703125, "learning_rate": 9.556671114854503e-05, "loss": 0.0608, "step": 2109 }, { "epoch": 0.2713361052766942, "grad_norm": 0.2158203125, "learning_rate": 9.55625495856946e-05, "loss": 0.0596, "step": 2110 }, { "epoch": 0.2714647005872519, "grad_norm": 0.2294921875, "learning_rate": 9.555838616121167e-05, "loss": 0.0564, "step": 2111 }, { "epoch": 0.2715932958978096, "grad_norm": 0.189453125, "learning_rate": 9.555422087526636e-05, "loss": 0.0542, "step": 2112 }, { "epoch": 0.27172189120836726, "grad_norm": 0.193359375, "learning_rate": 9.555005372802883e-05, "loss": 0.0576, "step": 2113 }, { "epoch": 0.27185048651892496, "grad_norm": 0.18359375, "learning_rate": 9.554588471966936e-05, "loss": 0.0474, "step": 2114 }, { "epoch": 0.2719790818294826, "grad_norm": 0.18359375, "learning_rate": 9.55417138503583e-05, "loss": 0.0451, "step": 2115 }, { "epoch": 0.2721076771400403, "grad_norm": 0.2060546875, "learning_rate": 9.553754112026606e-05, "loss": 0.0558, "step": 2116 }, { "epoch": 0.27223627245059795, "grad_norm": 0.197265625, "learning_rate": 9.553336652956314e-05, "loss": 0.0513, "step": 2117 }, { "epoch": 0.27236486776115565, "grad_norm": 0.19140625, "learning_rate": 9.55291900784201e-05, "loss": 0.0505, "step": 2118 }, { "epoch": 0.2724934630717133, "grad_norm": 0.1962890625, "learning_rate": 9.552501176700758e-05, "loss": 0.0548, "step": 2119 }, { "epoch": 0.272622058382271, "grad_norm": 0.205078125, "learning_rate": 9.55208315954963e-05, "loss": 0.0573, "step": 2120 }, { "epoch": 0.2727506536928287, "grad_norm": 0.2236328125, "learning_rate": 9.551664956405708e-05, "loss": 0.0618, "step": 2121 }, { "epoch": 0.27287924900338634, "grad_norm": 0.2197265625, "learning_rate": 9.551246567286079e-05, "loss": 0.0514, "step": 2122 }, { "epoch": 0.27300784431394404, "grad_norm": 0.201171875, "learning_rate": 9.550827992207834e-05, "loss": 0.0605, "step": 2123 }, { "epoch": 0.2731364396245017, "grad_norm": 0.2265625, "learning_rate": 9.55040923118808e-05, "loss": 0.0572, "step": 2124 }, { "epoch": 0.2732650349350594, "grad_norm": 0.2060546875, "learning_rate": 9.549990284243924e-05, "loss": 0.054, "step": 2125 }, { "epoch": 0.273393630245617, "grad_norm": 0.2236328125, "learning_rate": 9.549571151392485e-05, "loss": 0.0619, "step": 2126 }, { "epoch": 0.2735222255561747, "grad_norm": 0.1845703125, "learning_rate": 9.54915183265089e-05, "loss": 0.0501, "step": 2127 }, { "epoch": 0.27365082086673237, "grad_norm": 0.1875, "learning_rate": 9.548732328036267e-05, "loss": 0.0554, "step": 2128 }, { "epoch": 0.27377941617729007, "grad_norm": 0.2197265625, "learning_rate": 9.548312637565761e-05, "loss": 0.0573, "step": 2129 }, { "epoch": 0.27390801148784777, "grad_norm": 0.1923828125, "learning_rate": 9.54789276125652e-05, "loss": 0.0561, "step": 2130 }, { "epoch": 0.2740366067984054, "grad_norm": 0.2177734375, "learning_rate": 9.547472699125696e-05, "loss": 0.0591, "step": 2131 }, { "epoch": 0.2741652021089631, "grad_norm": 0.2080078125, "learning_rate": 9.547052451190456e-05, "loss": 0.0645, "step": 2132 }, { "epoch": 0.27429379741952076, "grad_norm": 0.2021484375, "learning_rate": 9.54663201746797e-05, "loss": 0.0652, "step": 2133 }, { "epoch": 0.27442239273007846, "grad_norm": 0.193359375, "learning_rate": 9.546211397975414e-05, "loss": 0.0527, "step": 2134 }, { "epoch": 0.2745509880406361, "grad_norm": 0.1904296875, "learning_rate": 9.545790592729977e-05, "loss": 0.0526, "step": 2135 }, { "epoch": 0.2746795833511938, "grad_norm": 0.2060546875, "learning_rate": 9.54536960174885e-05, "loss": 0.0513, "step": 2136 }, { "epoch": 0.27480817866175145, "grad_norm": 0.1845703125, "learning_rate": 9.544948425049237e-05, "loss": 0.052, "step": 2137 }, { "epoch": 0.27493677397230915, "grad_norm": 0.2041015625, "learning_rate": 9.544527062648345e-05, "loss": 0.0538, "step": 2138 }, { "epoch": 0.27506536928286685, "grad_norm": 0.2021484375, "learning_rate": 9.54410551456339e-05, "loss": 0.0509, "step": 2139 }, { "epoch": 0.2751939645934245, "grad_norm": 0.232421875, "learning_rate": 9.543683780811599e-05, "loss": 0.0593, "step": 2140 }, { "epoch": 0.2753225599039822, "grad_norm": 0.212890625, "learning_rate": 9.5432618614102e-05, "loss": 0.0617, "step": 2141 }, { "epoch": 0.27545115521453983, "grad_norm": 0.20703125, "learning_rate": 9.542839756376432e-05, "loss": 0.0602, "step": 2142 }, { "epoch": 0.27557975052509753, "grad_norm": 0.2265625, "learning_rate": 9.542417465727545e-05, "loss": 0.0487, "step": 2143 }, { "epoch": 0.2757083458356552, "grad_norm": 0.1865234375, "learning_rate": 9.541994989480791e-05, "loss": 0.0511, "step": 2144 }, { "epoch": 0.2758369411462129, "grad_norm": 0.171875, "learning_rate": 9.541572327653432e-05, "loss": 0.0398, "step": 2145 }, { "epoch": 0.2759655364567705, "grad_norm": 0.17578125, "learning_rate": 9.541149480262738e-05, "loss": 0.0431, "step": 2146 }, { "epoch": 0.2760941317673282, "grad_norm": 0.181640625, "learning_rate": 9.540726447325985e-05, "loss": 0.0506, "step": 2147 }, { "epoch": 0.27622272707788587, "grad_norm": 0.212890625, "learning_rate": 9.540303228860459e-05, "loss": 0.0623, "step": 2148 }, { "epoch": 0.27635132238844357, "grad_norm": 0.208984375, "learning_rate": 9.539879824883453e-05, "loss": 0.0617, "step": 2149 }, { "epoch": 0.27647991769900127, "grad_norm": 0.201171875, "learning_rate": 9.539456235412263e-05, "loss": 0.0524, "step": 2150 }, { "epoch": 0.2766085130095589, "grad_norm": 0.193359375, "learning_rate": 9.539032460464201e-05, "loss": 0.0528, "step": 2151 }, { "epoch": 0.2767371083201166, "grad_norm": 0.248046875, "learning_rate": 9.538608500056579e-05, "loss": 0.0612, "step": 2152 }, { "epoch": 0.27686570363067425, "grad_norm": 0.1982421875, "learning_rate": 9.538184354206721e-05, "loss": 0.056, "step": 2153 }, { "epoch": 0.27699429894123195, "grad_norm": 0.244140625, "learning_rate": 9.537760022931954e-05, "loss": 0.0506, "step": 2154 }, { "epoch": 0.2771228942517896, "grad_norm": 0.1923828125, "learning_rate": 9.53733550624962e-05, "loss": 0.0441, "step": 2155 }, { "epoch": 0.2772514895623473, "grad_norm": 0.2275390625, "learning_rate": 9.536910804177063e-05, "loss": 0.0684, "step": 2156 }, { "epoch": 0.27738008487290494, "grad_norm": 0.1962890625, "learning_rate": 9.536485916731634e-05, "loss": 0.0481, "step": 2157 }, { "epoch": 0.27750868018346264, "grad_norm": 0.2060546875, "learning_rate": 9.536060843930695e-05, "loss": 0.059, "step": 2158 }, { "epoch": 0.27763727549402034, "grad_norm": 0.2119140625, "learning_rate": 9.535635585791613e-05, "loss": 0.0534, "step": 2159 }, { "epoch": 0.277765870804578, "grad_norm": 0.2333984375, "learning_rate": 9.535210142331765e-05, "loss": 0.0638, "step": 2160 }, { "epoch": 0.2778944661151357, "grad_norm": 0.203125, "learning_rate": 9.534784513568533e-05, "loss": 0.0604, "step": 2161 }, { "epoch": 0.27802306142569333, "grad_norm": 0.1962890625, "learning_rate": 9.534358699519308e-05, "loss": 0.0526, "step": 2162 }, { "epoch": 0.27815165673625103, "grad_norm": 0.1904296875, "learning_rate": 9.533932700201489e-05, "loss": 0.0502, "step": 2163 }, { "epoch": 0.2782802520468087, "grad_norm": 0.203125, "learning_rate": 9.53350651563248e-05, "loss": 0.0602, "step": 2164 }, { "epoch": 0.2784088473573664, "grad_norm": 0.2353515625, "learning_rate": 9.533080145829694e-05, "loss": 0.0707, "step": 2165 }, { "epoch": 0.278537442667924, "grad_norm": 0.208984375, "learning_rate": 9.532653590810556e-05, "loss": 0.0592, "step": 2166 }, { "epoch": 0.2786660379784817, "grad_norm": 0.203125, "learning_rate": 9.532226850592491e-05, "loss": 0.0585, "step": 2167 }, { "epoch": 0.2787946332890394, "grad_norm": 0.171875, "learning_rate": 9.531799925192935e-05, "loss": 0.0501, "step": 2168 }, { "epoch": 0.27892322859959706, "grad_norm": 0.2333984375, "learning_rate": 9.531372814629333e-05, "loss": 0.0548, "step": 2169 }, { "epoch": 0.27905182391015476, "grad_norm": 0.193359375, "learning_rate": 9.530945518919138e-05, "loss": 0.0572, "step": 2170 }, { "epoch": 0.2791804192207124, "grad_norm": 0.2001953125, "learning_rate": 9.530518038079805e-05, "loss": 0.0436, "step": 2171 }, { "epoch": 0.2793090145312701, "grad_norm": 0.1982421875, "learning_rate": 9.530090372128803e-05, "loss": 0.0567, "step": 2172 }, { "epoch": 0.27943760984182775, "grad_norm": 0.2353515625, "learning_rate": 9.529662521083604e-05, "loss": 0.0646, "step": 2173 }, { "epoch": 0.27956620515238545, "grad_norm": 0.177734375, "learning_rate": 9.529234484961692e-05, "loss": 0.0456, "step": 2174 }, { "epoch": 0.2796948004629431, "grad_norm": 0.1806640625, "learning_rate": 9.528806263780555e-05, "loss": 0.0468, "step": 2175 }, { "epoch": 0.2798233957735008, "grad_norm": 0.1904296875, "learning_rate": 9.528377857557687e-05, "loss": 0.0533, "step": 2176 }, { "epoch": 0.2799519910840585, "grad_norm": 0.197265625, "learning_rate": 9.527949266310596e-05, "loss": 0.0476, "step": 2177 }, { "epoch": 0.28008058639461614, "grad_norm": 0.171875, "learning_rate": 9.52752049005679e-05, "loss": 0.0592, "step": 2178 }, { "epoch": 0.28020918170517384, "grad_norm": 0.2255859375, "learning_rate": 9.527091528813793e-05, "loss": 0.0579, "step": 2179 }, { "epoch": 0.2803377770157315, "grad_norm": 0.1884765625, "learning_rate": 9.526662382599128e-05, "loss": 0.0522, "step": 2180 }, { "epoch": 0.2804663723262892, "grad_norm": 0.203125, "learning_rate": 9.526233051430331e-05, "loss": 0.0569, "step": 2181 }, { "epoch": 0.2805949676368468, "grad_norm": 0.1650390625, "learning_rate": 9.525803535324944e-05, "loss": 0.0387, "step": 2182 }, { "epoch": 0.2807235629474045, "grad_norm": 0.193359375, "learning_rate": 9.525373834300515e-05, "loss": 0.0569, "step": 2183 }, { "epoch": 0.28085215825796217, "grad_norm": 0.1845703125, "learning_rate": 9.524943948374604e-05, "loss": 0.0406, "step": 2184 }, { "epoch": 0.28098075356851987, "grad_norm": 0.1796875, "learning_rate": 9.524513877564774e-05, "loss": 0.0511, "step": 2185 }, { "epoch": 0.28110934887907757, "grad_norm": 0.18359375, "learning_rate": 9.524083621888595e-05, "loss": 0.0387, "step": 2186 }, { "epoch": 0.2812379441896352, "grad_norm": 0.21484375, "learning_rate": 9.52365318136365e-05, "loss": 0.0585, "step": 2187 }, { "epoch": 0.2813665395001929, "grad_norm": 0.193359375, "learning_rate": 9.523222556007526e-05, "loss": 0.0489, "step": 2188 }, { "epoch": 0.28149513481075056, "grad_norm": 0.1875, "learning_rate": 9.522791745837815e-05, "loss": 0.0462, "step": 2189 }, { "epoch": 0.28162373012130826, "grad_norm": 0.2001953125, "learning_rate": 9.522360750872123e-05, "loss": 0.0533, "step": 2190 }, { "epoch": 0.2817523254318659, "grad_norm": 0.208984375, "learning_rate": 9.521929571128059e-05, "loss": 0.0599, "step": 2191 }, { "epoch": 0.2818809207424236, "grad_norm": 0.2138671875, "learning_rate": 9.521498206623239e-05, "loss": 0.0641, "step": 2192 }, { "epoch": 0.28200951605298125, "grad_norm": 0.212890625, "learning_rate": 9.521066657375288e-05, "loss": 0.0577, "step": 2193 }, { "epoch": 0.28213811136353895, "grad_norm": 0.2001953125, "learning_rate": 9.520634923401841e-05, "loss": 0.0551, "step": 2194 }, { "epoch": 0.2822667066740966, "grad_norm": 0.1982421875, "learning_rate": 9.520203004720537e-05, "loss": 0.0577, "step": 2195 }, { "epoch": 0.2823953019846543, "grad_norm": 0.197265625, "learning_rate": 9.519770901349022e-05, "loss": 0.0583, "step": 2196 }, { "epoch": 0.282523897295212, "grad_norm": 0.1865234375, "learning_rate": 9.519338613304952e-05, "loss": 0.0547, "step": 2197 }, { "epoch": 0.28265249260576963, "grad_norm": 0.1943359375, "learning_rate": 9.518906140605991e-05, "loss": 0.0474, "step": 2198 }, { "epoch": 0.28278108791632733, "grad_norm": 0.177734375, "learning_rate": 9.51847348326981e-05, "loss": 0.0488, "step": 2199 }, { "epoch": 0.282909683226885, "grad_norm": 0.1923828125, "learning_rate": 9.518040641314083e-05, "loss": 0.0523, "step": 2200 }, { "epoch": 0.2830382785374427, "grad_norm": 0.2080078125, "learning_rate": 9.5176076147565e-05, "loss": 0.0589, "step": 2201 }, { "epoch": 0.2831668738480003, "grad_norm": 0.205078125, "learning_rate": 9.517174403614751e-05, "loss": 0.0558, "step": 2202 }, { "epoch": 0.283295469158558, "grad_norm": 0.2099609375, "learning_rate": 9.516741007906538e-05, "loss": 0.0584, "step": 2203 }, { "epoch": 0.28342406446911567, "grad_norm": 0.1904296875, "learning_rate": 9.516307427649569e-05, "loss": 0.0565, "step": 2204 }, { "epoch": 0.28355265977967337, "grad_norm": 0.2255859375, "learning_rate": 9.515873662861558e-05, "loss": 0.0489, "step": 2205 }, { "epoch": 0.28368125509023107, "grad_norm": 0.185546875, "learning_rate": 9.515439713560231e-05, "loss": 0.0474, "step": 2206 }, { "epoch": 0.2838098504007887, "grad_norm": 0.1962890625, "learning_rate": 9.515005579763314e-05, "loss": 0.0479, "step": 2207 }, { "epoch": 0.2839384457113464, "grad_norm": 0.2236328125, "learning_rate": 9.51457126148855e-05, "loss": 0.0586, "step": 2208 }, { "epoch": 0.28406704102190405, "grad_norm": 0.19921875, "learning_rate": 9.514136758753683e-05, "loss": 0.0552, "step": 2209 }, { "epoch": 0.28419563633246175, "grad_norm": 0.234375, "learning_rate": 9.513702071576464e-05, "loss": 0.0633, "step": 2210 }, { "epoch": 0.2843242316430194, "grad_norm": 0.2109375, "learning_rate": 9.513267199974658e-05, "loss": 0.0625, "step": 2211 }, { "epoch": 0.2844528269535771, "grad_norm": 0.1796875, "learning_rate": 9.512832143966029e-05, "loss": 0.0478, "step": 2212 }, { "epoch": 0.28458142226413474, "grad_norm": 0.212890625, "learning_rate": 9.512396903568357e-05, "loss": 0.0608, "step": 2213 }, { "epoch": 0.28471001757469244, "grad_norm": 0.2080078125, "learning_rate": 9.511961478799423e-05, "loss": 0.0632, "step": 2214 }, { "epoch": 0.28483861288525014, "grad_norm": 0.2041015625, "learning_rate": 9.511525869677018e-05, "loss": 0.0573, "step": 2215 }, { "epoch": 0.2849672081958078, "grad_norm": 0.1875, "learning_rate": 9.511090076218942e-05, "loss": 0.0591, "step": 2216 }, { "epoch": 0.2850958035063655, "grad_norm": 0.1728515625, "learning_rate": 9.510654098442999e-05, "loss": 0.0469, "step": 2217 }, { "epoch": 0.28522439881692313, "grad_norm": 0.1953125, "learning_rate": 9.510217936367005e-05, "loss": 0.0493, "step": 2218 }, { "epoch": 0.28535299412748083, "grad_norm": 0.2080078125, "learning_rate": 9.509781590008777e-05, "loss": 0.0538, "step": 2219 }, { "epoch": 0.2854815894380385, "grad_norm": 0.1796875, "learning_rate": 9.509345059386148e-05, "loss": 0.0506, "step": 2220 }, { "epoch": 0.2856101847485962, "grad_norm": 0.21484375, "learning_rate": 9.508908344516951e-05, "loss": 0.0599, "step": 2221 }, { "epoch": 0.2857387800591538, "grad_norm": 0.1923828125, "learning_rate": 9.508471445419032e-05, "loss": 0.065, "step": 2222 }, { "epoch": 0.2858673753697115, "grad_norm": 0.2216796875, "learning_rate": 9.508034362110242e-05, "loss": 0.0636, "step": 2223 }, { "epoch": 0.2859959706802692, "grad_norm": 0.171875, "learning_rate": 9.507597094608438e-05, "loss": 0.0395, "step": 2224 }, { "epoch": 0.28612456599082686, "grad_norm": 0.1845703125, "learning_rate": 9.507159642931487e-05, "loss": 0.0513, "step": 2225 }, { "epoch": 0.28625316130138456, "grad_norm": 0.1904296875, "learning_rate": 9.506722007097262e-05, "loss": 0.0489, "step": 2226 }, { "epoch": 0.2863817566119422, "grad_norm": 0.189453125, "learning_rate": 9.506284187123647e-05, "loss": 0.0495, "step": 2227 }, { "epoch": 0.2865103519224999, "grad_norm": 0.1796875, "learning_rate": 9.505846183028526e-05, "loss": 0.0508, "step": 2228 }, { "epoch": 0.28663894723305755, "grad_norm": 0.2451171875, "learning_rate": 9.5054079948298e-05, "loss": 0.0686, "step": 2229 }, { "epoch": 0.28676754254361525, "grad_norm": 0.185546875, "learning_rate": 9.504969622545372e-05, "loss": 0.0487, "step": 2230 }, { "epoch": 0.2868961378541729, "grad_norm": 0.19921875, "learning_rate": 9.504531066193152e-05, "loss": 0.0571, "step": 2231 }, { "epoch": 0.2870247331647306, "grad_norm": 0.208984375, "learning_rate": 9.504092325791059e-05, "loss": 0.0557, "step": 2232 }, { "epoch": 0.2871533284752883, "grad_norm": 0.236328125, "learning_rate": 9.50365340135702e-05, "loss": 0.064, "step": 2233 }, { "epoch": 0.28728192378584594, "grad_norm": 0.1845703125, "learning_rate": 9.503214292908969e-05, "loss": 0.0517, "step": 2234 }, { "epoch": 0.28741051909640364, "grad_norm": 0.2001953125, "learning_rate": 9.502775000464847e-05, "loss": 0.0462, "step": 2235 }, { "epoch": 0.2875391144069613, "grad_norm": 0.189453125, "learning_rate": 9.502335524042603e-05, "loss": 0.0612, "step": 2236 }, { "epoch": 0.287667709717519, "grad_norm": 0.185546875, "learning_rate": 9.501895863660193e-05, "loss": 0.0507, "step": 2237 }, { "epoch": 0.2877963050280766, "grad_norm": 0.244140625, "learning_rate": 9.501456019335584e-05, "loss": 0.0683, "step": 2238 }, { "epoch": 0.2879249003386343, "grad_norm": 0.2158203125, "learning_rate": 9.501015991086744e-05, "loss": 0.058, "step": 2239 }, { "epoch": 0.28805349564919197, "grad_norm": 0.2109375, "learning_rate": 9.500575778931653e-05, "loss": 0.0519, "step": 2240 }, { "epoch": 0.28818209095974967, "grad_norm": 0.2001953125, "learning_rate": 9.500135382888298e-05, "loss": 0.057, "step": 2241 }, { "epoch": 0.28831068627030737, "grad_norm": 0.197265625, "learning_rate": 9.499694802974673e-05, "loss": 0.0529, "step": 2242 }, { "epoch": 0.288439281580865, "grad_norm": 0.216796875, "learning_rate": 9.49925403920878e-05, "loss": 0.0587, "step": 2243 }, { "epoch": 0.2885678768914227, "grad_norm": 0.1904296875, "learning_rate": 9.498813091608627e-05, "loss": 0.0516, "step": 2244 }, { "epoch": 0.28869647220198036, "grad_norm": 0.2080078125, "learning_rate": 9.49837196019223e-05, "loss": 0.0579, "step": 2245 }, { "epoch": 0.28882506751253806, "grad_norm": 0.201171875, "learning_rate": 9.497930644977616e-05, "loss": 0.055, "step": 2246 }, { "epoch": 0.2889536628230957, "grad_norm": 0.162109375, "learning_rate": 9.497489145982815e-05, "loss": 0.0448, "step": 2247 }, { "epoch": 0.2890822581336534, "grad_norm": 0.17578125, "learning_rate": 9.497047463225866e-05, "loss": 0.0505, "step": 2248 }, { "epoch": 0.28921085344421105, "grad_norm": 0.1845703125, "learning_rate": 9.496605596724814e-05, "loss": 0.0523, "step": 2249 }, { "epoch": 0.28933944875476875, "grad_norm": 0.201171875, "learning_rate": 9.496163546497716e-05, "loss": 0.0636, "step": 2250 }, { "epoch": 0.2894680440653264, "grad_norm": 0.2109375, "learning_rate": 9.495721312562634e-05, "loss": 0.0675, "step": 2251 }, { "epoch": 0.2895966393758841, "grad_norm": 0.19921875, "learning_rate": 9.495278894937632e-05, "loss": 0.0625, "step": 2252 }, { "epoch": 0.2897252346864418, "grad_norm": 0.2109375, "learning_rate": 9.494836293640794e-05, "loss": 0.0515, "step": 2253 }, { "epoch": 0.28985382999699943, "grad_norm": 0.216796875, "learning_rate": 9.494393508690198e-05, "loss": 0.0647, "step": 2254 }, { "epoch": 0.28998242530755713, "grad_norm": 0.2001953125, "learning_rate": 9.49395054010394e-05, "loss": 0.0547, "step": 2255 }, { "epoch": 0.2901110206181148, "grad_norm": 0.216796875, "learning_rate": 9.493507387900115e-05, "loss": 0.0517, "step": 2256 }, { "epoch": 0.2902396159286725, "grad_norm": 0.1904296875, "learning_rate": 9.493064052096834e-05, "loss": 0.0549, "step": 2257 }, { "epoch": 0.2903682112392301, "grad_norm": 0.2099609375, "learning_rate": 9.492620532712209e-05, "loss": 0.0509, "step": 2258 }, { "epoch": 0.2904968065497878, "grad_norm": 0.18359375, "learning_rate": 9.492176829764361e-05, "loss": 0.0437, "step": 2259 }, { "epoch": 0.29062540186034547, "grad_norm": 0.2001953125, "learning_rate": 9.491732943271421e-05, "loss": 0.0624, "step": 2260 }, { "epoch": 0.29075399717090317, "grad_norm": 0.2138671875, "learning_rate": 9.491288873251525e-05, "loss": 0.0644, "step": 2261 }, { "epoch": 0.29088259248146087, "grad_norm": 0.203125, "learning_rate": 9.490844619722817e-05, "loss": 0.0532, "step": 2262 }, { "epoch": 0.2910111877920185, "grad_norm": 0.1982421875, "learning_rate": 9.490400182703447e-05, "loss": 0.0547, "step": 2263 }, { "epoch": 0.2911397831025762, "grad_norm": 0.2177734375, "learning_rate": 9.489955562211577e-05, "loss": 0.057, "step": 2264 }, { "epoch": 0.29126837841313385, "grad_norm": 0.18359375, "learning_rate": 9.489510758265371e-05, "loss": 0.0527, "step": 2265 }, { "epoch": 0.29139697372369155, "grad_norm": 0.205078125, "learning_rate": 9.489065770883007e-05, "loss": 0.0576, "step": 2266 }, { "epoch": 0.2915255690342492, "grad_norm": 0.1826171875, "learning_rate": 9.488620600082664e-05, "loss": 0.0491, "step": 2267 }, { "epoch": 0.2916541643448069, "grad_norm": 0.193359375, "learning_rate": 9.488175245882531e-05, "loss": 0.0496, "step": 2268 }, { "epoch": 0.29178275965536454, "grad_norm": 0.2001953125, "learning_rate": 9.487729708300805e-05, "loss": 0.0526, "step": 2269 }, { "epoch": 0.29191135496592224, "grad_norm": 0.1923828125, "learning_rate": 9.48728398735569e-05, "loss": 0.0538, "step": 2270 }, { "epoch": 0.29203995027647994, "grad_norm": 0.185546875, "learning_rate": 9.486838083065397e-05, "loss": 0.0474, "step": 2271 }, { "epoch": 0.2921685455870376, "grad_norm": 0.1806640625, "learning_rate": 9.486391995448148e-05, "loss": 0.046, "step": 2272 }, { "epoch": 0.2922971408975953, "grad_norm": 0.193359375, "learning_rate": 9.485945724522166e-05, "loss": 0.0556, "step": 2273 }, { "epoch": 0.29242573620815293, "grad_norm": 0.2158203125, "learning_rate": 9.485499270305688e-05, "loss": 0.0553, "step": 2274 }, { "epoch": 0.29255433151871063, "grad_norm": 0.1748046875, "learning_rate": 9.485052632816953e-05, "loss": 0.0442, "step": 2275 }, { "epoch": 0.2926829268292683, "grad_norm": 0.1953125, "learning_rate": 9.484605812074213e-05, "loss": 0.0565, "step": 2276 }, { "epoch": 0.292811522139826, "grad_norm": 0.19140625, "learning_rate": 9.484158808095722e-05, "loss": 0.0515, "step": 2277 }, { "epoch": 0.2929401174503836, "grad_norm": 0.19921875, "learning_rate": 9.483711620899745e-05, "loss": 0.0525, "step": 2278 }, { "epoch": 0.2930687127609413, "grad_norm": 0.2001953125, "learning_rate": 9.483264250504555e-05, "loss": 0.0502, "step": 2279 }, { "epoch": 0.293197308071499, "grad_norm": 0.1904296875, "learning_rate": 9.482816696928429e-05, "loss": 0.0454, "step": 2280 }, { "epoch": 0.29332590338205666, "grad_norm": 0.2080078125, "learning_rate": 9.482368960189652e-05, "loss": 0.0599, "step": 2281 }, { "epoch": 0.29345449869261436, "grad_norm": 0.2109375, "learning_rate": 9.481921040306522e-05, "loss": 0.0642, "step": 2282 }, { "epoch": 0.293583094003172, "grad_norm": 0.2041015625, "learning_rate": 9.48147293729734e-05, "loss": 0.0629, "step": 2283 }, { "epoch": 0.2937116893137297, "grad_norm": 0.201171875, "learning_rate": 9.481024651180412e-05, "loss": 0.0488, "step": 2284 }, { "epoch": 0.29384028462428735, "grad_norm": 0.1787109375, "learning_rate": 9.480576181974056e-05, "loss": 0.052, "step": 2285 }, { "epoch": 0.29396887993484505, "grad_norm": 0.1787109375, "learning_rate": 9.480127529696596e-05, "loss": 0.0518, "step": 2286 }, { "epoch": 0.2940974752454027, "grad_norm": 0.205078125, "learning_rate": 9.479678694366363e-05, "loss": 0.0589, "step": 2287 }, { "epoch": 0.2942260705559604, "grad_norm": 0.306640625, "learning_rate": 9.479229676001697e-05, "loss": 0.0568, "step": 2288 }, { "epoch": 0.2943546658665181, "grad_norm": 0.185546875, "learning_rate": 9.478780474620943e-05, "loss": 0.051, "step": 2289 }, { "epoch": 0.29448326117707574, "grad_norm": 0.2138671875, "learning_rate": 9.478331090242456e-05, "loss": 0.0592, "step": 2290 }, { "epoch": 0.29461185648763344, "grad_norm": 0.193359375, "learning_rate": 9.477881522884597e-05, "loss": 0.0553, "step": 2291 }, { "epoch": 0.2947404517981911, "grad_norm": 0.1943359375, "learning_rate": 9.477431772565735e-05, "loss": 0.0513, "step": 2292 }, { "epoch": 0.2948690471087488, "grad_norm": 0.1962890625, "learning_rate": 9.476981839304245e-05, "loss": 0.0535, "step": 2293 }, { "epoch": 0.2949976424193064, "grad_norm": 0.1962890625, "learning_rate": 9.476531723118512e-05, "loss": 0.0541, "step": 2294 }, { "epoch": 0.2951262377298641, "grad_norm": 0.1845703125, "learning_rate": 9.476081424026926e-05, "loss": 0.0499, "step": 2295 }, { "epoch": 0.29525483304042177, "grad_norm": 0.23046875, "learning_rate": 9.475630942047889e-05, "loss": 0.0665, "step": 2296 }, { "epoch": 0.29538342835097947, "grad_norm": 0.2021484375, "learning_rate": 9.475180277199802e-05, "loss": 0.0507, "step": 2297 }, { "epoch": 0.2955120236615371, "grad_norm": 0.20703125, "learning_rate": 9.474729429501084e-05, "loss": 0.056, "step": 2298 }, { "epoch": 0.2956406189720948, "grad_norm": 0.2060546875, "learning_rate": 9.474278398970151e-05, "loss": 0.057, "step": 2299 }, { "epoch": 0.2957692142826525, "grad_norm": 0.1943359375, "learning_rate": 9.473827185625435e-05, "loss": 0.0514, "step": 2300 }, { "epoch": 0.29589780959321016, "grad_norm": 0.19140625, "learning_rate": 9.47337578948537e-05, "loss": 0.0533, "step": 2301 }, { "epoch": 0.29602640490376786, "grad_norm": 0.2138671875, "learning_rate": 9.472924210568402e-05, "loss": 0.0621, "step": 2302 }, { "epoch": 0.2961550002143255, "grad_norm": 0.2294921875, "learning_rate": 9.47247244889298e-05, "loss": 0.0601, "step": 2303 }, { "epoch": 0.2962835955248832, "grad_norm": 0.1904296875, "learning_rate": 9.472020504477563e-05, "loss": 0.0515, "step": 2304 }, { "epoch": 0.29641219083544085, "grad_norm": 0.2060546875, "learning_rate": 9.471568377340617e-05, "loss": 0.0647, "step": 2305 }, { "epoch": 0.29654078614599855, "grad_norm": 0.1953125, "learning_rate": 9.471116067500616e-05, "loss": 0.052, "step": 2306 }, { "epoch": 0.2966693814565562, "grad_norm": 0.22265625, "learning_rate": 9.47066357497604e-05, "loss": 0.061, "step": 2307 }, { "epoch": 0.2967979767671139, "grad_norm": 0.1982421875, "learning_rate": 9.470210899785377e-05, "loss": 0.0502, "step": 2308 }, { "epoch": 0.2969265720776716, "grad_norm": 0.1943359375, "learning_rate": 9.469758041947124e-05, "loss": 0.0511, "step": 2309 }, { "epoch": 0.29705516738822924, "grad_norm": 0.185546875, "learning_rate": 9.469305001479783e-05, "loss": 0.0506, "step": 2310 }, { "epoch": 0.29718376269878694, "grad_norm": 0.181640625, "learning_rate": 9.468851778401864e-05, "loss": 0.0513, "step": 2311 }, { "epoch": 0.2973123580093446, "grad_norm": 0.1630859375, "learning_rate": 9.468398372731888e-05, "loss": 0.0404, "step": 2312 }, { "epoch": 0.2974409533199023, "grad_norm": 0.1875, "learning_rate": 9.467944784488378e-05, "loss": 0.0519, "step": 2313 }, { "epoch": 0.2975695486304599, "grad_norm": 0.1962890625, "learning_rate": 9.46749101368987e-05, "loss": 0.0481, "step": 2314 }, { "epoch": 0.2976981439410176, "grad_norm": 0.1962890625, "learning_rate": 9.467037060354902e-05, "loss": 0.0595, "step": 2315 }, { "epoch": 0.29782673925157527, "grad_norm": 0.1875, "learning_rate": 9.466582924502021e-05, "loss": 0.053, "step": 2316 }, { "epoch": 0.29795533456213297, "grad_norm": 0.2041015625, "learning_rate": 9.466128606149786e-05, "loss": 0.0552, "step": 2317 }, { "epoch": 0.29808392987269067, "grad_norm": 0.212890625, "learning_rate": 9.465674105316757e-05, "loss": 0.0566, "step": 2318 }, { "epoch": 0.2982125251832483, "grad_norm": 0.1806640625, "learning_rate": 9.465219422021505e-05, "loss": 0.0548, "step": 2319 }, { "epoch": 0.298341120493806, "grad_norm": 0.19140625, "learning_rate": 9.46476455628261e-05, "loss": 0.0516, "step": 2320 }, { "epoch": 0.29846971580436366, "grad_norm": 0.19921875, "learning_rate": 9.464309508118654e-05, "loss": 0.0487, "step": 2321 }, { "epoch": 0.29859831111492136, "grad_norm": 0.208984375, "learning_rate": 9.463854277548233e-05, "loss": 0.0557, "step": 2322 }, { "epoch": 0.298726906425479, "grad_norm": 0.205078125, "learning_rate": 9.463398864589943e-05, "loss": 0.0525, "step": 2323 }, { "epoch": 0.2988555017360367, "grad_norm": 0.2021484375, "learning_rate": 9.462943269262395e-05, "loss": 0.0585, "step": 2324 }, { "epoch": 0.29898409704659434, "grad_norm": 0.193359375, "learning_rate": 9.462487491584204e-05, "loss": 0.0527, "step": 2325 }, { "epoch": 0.29911269235715204, "grad_norm": 0.1806640625, "learning_rate": 9.462031531573992e-05, "loss": 0.0485, "step": 2326 }, { "epoch": 0.29924128766770974, "grad_norm": 0.2099609375, "learning_rate": 9.461575389250387e-05, "loss": 0.0658, "step": 2327 }, { "epoch": 0.2993698829782674, "grad_norm": 0.1953125, "learning_rate": 9.46111906463203e-05, "loss": 0.0544, "step": 2328 }, { "epoch": 0.2994984782888251, "grad_norm": 0.21484375, "learning_rate": 9.460662557737564e-05, "loss": 0.0568, "step": 2329 }, { "epoch": 0.29962707359938273, "grad_norm": 0.1884765625, "learning_rate": 9.460205868585639e-05, "loss": 0.0554, "step": 2330 }, { "epoch": 0.29975566890994043, "grad_norm": 0.189453125, "learning_rate": 9.459748997194919e-05, "loss": 0.0547, "step": 2331 }, { "epoch": 0.2998842642204981, "grad_norm": 0.2177734375, "learning_rate": 9.459291943584069e-05, "loss": 0.0569, "step": 2332 }, { "epoch": 0.3000128595310558, "grad_norm": 0.1923828125, "learning_rate": 9.458834707771764e-05, "loss": 0.0526, "step": 2333 }, { "epoch": 0.3001414548416134, "grad_norm": 0.1728515625, "learning_rate": 9.458377289776686e-05, "loss": 0.0476, "step": 2334 }, { "epoch": 0.3002700501521711, "grad_norm": 0.197265625, "learning_rate": 9.457919689617524e-05, "loss": 0.0512, "step": 2335 }, { "epoch": 0.3003986454627288, "grad_norm": 0.19140625, "learning_rate": 9.457461907312979e-05, "loss": 0.0504, "step": 2336 }, { "epoch": 0.30052724077328646, "grad_norm": 0.220703125, "learning_rate": 9.457003942881751e-05, "loss": 0.0543, "step": 2337 }, { "epoch": 0.30065583608384416, "grad_norm": 0.189453125, "learning_rate": 9.456545796342551e-05, "loss": 0.0495, "step": 2338 }, { "epoch": 0.3007844313944018, "grad_norm": 0.203125, "learning_rate": 9.456087467714101e-05, "loss": 0.0585, "step": 2339 }, { "epoch": 0.3009130267049595, "grad_norm": 0.23828125, "learning_rate": 9.455628957015128e-05, "loss": 0.0712, "step": 2340 }, { "epoch": 0.30104162201551715, "grad_norm": 0.203125, "learning_rate": 9.455170264264366e-05, "loss": 0.0601, "step": 2341 }, { "epoch": 0.30117021732607485, "grad_norm": 0.1953125, "learning_rate": 9.454711389480555e-05, "loss": 0.0515, "step": 2342 }, { "epoch": 0.3012988126366325, "grad_norm": 0.2099609375, "learning_rate": 9.454252332682445e-05, "loss": 0.0568, "step": 2343 }, { "epoch": 0.3014274079471902, "grad_norm": 0.1875, "learning_rate": 9.453793093888794e-05, "loss": 0.0481, "step": 2344 }, { "epoch": 0.30155600325774784, "grad_norm": 0.2080078125, "learning_rate": 9.453333673118364e-05, "loss": 0.0587, "step": 2345 }, { "epoch": 0.30168459856830554, "grad_norm": 0.251953125, "learning_rate": 9.452874070389927e-05, "loss": 0.0726, "step": 2346 }, { "epoch": 0.30181319387886324, "grad_norm": 0.2177734375, "learning_rate": 9.452414285722262e-05, "loss": 0.0559, "step": 2347 }, { "epoch": 0.3019417891894209, "grad_norm": 0.220703125, "learning_rate": 9.451954319134155e-05, "loss": 0.0472, "step": 2348 }, { "epoch": 0.3020703844999786, "grad_norm": 0.1904296875, "learning_rate": 9.4514941706444e-05, "loss": 0.0507, "step": 2349 }, { "epoch": 0.30219897981053623, "grad_norm": 0.26171875, "learning_rate": 9.451033840271797e-05, "loss": 0.0625, "step": 2350 }, { "epoch": 0.3023275751210939, "grad_norm": 0.201171875, "learning_rate": 9.450573328035158e-05, "loss": 0.0477, "step": 2351 }, { "epoch": 0.30245617043165157, "grad_norm": 0.1728515625, "learning_rate": 9.450112633953295e-05, "loss": 0.0455, "step": 2352 }, { "epoch": 0.30258476574220927, "grad_norm": 0.2158203125, "learning_rate": 9.449651758045031e-05, "loss": 0.0433, "step": 2353 }, { "epoch": 0.3027133610527669, "grad_norm": 0.2099609375, "learning_rate": 9.4491907003292e-05, "loss": 0.0653, "step": 2354 }, { "epoch": 0.3028419563633246, "grad_norm": 0.2138671875, "learning_rate": 9.448729460824641e-05, "loss": 0.0539, "step": 2355 }, { "epoch": 0.3029705516738823, "grad_norm": 0.2109375, "learning_rate": 9.448268039550196e-05, "loss": 0.054, "step": 2356 }, { "epoch": 0.30309914698443996, "grad_norm": 0.193359375, "learning_rate": 9.44780643652472e-05, "loss": 0.0458, "step": 2357 }, { "epoch": 0.30322774229499766, "grad_norm": 0.1845703125, "learning_rate": 9.447344651767074e-05, "loss": 0.0559, "step": 2358 }, { "epoch": 0.3033563376055553, "grad_norm": 0.203125, "learning_rate": 9.446882685296125e-05, "loss": 0.0626, "step": 2359 }, { "epoch": 0.303484932916113, "grad_norm": 0.23046875, "learning_rate": 9.44642053713075e-05, "loss": 0.0673, "step": 2360 }, { "epoch": 0.30361352822667065, "grad_norm": 0.216796875, "learning_rate": 9.445958207289829e-05, "loss": 0.0552, "step": 2361 }, { "epoch": 0.30374212353722835, "grad_norm": 0.18359375, "learning_rate": 9.445495695792255e-05, "loss": 0.0381, "step": 2362 }, { "epoch": 0.303870718847786, "grad_norm": 0.197265625, "learning_rate": 9.445033002656924e-05, "loss": 0.0479, "step": 2363 }, { "epoch": 0.3039993141583437, "grad_norm": 0.17578125, "learning_rate": 9.444570127902744e-05, "loss": 0.0498, "step": 2364 }, { "epoch": 0.3041279094689014, "grad_norm": 0.2099609375, "learning_rate": 9.444107071548623e-05, "loss": 0.0674, "step": 2365 }, { "epoch": 0.30425650477945904, "grad_norm": 0.1865234375, "learning_rate": 9.443643833613482e-05, "loss": 0.0538, "step": 2366 }, { "epoch": 0.30438510009001674, "grad_norm": 0.19921875, "learning_rate": 9.443180414116252e-05, "loss": 0.0546, "step": 2367 }, { "epoch": 0.3045136954005744, "grad_norm": 0.189453125, "learning_rate": 9.442716813075865e-05, "loss": 0.0484, "step": 2368 }, { "epoch": 0.3046422907111321, "grad_norm": 0.19140625, "learning_rate": 9.442253030511262e-05, "loss": 0.0531, "step": 2369 }, { "epoch": 0.3047708860216897, "grad_norm": 0.1796875, "learning_rate": 9.441789066441395e-05, "loss": 0.0511, "step": 2370 }, { "epoch": 0.3048994813322474, "grad_norm": 0.18359375, "learning_rate": 9.441324920885221e-05, "loss": 0.0523, "step": 2371 }, { "epoch": 0.30502807664280507, "grad_norm": 0.2119140625, "learning_rate": 9.440860593861703e-05, "loss": 0.054, "step": 2372 }, { "epoch": 0.30515667195336277, "grad_norm": 0.2236328125, "learning_rate": 9.440396085389814e-05, "loss": 0.0716, "step": 2373 }, { "epoch": 0.30528526726392047, "grad_norm": 0.20703125, "learning_rate": 9.43993139548853e-05, "loss": 0.0501, "step": 2374 }, { "epoch": 0.3054138625744781, "grad_norm": 0.1962890625, "learning_rate": 9.439466524176843e-05, "loss": 0.0489, "step": 2375 }, { "epoch": 0.3055424578850358, "grad_norm": 0.1767578125, "learning_rate": 9.439001471473745e-05, "loss": 0.0472, "step": 2376 }, { "epoch": 0.30567105319559346, "grad_norm": 0.173828125, "learning_rate": 9.438536237398235e-05, "loss": 0.049, "step": 2377 }, { "epoch": 0.30579964850615116, "grad_norm": 0.1904296875, "learning_rate": 9.438070821969326e-05, "loss": 0.049, "step": 2378 }, { "epoch": 0.3059282438167088, "grad_norm": 0.193359375, "learning_rate": 9.437605225206032e-05, "loss": 0.0595, "step": 2379 }, { "epoch": 0.3060568391272665, "grad_norm": 0.2080078125, "learning_rate": 9.437139447127377e-05, "loss": 0.0558, "step": 2380 }, { "epoch": 0.30618543443782414, "grad_norm": 0.2080078125, "learning_rate": 9.43667348775239e-05, "loss": 0.048, "step": 2381 }, { "epoch": 0.30631402974838184, "grad_norm": 0.203125, "learning_rate": 9.436207347100115e-05, "loss": 0.0592, "step": 2382 }, { "epoch": 0.30644262505893954, "grad_norm": 0.1787109375, "learning_rate": 9.435741025189592e-05, "loss": 0.0497, "step": 2383 }, { "epoch": 0.3065712203694972, "grad_norm": 0.169921875, "learning_rate": 9.435274522039878e-05, "loss": 0.0468, "step": 2384 }, { "epoch": 0.3066998156800549, "grad_norm": 0.234375, "learning_rate": 9.434807837670034e-05, "loss": 0.0488, "step": 2385 }, { "epoch": 0.30682841099061253, "grad_norm": 0.2021484375, "learning_rate": 9.434340972099125e-05, "loss": 0.0581, "step": 2386 }, { "epoch": 0.30695700630117023, "grad_norm": 0.203125, "learning_rate": 9.433873925346231e-05, "loss": 0.0512, "step": 2387 }, { "epoch": 0.3070856016117279, "grad_norm": 0.197265625, "learning_rate": 9.43340669743043e-05, "loss": 0.048, "step": 2388 }, { "epoch": 0.3072141969222856, "grad_norm": 0.2060546875, "learning_rate": 9.432939288370816e-05, "loss": 0.0479, "step": 2389 }, { "epoch": 0.3073427922328432, "grad_norm": 0.2158203125, "learning_rate": 9.432471698186487e-05, "loss": 0.0669, "step": 2390 }, { "epoch": 0.3074713875434009, "grad_norm": 0.216796875, "learning_rate": 9.432003926896546e-05, "loss": 0.0636, "step": 2391 }, { "epoch": 0.3075999828539586, "grad_norm": 0.2060546875, "learning_rate": 9.431535974520106e-05, "loss": 0.0512, "step": 2392 }, { "epoch": 0.30772857816451626, "grad_norm": 0.1943359375, "learning_rate": 9.431067841076288e-05, "loss": 0.0452, "step": 2393 }, { "epoch": 0.30785717347507396, "grad_norm": 0.1962890625, "learning_rate": 9.430599526584218e-05, "loss": 0.0559, "step": 2394 }, { "epoch": 0.3079857687856316, "grad_norm": 0.2041015625, "learning_rate": 9.430131031063033e-05, "loss": 0.0597, "step": 2395 }, { "epoch": 0.3081143640961893, "grad_norm": 0.1875, "learning_rate": 9.429662354531874e-05, "loss": 0.0544, "step": 2396 }, { "epoch": 0.30824295940674695, "grad_norm": 0.1884765625, "learning_rate": 9.42919349700989e-05, "loss": 0.0548, "step": 2397 }, { "epoch": 0.30837155471730465, "grad_norm": 0.2236328125, "learning_rate": 9.428724458516239e-05, "loss": 0.0787, "step": 2398 }, { "epoch": 0.3085001500278623, "grad_norm": 0.181640625, "learning_rate": 9.428255239070082e-05, "loss": 0.0521, "step": 2399 }, { "epoch": 0.30862874533842, "grad_norm": 0.1826171875, "learning_rate": 9.427785838690597e-05, "loss": 0.0512, "step": 2400 }, { "epoch": 0.30875734064897764, "grad_norm": 0.1923828125, "learning_rate": 9.427316257396957e-05, "loss": 0.0565, "step": 2401 }, { "epoch": 0.30888593595953534, "grad_norm": 0.177734375, "learning_rate": 9.426846495208352e-05, "loss": 0.0442, "step": 2402 }, { "epoch": 0.30901453127009304, "grad_norm": 0.2080078125, "learning_rate": 9.426376552143977e-05, "loss": 0.0598, "step": 2403 }, { "epoch": 0.3091431265806507, "grad_norm": 0.18359375, "learning_rate": 9.42590642822303e-05, "loss": 0.0479, "step": 2404 }, { "epoch": 0.3092717218912084, "grad_norm": 0.19140625, "learning_rate": 9.425436123464721e-05, "loss": 0.055, "step": 2405 }, { "epoch": 0.30940031720176603, "grad_norm": 0.1875, "learning_rate": 9.424965637888268e-05, "loss": 0.0569, "step": 2406 }, { "epoch": 0.30952891251232373, "grad_norm": 0.1845703125, "learning_rate": 9.42449497151289e-05, "loss": 0.0428, "step": 2407 }, { "epoch": 0.3096575078228814, "grad_norm": 0.203125, "learning_rate": 9.424024124357822e-05, "loss": 0.0572, "step": 2408 }, { "epoch": 0.3097861031334391, "grad_norm": 0.201171875, "learning_rate": 9.423553096442302e-05, "loss": 0.0618, "step": 2409 }, { "epoch": 0.3099146984439967, "grad_norm": 0.185546875, "learning_rate": 9.423081887785574e-05, "loss": 0.0481, "step": 2410 }, { "epoch": 0.3100432937545544, "grad_norm": 0.185546875, "learning_rate": 9.422610498406891e-05, "loss": 0.0544, "step": 2411 }, { "epoch": 0.3101718890651121, "grad_norm": 0.1796875, "learning_rate": 9.422138928325514e-05, "loss": 0.0492, "step": 2412 }, { "epoch": 0.31030048437566976, "grad_norm": 0.1875, "learning_rate": 9.421667177560711e-05, "loss": 0.0575, "step": 2413 }, { "epoch": 0.31042907968622746, "grad_norm": 0.2138671875, "learning_rate": 9.421195246131758e-05, "loss": 0.0576, "step": 2414 }, { "epoch": 0.3105576749967851, "grad_norm": 0.2021484375, "learning_rate": 9.420723134057936e-05, "loss": 0.0531, "step": 2415 }, { "epoch": 0.3106862703073428, "grad_norm": 0.2138671875, "learning_rate": 9.420250841358535e-05, "loss": 0.0596, "step": 2416 }, { "epoch": 0.31081486561790045, "grad_norm": 0.1904296875, "learning_rate": 9.419778368052853e-05, "loss": 0.0521, "step": 2417 }, { "epoch": 0.31094346092845815, "grad_norm": 0.2119140625, "learning_rate": 9.419305714160195e-05, "loss": 0.0645, "step": 2418 }, { "epoch": 0.3110720562390158, "grad_norm": 0.1884765625, "learning_rate": 9.418832879699873e-05, "loss": 0.0563, "step": 2419 }, { "epoch": 0.3112006515495735, "grad_norm": 0.197265625, "learning_rate": 9.418359864691205e-05, "loss": 0.0532, "step": 2420 }, { "epoch": 0.3113292468601312, "grad_norm": 0.1630859375, "learning_rate": 9.41788666915352e-05, "loss": 0.0422, "step": 2421 }, { "epoch": 0.31145784217068884, "grad_norm": 0.2109375, "learning_rate": 9.41741329310615e-05, "loss": 0.0589, "step": 2422 }, { "epoch": 0.31158643748124654, "grad_norm": 0.169921875, "learning_rate": 9.416939736568438e-05, "loss": 0.0476, "step": 2423 }, { "epoch": 0.3117150327918042, "grad_norm": 0.2158203125, "learning_rate": 9.416465999559733e-05, "loss": 0.0717, "step": 2424 }, { "epoch": 0.3118436281023619, "grad_norm": 0.1875, "learning_rate": 9.415992082099392e-05, "loss": 0.0463, "step": 2425 }, { "epoch": 0.3119722234129195, "grad_norm": 0.1884765625, "learning_rate": 9.415517984206776e-05, "loss": 0.0471, "step": 2426 }, { "epoch": 0.3121008187234772, "grad_norm": 0.177734375, "learning_rate": 9.415043705901261e-05, "loss": 0.0519, "step": 2427 }, { "epoch": 0.31222941403403487, "grad_norm": 0.244140625, "learning_rate": 9.41456924720222e-05, "loss": 0.0547, "step": 2428 }, { "epoch": 0.31235800934459257, "grad_norm": 0.189453125, "learning_rate": 9.414094608129041e-05, "loss": 0.0503, "step": 2429 }, { "epoch": 0.31248660465515027, "grad_norm": 0.19921875, "learning_rate": 9.413619788701117e-05, "loss": 0.0648, "step": 2430 }, { "epoch": 0.3126151999657079, "grad_norm": 0.1943359375, "learning_rate": 9.413144788937851e-05, "loss": 0.0551, "step": 2431 }, { "epoch": 0.3127437952762656, "grad_norm": 0.1943359375, "learning_rate": 9.412669608858647e-05, "loss": 0.0494, "step": 2432 }, { "epoch": 0.31287239058682326, "grad_norm": 0.169921875, "learning_rate": 9.412194248482922e-05, "loss": 0.0428, "step": 2433 }, { "epoch": 0.31300098589738096, "grad_norm": 0.220703125, "learning_rate": 9.4117187078301e-05, "loss": 0.0689, "step": 2434 }, { "epoch": 0.3131295812079386, "grad_norm": 0.185546875, "learning_rate": 9.41124298691961e-05, "loss": 0.0516, "step": 2435 }, { "epoch": 0.3132581765184963, "grad_norm": 0.181640625, "learning_rate": 9.410767085770889e-05, "loss": 0.0534, "step": 2436 }, { "epoch": 0.31338677182905395, "grad_norm": 0.205078125, "learning_rate": 9.410291004403382e-05, "loss": 0.0538, "step": 2437 }, { "epoch": 0.31351536713961164, "grad_norm": 0.16796875, "learning_rate": 9.40981474283654e-05, "loss": 0.0449, "step": 2438 }, { "epoch": 0.31364396245016934, "grad_norm": 0.1787109375, "learning_rate": 9.409338301089825e-05, "loss": 0.0397, "step": 2439 }, { "epoch": 0.313772557760727, "grad_norm": 0.1767578125, "learning_rate": 9.408861679182703e-05, "loss": 0.0454, "step": 2440 }, { "epoch": 0.3139011530712847, "grad_norm": 0.193359375, "learning_rate": 9.408384877134646e-05, "loss": 0.0582, "step": 2441 }, { "epoch": 0.31402974838184233, "grad_norm": 0.2080078125, "learning_rate": 9.407907894965137e-05, "loss": 0.0551, "step": 2442 }, { "epoch": 0.31415834369240003, "grad_norm": 0.212890625, "learning_rate": 9.407430732693667e-05, "loss": 0.0562, "step": 2443 }, { "epoch": 0.3142869390029577, "grad_norm": 0.1884765625, "learning_rate": 9.40695339033973e-05, "loss": 0.0529, "step": 2444 }, { "epoch": 0.3144155343135154, "grad_norm": 0.205078125, "learning_rate": 9.40647586792283e-05, "loss": 0.0446, "step": 2445 }, { "epoch": 0.314544129624073, "grad_norm": 0.2021484375, "learning_rate": 9.405998165462479e-05, "loss": 0.0493, "step": 2446 }, { "epoch": 0.3146727249346307, "grad_norm": 0.193359375, "learning_rate": 9.405520282978193e-05, "loss": 0.0473, "step": 2447 }, { "epoch": 0.31480132024518837, "grad_norm": 0.18359375, "learning_rate": 9.405042220489501e-05, "loss": 0.0486, "step": 2448 }, { "epoch": 0.31492991555574606, "grad_norm": 0.1904296875, "learning_rate": 9.404563978015934e-05, "loss": 0.0476, "step": 2449 }, { "epoch": 0.31505851086630376, "grad_norm": 0.224609375, "learning_rate": 9.404085555577031e-05, "loss": 0.0662, "step": 2450 }, { "epoch": 0.3151871061768614, "grad_norm": 0.1806640625, "learning_rate": 9.403606953192344e-05, "loss": 0.0493, "step": 2451 }, { "epoch": 0.3153157014874191, "grad_norm": 0.1962890625, "learning_rate": 9.403128170881423e-05, "loss": 0.0602, "step": 2452 }, { "epoch": 0.31544429679797675, "grad_norm": 0.2060546875, "learning_rate": 9.402649208663835e-05, "loss": 0.0529, "step": 2453 }, { "epoch": 0.31557289210853445, "grad_norm": 0.2060546875, "learning_rate": 9.402170066559148e-05, "loss": 0.0519, "step": 2454 }, { "epoch": 0.3157014874190921, "grad_norm": 0.22265625, "learning_rate": 9.401690744586937e-05, "loss": 0.0499, "step": 2455 }, { "epoch": 0.3158300827296498, "grad_norm": 0.1728515625, "learning_rate": 9.401211242766792e-05, "loss": 0.0494, "step": 2456 }, { "epoch": 0.31595867804020744, "grad_norm": 0.2041015625, "learning_rate": 9.400731561118299e-05, "loss": 0.0616, "step": 2457 }, { "epoch": 0.31608727335076514, "grad_norm": 0.2001953125, "learning_rate": 9.400251699661063e-05, "loss": 0.0587, "step": 2458 }, { "epoch": 0.31621586866132284, "grad_norm": 0.181640625, "learning_rate": 9.399771658414684e-05, "loss": 0.0412, "step": 2459 }, { "epoch": 0.3163444639718805, "grad_norm": 0.22265625, "learning_rate": 9.399291437398781e-05, "loss": 0.0661, "step": 2460 }, { "epoch": 0.3164730592824382, "grad_norm": 0.21875, "learning_rate": 9.398811036632973e-05, "loss": 0.0551, "step": 2461 }, { "epoch": 0.31660165459299583, "grad_norm": 0.2138671875, "learning_rate": 9.398330456136889e-05, "loss": 0.0546, "step": 2462 }, { "epoch": 0.31673024990355353, "grad_norm": 0.2001953125, "learning_rate": 9.397849695930166e-05, "loss": 0.0511, "step": 2463 }, { "epoch": 0.3168588452141112, "grad_norm": 0.19140625, "learning_rate": 9.397368756032445e-05, "loss": 0.0554, "step": 2464 }, { "epoch": 0.3169874405246689, "grad_norm": 0.2138671875, "learning_rate": 9.396887636463381e-05, "loss": 0.0603, "step": 2465 }, { "epoch": 0.3171160358352265, "grad_norm": 0.2236328125, "learning_rate": 9.396406337242627e-05, "loss": 0.049, "step": 2466 }, { "epoch": 0.3172446311457842, "grad_norm": 0.216796875, "learning_rate": 9.39592485838985e-05, "loss": 0.0536, "step": 2467 }, { "epoch": 0.3173732264563419, "grad_norm": 0.2099609375, "learning_rate": 9.395443199924724e-05, "loss": 0.0523, "step": 2468 }, { "epoch": 0.31750182176689956, "grad_norm": 0.2041015625, "learning_rate": 9.394961361866927e-05, "loss": 0.0531, "step": 2469 }, { "epoch": 0.31763041707745726, "grad_norm": 0.177734375, "learning_rate": 9.394479344236149e-05, "loss": 0.048, "step": 2470 }, { "epoch": 0.3177590123880149, "grad_norm": 0.232421875, "learning_rate": 9.393997147052083e-05, "loss": 0.0631, "step": 2471 }, { "epoch": 0.3178876076985726, "grad_norm": 0.2021484375, "learning_rate": 9.393514770334431e-05, "loss": 0.0417, "step": 2472 }, { "epoch": 0.31801620300913025, "grad_norm": 0.2001953125, "learning_rate": 9.393032214102901e-05, "loss": 0.048, "step": 2473 }, { "epoch": 0.31814479831968795, "grad_norm": 0.1787109375, "learning_rate": 9.392549478377213e-05, "loss": 0.0456, "step": 2474 }, { "epoch": 0.3182733936302456, "grad_norm": 0.1923828125, "learning_rate": 9.392066563177089e-05, "loss": 0.0564, "step": 2475 }, { "epoch": 0.3184019889408033, "grad_norm": 0.1904296875, "learning_rate": 9.391583468522259e-05, "loss": 0.0572, "step": 2476 }, { "epoch": 0.318530584251361, "grad_norm": 0.21484375, "learning_rate": 9.391100194432465e-05, "loss": 0.0657, "step": 2477 }, { "epoch": 0.31865917956191864, "grad_norm": 0.197265625, "learning_rate": 9.390616740927449e-05, "loss": 0.0512, "step": 2478 }, { "epoch": 0.31878777487247634, "grad_norm": 0.1806640625, "learning_rate": 9.390133108026968e-05, "loss": 0.0504, "step": 2479 }, { "epoch": 0.318916370183034, "grad_norm": 0.1728515625, "learning_rate": 9.389649295750782e-05, "loss": 0.0484, "step": 2480 }, { "epoch": 0.3190449654935917, "grad_norm": 0.1865234375, "learning_rate": 9.389165304118657e-05, "loss": 0.0484, "step": 2481 }, { "epoch": 0.3191735608041493, "grad_norm": 0.1787109375, "learning_rate": 9.38868113315037e-05, "loss": 0.0519, "step": 2482 }, { "epoch": 0.319302156114707, "grad_norm": 0.193359375, "learning_rate": 9.388196782865704e-05, "loss": 0.0545, "step": 2483 }, { "epoch": 0.31943075142526467, "grad_norm": 0.2265625, "learning_rate": 9.387712253284446e-05, "loss": 0.0626, "step": 2484 }, { "epoch": 0.31955934673582237, "grad_norm": 0.2080078125, "learning_rate": 9.387227544426398e-05, "loss": 0.0564, "step": 2485 }, { "epoch": 0.31968794204638007, "grad_norm": 0.19140625, "learning_rate": 9.386742656311361e-05, "loss": 0.0509, "step": 2486 }, { "epoch": 0.3198165373569377, "grad_norm": 0.189453125, "learning_rate": 9.386257588959148e-05, "loss": 0.0579, "step": 2487 }, { "epoch": 0.3199451326674954, "grad_norm": 0.19921875, "learning_rate": 9.385772342389579e-05, "loss": 0.0605, "step": 2488 }, { "epoch": 0.32007372797805306, "grad_norm": 0.1962890625, "learning_rate": 9.38528691662248e-05, "loss": 0.0527, "step": 2489 }, { "epoch": 0.32020232328861076, "grad_norm": 0.21875, "learning_rate": 9.384801311677684e-05, "loss": 0.0641, "step": 2490 }, { "epoch": 0.3203309185991684, "grad_norm": 0.2177734375, "learning_rate": 9.384315527575034e-05, "loss": 0.0551, "step": 2491 }, { "epoch": 0.3204595139097261, "grad_norm": 0.1708984375, "learning_rate": 9.383829564334378e-05, "loss": 0.0457, "step": 2492 }, { "epoch": 0.32058810922028375, "grad_norm": 0.2041015625, "learning_rate": 9.383343421975571e-05, "loss": 0.061, "step": 2493 }, { "epoch": 0.32071670453084145, "grad_norm": 0.203125, "learning_rate": 9.382857100518477e-05, "loss": 0.0547, "step": 2494 }, { "epoch": 0.3208452998413991, "grad_norm": 0.185546875, "learning_rate": 9.382370599982967e-05, "loss": 0.0459, "step": 2495 }, { "epoch": 0.3209738951519568, "grad_norm": 0.2001953125, "learning_rate": 9.381883920388918e-05, "loss": 0.0527, "step": 2496 }, { "epoch": 0.3211024904625145, "grad_norm": 0.1904296875, "learning_rate": 9.381397061756214e-05, "loss": 0.0485, "step": 2497 }, { "epoch": 0.32123108577307213, "grad_norm": 0.2177734375, "learning_rate": 9.380910024104751e-05, "loss": 0.0536, "step": 2498 }, { "epoch": 0.32135968108362983, "grad_norm": 0.181640625, "learning_rate": 9.380422807454425e-05, "loss": 0.0456, "step": 2499 }, { "epoch": 0.3214882763941875, "grad_norm": 0.1884765625, "learning_rate": 9.379935411825147e-05, "loss": 0.0533, "step": 2500 }, { "epoch": 0.3214882763941875, "eval_loss": 0.05211152881383896, "eval_runtime": 1046.9727, "eval_samples_per_second": 93.819, "eval_steps_per_second": 1.173, "step": 2500 }, { "epoch": 0.3216168717047452, "grad_norm": 0.1904296875, "learning_rate": 9.379447837236828e-05, "loss": 0.0624, "step": 2501 }, { "epoch": 0.3217454670153028, "grad_norm": 0.185546875, "learning_rate": 9.37896008370939e-05, "loss": 0.0567, "step": 2502 }, { "epoch": 0.3218740623258605, "grad_norm": 0.1962890625, "learning_rate": 9.378472151262762e-05, "loss": 0.0549, "step": 2503 }, { "epoch": 0.32200265763641817, "grad_norm": 0.224609375, "learning_rate": 9.377984039916883e-05, "loss": 0.0601, "step": 2504 }, { "epoch": 0.32213125294697587, "grad_norm": 0.173828125, "learning_rate": 9.377495749691696e-05, "loss": 0.0498, "step": 2505 }, { "epoch": 0.32225984825753357, "grad_norm": 0.1708984375, "learning_rate": 9.377007280607149e-05, "loss": 0.0435, "step": 2506 }, { "epoch": 0.3223884435680912, "grad_norm": 0.1962890625, "learning_rate": 9.376518632683204e-05, "loss": 0.0572, "step": 2507 }, { "epoch": 0.3225170388786489, "grad_norm": 0.1982421875, "learning_rate": 9.376029805939823e-05, "loss": 0.0518, "step": 2508 }, { "epoch": 0.32264563418920655, "grad_norm": 0.189453125, "learning_rate": 9.375540800396982e-05, "loss": 0.0552, "step": 2509 }, { "epoch": 0.32277422949976425, "grad_norm": 0.220703125, "learning_rate": 9.375051616074659e-05, "loss": 0.0654, "step": 2510 }, { "epoch": 0.3229028248103219, "grad_norm": 0.185546875, "learning_rate": 9.374562252992842e-05, "loss": 0.0584, "step": 2511 }, { "epoch": 0.3230314201208796, "grad_norm": 0.1875, "learning_rate": 9.374072711171526e-05, "loss": 0.0487, "step": 2512 }, { "epoch": 0.32316001543143724, "grad_norm": 0.19140625, "learning_rate": 9.373582990630716e-05, "loss": 0.0467, "step": 2513 }, { "epoch": 0.32328861074199494, "grad_norm": 0.2001953125, "learning_rate": 9.373093091390418e-05, "loss": 0.0619, "step": 2514 }, { "epoch": 0.32341720605255264, "grad_norm": 0.189453125, "learning_rate": 9.372603013470646e-05, "loss": 0.0512, "step": 2515 }, { "epoch": 0.3235458013631103, "grad_norm": 0.185546875, "learning_rate": 9.372112756891429e-05, "loss": 0.0488, "step": 2516 }, { "epoch": 0.323674396673668, "grad_norm": 0.24609375, "learning_rate": 9.371622321672799e-05, "loss": 0.0645, "step": 2517 }, { "epoch": 0.32380299198422563, "grad_norm": 0.1884765625, "learning_rate": 9.371131707834791e-05, "loss": 0.0488, "step": 2518 }, { "epoch": 0.32393158729478333, "grad_norm": 0.1923828125, "learning_rate": 9.370640915397451e-05, "loss": 0.0515, "step": 2519 }, { "epoch": 0.324060182605341, "grad_norm": 0.1904296875, "learning_rate": 9.370149944380833e-05, "loss": 0.0488, "step": 2520 }, { "epoch": 0.3241887779158987, "grad_norm": 0.1904296875, "learning_rate": 9.369658794805e-05, "loss": 0.0512, "step": 2521 }, { "epoch": 0.3243173732264563, "grad_norm": 0.20703125, "learning_rate": 9.369167466690017e-05, "loss": 0.0517, "step": 2522 }, { "epoch": 0.324445968537014, "grad_norm": 0.2099609375, "learning_rate": 9.368675960055959e-05, "loss": 0.0518, "step": 2523 }, { "epoch": 0.3245745638475717, "grad_norm": 0.2109375, "learning_rate": 9.368184274922911e-05, "loss": 0.0573, "step": 2524 }, { "epoch": 0.32470315915812936, "grad_norm": 0.203125, "learning_rate": 9.36769241131096e-05, "loss": 0.054, "step": 2525 }, { "epoch": 0.32483175446868706, "grad_norm": 0.20703125, "learning_rate": 9.367200369240204e-05, "loss": 0.0544, "step": 2526 }, { "epoch": 0.3249603497792447, "grad_norm": 0.1875, "learning_rate": 9.366708148730746e-05, "loss": 0.0554, "step": 2527 }, { "epoch": 0.3250889450898024, "grad_norm": 0.177734375, "learning_rate": 9.3662157498027e-05, "loss": 0.0534, "step": 2528 }, { "epoch": 0.32521754040036005, "grad_norm": 0.1923828125, "learning_rate": 9.365723172476183e-05, "loss": 0.0484, "step": 2529 }, { "epoch": 0.32534613571091775, "grad_norm": 0.203125, "learning_rate": 9.365230416771322e-05, "loss": 0.0607, "step": 2530 }, { "epoch": 0.3254747310214754, "grad_norm": 0.1865234375, "learning_rate": 9.36473748270825e-05, "loss": 0.0511, "step": 2531 }, { "epoch": 0.3256033263320331, "grad_norm": 0.234375, "learning_rate": 9.364244370307107e-05, "loss": 0.0708, "step": 2532 }, { "epoch": 0.3257319216425908, "grad_norm": 0.1806640625, "learning_rate": 9.363751079588043e-05, "loss": 0.0492, "step": 2533 }, { "epoch": 0.32586051695314844, "grad_norm": 0.19921875, "learning_rate": 9.363257610571211e-05, "loss": 0.0526, "step": 2534 }, { "epoch": 0.32598911226370614, "grad_norm": 0.275390625, "learning_rate": 9.362763963276775e-05, "loss": 0.072, "step": 2535 }, { "epoch": 0.3261177075742638, "grad_norm": 0.203125, "learning_rate": 9.362270137724905e-05, "loss": 0.0575, "step": 2536 }, { "epoch": 0.3262463028848215, "grad_norm": 0.197265625, "learning_rate": 9.361776133935778e-05, "loss": 0.0511, "step": 2537 }, { "epoch": 0.3263748981953791, "grad_norm": 0.224609375, "learning_rate": 9.361281951929577e-05, "loss": 0.0676, "step": 2538 }, { "epoch": 0.3265034935059368, "grad_norm": 0.173828125, "learning_rate": 9.360787591726495e-05, "loss": 0.0512, "step": 2539 }, { "epoch": 0.32663208881649447, "grad_norm": 0.2109375, "learning_rate": 9.36029305334673e-05, "loss": 0.0489, "step": 2540 }, { "epoch": 0.32676068412705217, "grad_norm": 0.171875, "learning_rate": 9.35979833681049e-05, "loss": 0.0476, "step": 2541 }, { "epoch": 0.3268892794376098, "grad_norm": 0.208984375, "learning_rate": 9.359303442137987e-05, "loss": 0.0579, "step": 2542 }, { "epoch": 0.3270178747481675, "grad_norm": 0.2138671875, "learning_rate": 9.358808369349444e-05, "loss": 0.0611, "step": 2543 }, { "epoch": 0.3271464700587252, "grad_norm": 0.234375, "learning_rate": 9.358313118465086e-05, "loss": 0.0646, "step": 2544 }, { "epoch": 0.32727506536928286, "grad_norm": 0.2060546875, "learning_rate": 9.357817689505148e-05, "loss": 0.0661, "step": 2545 }, { "epoch": 0.32740366067984056, "grad_norm": 0.2177734375, "learning_rate": 9.357322082489878e-05, "loss": 0.0562, "step": 2546 }, { "epoch": 0.3275322559903982, "grad_norm": 0.201171875, "learning_rate": 9.35682629743952e-05, "loss": 0.0578, "step": 2547 }, { "epoch": 0.3276608513009559, "grad_norm": 0.1845703125, "learning_rate": 9.356330334374335e-05, "loss": 0.0495, "step": 2548 }, { "epoch": 0.32778944661151355, "grad_norm": 0.1923828125, "learning_rate": 9.355834193314585e-05, "loss": 0.0542, "step": 2549 }, { "epoch": 0.32791804192207125, "grad_norm": 0.205078125, "learning_rate": 9.355337874280543e-05, "loss": 0.0508, "step": 2550 }, { "epoch": 0.3280466372326289, "grad_norm": 0.1728515625, "learning_rate": 9.354841377292488e-05, "loss": 0.0448, "step": 2551 }, { "epoch": 0.3281752325431866, "grad_norm": 0.1875, "learning_rate": 9.354344702370707e-05, "loss": 0.0498, "step": 2552 }, { "epoch": 0.3283038278537443, "grad_norm": 0.17578125, "learning_rate": 9.353847849535491e-05, "loss": 0.0468, "step": 2553 }, { "epoch": 0.32843242316430193, "grad_norm": 0.1982421875, "learning_rate": 9.353350818807143e-05, "loss": 0.0492, "step": 2554 }, { "epoch": 0.32856101847485963, "grad_norm": 0.18359375, "learning_rate": 9.352853610205971e-05, "loss": 0.0436, "step": 2555 }, { "epoch": 0.3286896137854173, "grad_norm": 0.1904296875, "learning_rate": 9.35235622375229e-05, "loss": 0.0505, "step": 2556 }, { "epoch": 0.328818209095975, "grad_norm": 0.1923828125, "learning_rate": 9.351858659466421e-05, "loss": 0.0466, "step": 2557 }, { "epoch": 0.3289468044065326, "grad_norm": 0.2001953125, "learning_rate": 9.351360917368697e-05, "loss": 0.0543, "step": 2558 }, { "epoch": 0.3290753997170903, "grad_norm": 0.2060546875, "learning_rate": 9.350862997479454e-05, "loss": 0.061, "step": 2559 }, { "epoch": 0.32920399502764797, "grad_norm": 0.197265625, "learning_rate": 9.350364899819036e-05, "loss": 0.0495, "step": 2560 }, { "epoch": 0.32933259033820567, "grad_norm": 0.212890625, "learning_rate": 9.349866624407792e-05, "loss": 0.0477, "step": 2561 }, { "epoch": 0.32946118564876337, "grad_norm": 0.2734375, "learning_rate": 9.349368171266086e-05, "loss": 0.0517, "step": 2562 }, { "epoch": 0.329589780959321, "grad_norm": 0.2099609375, "learning_rate": 9.348869540414282e-05, "loss": 0.0542, "step": 2563 }, { "epoch": 0.3297183762698787, "grad_norm": 0.2021484375, "learning_rate": 9.348370731872753e-05, "loss": 0.0457, "step": 2564 }, { "epoch": 0.32984697158043635, "grad_norm": 0.28125, "learning_rate": 9.34787174566188e-05, "loss": 0.0634, "step": 2565 }, { "epoch": 0.32997556689099405, "grad_norm": 0.205078125, "learning_rate": 9.347372581802052e-05, "loss": 0.0525, "step": 2566 }, { "epoch": 0.3301041622015517, "grad_norm": 0.1904296875, "learning_rate": 9.346873240313663e-05, "loss": 0.0482, "step": 2567 }, { "epoch": 0.3302327575121094, "grad_norm": 0.2041015625, "learning_rate": 9.346373721217115e-05, "loss": 0.0506, "step": 2568 }, { "epoch": 0.33036135282266704, "grad_norm": 0.1865234375, "learning_rate": 9.345874024532818e-05, "loss": 0.0544, "step": 2569 }, { "epoch": 0.33048994813322474, "grad_norm": 0.173828125, "learning_rate": 9.345374150281193e-05, "loss": 0.0488, "step": 2570 }, { "epoch": 0.33061854344378244, "grad_norm": 0.17578125, "learning_rate": 9.344874098482656e-05, "loss": 0.0461, "step": 2571 }, { "epoch": 0.3307471387543401, "grad_norm": 0.2197265625, "learning_rate": 9.344373869157646e-05, "loss": 0.0627, "step": 2572 }, { "epoch": 0.3308757340648978, "grad_norm": 0.212890625, "learning_rate": 9.3438734623266e-05, "loss": 0.0453, "step": 2573 }, { "epoch": 0.33100432937545543, "grad_norm": 0.19140625, "learning_rate": 9.343372878009962e-05, "loss": 0.0551, "step": 2574 }, { "epoch": 0.33113292468601313, "grad_norm": 0.2158203125, "learning_rate": 9.342872116228187e-05, "loss": 0.0594, "step": 2575 }, { "epoch": 0.3312615199965708, "grad_norm": 0.2353515625, "learning_rate": 9.342371177001736e-05, "loss": 0.0477, "step": 2576 }, { "epoch": 0.3313901153071285, "grad_norm": 0.2060546875, "learning_rate": 9.341870060351075e-05, "loss": 0.0629, "step": 2577 }, { "epoch": 0.3315187106176861, "grad_norm": 0.18359375, "learning_rate": 9.34136876629668e-05, "loss": 0.0531, "step": 2578 }, { "epoch": 0.3316473059282438, "grad_norm": 0.1865234375, "learning_rate": 9.340867294859032e-05, "loss": 0.0532, "step": 2579 }, { "epoch": 0.3317759012388015, "grad_norm": 0.1865234375, "learning_rate": 9.340365646058625e-05, "loss": 0.0479, "step": 2580 }, { "epoch": 0.33190449654935916, "grad_norm": 0.2021484375, "learning_rate": 9.33986381991595e-05, "loss": 0.054, "step": 2581 }, { "epoch": 0.33203309185991686, "grad_norm": 0.1982421875, "learning_rate": 9.339361816451515e-05, "loss": 0.0553, "step": 2582 }, { "epoch": 0.3321616871704745, "grad_norm": 0.171875, "learning_rate": 9.338859635685829e-05, "loss": 0.0441, "step": 2583 }, { "epoch": 0.3322902824810322, "grad_norm": 0.1796875, "learning_rate": 9.33835727763941e-05, "loss": 0.0497, "step": 2584 }, { "epoch": 0.33241887779158985, "grad_norm": 0.1845703125, "learning_rate": 9.337854742332787e-05, "loss": 0.0441, "step": 2585 }, { "epoch": 0.33254747310214755, "grad_norm": 0.1953125, "learning_rate": 9.337352029786491e-05, "loss": 0.0594, "step": 2586 }, { "epoch": 0.3326760684127052, "grad_norm": 0.1923828125, "learning_rate": 9.336849140021061e-05, "loss": 0.0543, "step": 2587 }, { "epoch": 0.3328046637232629, "grad_norm": 0.19921875, "learning_rate": 9.336346073057047e-05, "loss": 0.0568, "step": 2588 }, { "epoch": 0.3329332590338206, "grad_norm": 0.2421875, "learning_rate": 9.335842828915002e-05, "loss": 0.0568, "step": 2589 }, { "epoch": 0.33306185434437824, "grad_norm": 0.2314453125, "learning_rate": 9.33533940761549e-05, "loss": 0.0724, "step": 2590 }, { "epoch": 0.33319044965493594, "grad_norm": 0.1982421875, "learning_rate": 9.334835809179077e-05, "loss": 0.0548, "step": 2591 }, { "epoch": 0.3333190449654936, "grad_norm": 0.1845703125, "learning_rate": 9.334332033626343e-05, "loss": 0.0548, "step": 2592 }, { "epoch": 0.3334476402760513, "grad_norm": 0.185546875, "learning_rate": 9.333828080977869e-05, "loss": 0.0504, "step": 2593 }, { "epoch": 0.3335762355866089, "grad_norm": 0.2109375, "learning_rate": 9.333323951254247e-05, "loss": 0.0509, "step": 2594 }, { "epoch": 0.3337048308971666, "grad_norm": 0.201171875, "learning_rate": 9.332819644476074e-05, "loss": 0.0583, "step": 2595 }, { "epoch": 0.33383342620772427, "grad_norm": 0.1845703125, "learning_rate": 9.332315160663957e-05, "loss": 0.053, "step": 2596 }, { "epoch": 0.33396202151828197, "grad_norm": 0.173828125, "learning_rate": 9.331810499838508e-05, "loss": 0.0421, "step": 2597 }, { "epoch": 0.3340906168288396, "grad_norm": 0.1943359375, "learning_rate": 9.331305662020346e-05, "loss": 0.0565, "step": 2598 }, { "epoch": 0.3342192121393973, "grad_norm": 0.1904296875, "learning_rate": 9.3308006472301e-05, "loss": 0.0533, "step": 2599 }, { "epoch": 0.334347807449955, "grad_norm": 0.1904296875, "learning_rate": 9.330295455488402e-05, "loss": 0.0499, "step": 2600 }, { "epoch": 0.33447640276051266, "grad_norm": 0.2041015625, "learning_rate": 9.329790086815897e-05, "loss": 0.051, "step": 2601 }, { "epoch": 0.33460499807107036, "grad_norm": 0.1904296875, "learning_rate": 9.32928454123323e-05, "loss": 0.0438, "step": 2602 }, { "epoch": 0.334733593381628, "grad_norm": 0.173828125, "learning_rate": 9.328778818761059e-05, "loss": 0.0532, "step": 2603 }, { "epoch": 0.3348621886921857, "grad_norm": 0.2255859375, "learning_rate": 9.328272919420047e-05, "loss": 0.0631, "step": 2604 }, { "epoch": 0.33499078400274335, "grad_norm": 0.216796875, "learning_rate": 9.327766843230863e-05, "loss": 0.0632, "step": 2605 }, { "epoch": 0.33511937931330105, "grad_norm": 0.2578125, "learning_rate": 9.327260590214187e-05, "loss": 0.0597, "step": 2606 }, { "epoch": 0.3352479746238587, "grad_norm": 0.22265625, "learning_rate": 9.326754160390703e-05, "loss": 0.0664, "step": 2607 }, { "epoch": 0.3353765699344164, "grad_norm": 0.20703125, "learning_rate": 9.326247553781102e-05, "loss": 0.0544, "step": 2608 }, { "epoch": 0.3355051652449741, "grad_norm": 0.1884765625, "learning_rate": 9.325740770406084e-05, "loss": 0.0544, "step": 2609 }, { "epoch": 0.33563376055553173, "grad_norm": 0.20703125, "learning_rate": 9.325233810286357e-05, "loss": 0.0515, "step": 2610 }, { "epoch": 0.33576235586608943, "grad_norm": 0.2138671875, "learning_rate": 9.324726673442632e-05, "loss": 0.0509, "step": 2611 }, { "epoch": 0.3358909511766471, "grad_norm": 0.181640625, "learning_rate": 9.324219359895634e-05, "loss": 0.0472, "step": 2612 }, { "epoch": 0.3360195464872048, "grad_norm": 0.1875, "learning_rate": 9.323711869666087e-05, "loss": 0.0469, "step": 2613 }, { "epoch": 0.3361481417977624, "grad_norm": 0.1875, "learning_rate": 9.32320420277473e-05, "loss": 0.0511, "step": 2614 }, { "epoch": 0.3362767371083201, "grad_norm": 0.189453125, "learning_rate": 9.322696359242303e-05, "loss": 0.0466, "step": 2615 }, { "epoch": 0.33640533241887777, "grad_norm": 0.2314453125, "learning_rate": 9.322188339089557e-05, "loss": 0.0483, "step": 2616 }, { "epoch": 0.33653392772943547, "grad_norm": 0.193359375, "learning_rate": 9.321680142337249e-05, "loss": 0.0533, "step": 2617 }, { "epoch": 0.33666252303999317, "grad_norm": 0.1962890625, "learning_rate": 9.321171769006145e-05, "loss": 0.0559, "step": 2618 }, { "epoch": 0.3367911183505508, "grad_norm": 0.2080078125, "learning_rate": 9.320663219117012e-05, "loss": 0.0578, "step": 2619 }, { "epoch": 0.3369197136611085, "grad_norm": 0.18359375, "learning_rate": 9.320154492690634e-05, "loss": 0.0487, "step": 2620 }, { "epoch": 0.33704830897166616, "grad_norm": 0.220703125, "learning_rate": 9.319645589747794e-05, "loss": 0.0603, "step": 2621 }, { "epoch": 0.33717690428222385, "grad_norm": 0.203125, "learning_rate": 9.319136510309286e-05, "loss": 0.0569, "step": 2622 }, { "epoch": 0.3373054995927815, "grad_norm": 0.1904296875, "learning_rate": 9.318627254395908e-05, "loss": 0.0529, "step": 2623 }, { "epoch": 0.3374340949033392, "grad_norm": 0.1943359375, "learning_rate": 9.318117822028472e-05, "loss": 0.0544, "step": 2624 }, { "epoch": 0.33756269021389684, "grad_norm": 0.1845703125, "learning_rate": 9.317608213227791e-05, "loss": 0.0569, "step": 2625 }, { "epoch": 0.33769128552445454, "grad_norm": 0.2080078125, "learning_rate": 9.317098428014685e-05, "loss": 0.0508, "step": 2626 }, { "epoch": 0.33781988083501224, "grad_norm": 0.1787109375, "learning_rate": 9.316588466409986e-05, "loss": 0.046, "step": 2627 }, { "epoch": 0.3379484761455699, "grad_norm": 0.2021484375, "learning_rate": 9.31607832843453e-05, "loss": 0.0518, "step": 2628 }, { "epoch": 0.3380770714561276, "grad_norm": 0.1845703125, "learning_rate": 9.31556801410916e-05, "loss": 0.0547, "step": 2629 }, { "epoch": 0.33820566676668523, "grad_norm": 0.1953125, "learning_rate": 9.315057523454724e-05, "loss": 0.0508, "step": 2630 }, { "epoch": 0.33833426207724293, "grad_norm": 0.2109375, "learning_rate": 9.314546856492086e-05, "loss": 0.0595, "step": 2631 }, { "epoch": 0.3384628573878006, "grad_norm": 0.1884765625, "learning_rate": 9.314036013242107e-05, "loss": 0.0457, "step": 2632 }, { "epoch": 0.3385914526983583, "grad_norm": 0.2021484375, "learning_rate": 9.31352499372566e-05, "loss": 0.0502, "step": 2633 }, { "epoch": 0.3387200480089159, "grad_norm": 0.19140625, "learning_rate": 9.313013797963626e-05, "loss": 0.059, "step": 2634 }, { "epoch": 0.3388486433194736, "grad_norm": 0.2119140625, "learning_rate": 9.31250242597689e-05, "loss": 0.054, "step": 2635 }, { "epoch": 0.3389772386300313, "grad_norm": 0.1796875, "learning_rate": 9.311990877786347e-05, "loss": 0.0489, "step": 2636 }, { "epoch": 0.33910583394058896, "grad_norm": 0.1787109375, "learning_rate": 9.3114791534129e-05, "loss": 0.0423, "step": 2637 }, { "epoch": 0.33923442925114666, "grad_norm": 0.1943359375, "learning_rate": 9.310967252877455e-05, "loss": 0.0647, "step": 2638 }, { "epoch": 0.3393630245617043, "grad_norm": 0.19140625, "learning_rate": 9.310455176200929e-05, "loss": 0.0517, "step": 2639 }, { "epoch": 0.339491619872262, "grad_norm": 0.1923828125, "learning_rate": 9.309942923404243e-05, "loss": 0.0524, "step": 2640 }, { "epoch": 0.33962021518281965, "grad_norm": 0.2001953125, "learning_rate": 9.30943049450833e-05, "loss": 0.0579, "step": 2641 }, { "epoch": 0.33974881049337735, "grad_norm": 0.1845703125, "learning_rate": 9.308917889534125e-05, "loss": 0.0494, "step": 2642 }, { "epoch": 0.339877405803935, "grad_norm": 0.1982421875, "learning_rate": 9.308405108502574e-05, "loss": 0.0626, "step": 2643 }, { "epoch": 0.3400060011144927, "grad_norm": 0.1865234375, "learning_rate": 9.307892151434625e-05, "loss": 0.051, "step": 2644 }, { "epoch": 0.34013459642505034, "grad_norm": 0.201171875, "learning_rate": 9.307379018351242e-05, "loss": 0.0551, "step": 2645 }, { "epoch": 0.34026319173560804, "grad_norm": 0.1669921875, "learning_rate": 9.306865709273389e-05, "loss": 0.0456, "step": 2646 }, { "epoch": 0.34039178704616574, "grad_norm": 0.185546875, "learning_rate": 9.306352224222038e-05, "loss": 0.0555, "step": 2647 }, { "epoch": 0.3405203823567234, "grad_norm": 0.205078125, "learning_rate": 9.30583856321817e-05, "loss": 0.0575, "step": 2648 }, { "epoch": 0.3406489776672811, "grad_norm": 0.1826171875, "learning_rate": 9.305324726282772e-05, "loss": 0.0502, "step": 2649 }, { "epoch": 0.3407775729778387, "grad_norm": 0.1787109375, "learning_rate": 9.30481071343684e-05, "loss": 0.0453, "step": 2650 }, { "epoch": 0.3409061682883964, "grad_norm": 0.2021484375, "learning_rate": 9.304296524701377e-05, "loss": 0.0591, "step": 2651 }, { "epoch": 0.34103476359895407, "grad_norm": 0.2177734375, "learning_rate": 9.30378216009739e-05, "loss": 0.054, "step": 2652 }, { "epoch": 0.34116335890951177, "grad_norm": 0.2001953125, "learning_rate": 9.303267619645895e-05, "loss": 0.0597, "step": 2653 }, { "epoch": 0.3412919542200694, "grad_norm": 0.2197265625, "learning_rate": 9.302752903367917e-05, "loss": 0.054, "step": 2654 }, { "epoch": 0.3414205495306271, "grad_norm": 0.18359375, "learning_rate": 9.302238011284485e-05, "loss": 0.0462, "step": 2655 }, { "epoch": 0.3415491448411848, "grad_norm": 0.1845703125, "learning_rate": 9.30172294341664e-05, "loss": 0.0484, "step": 2656 }, { "epoch": 0.34167774015174246, "grad_norm": 0.201171875, "learning_rate": 9.301207699785424e-05, "loss": 0.0545, "step": 2657 }, { "epoch": 0.34180633546230016, "grad_norm": 0.181640625, "learning_rate": 9.300692280411891e-05, "loss": 0.0499, "step": 2658 }, { "epoch": 0.3419349307728578, "grad_norm": 0.1875, "learning_rate": 9.300176685317101e-05, "loss": 0.0513, "step": 2659 }, { "epoch": 0.3420635260834155, "grad_norm": 0.20703125, "learning_rate": 9.299660914522119e-05, "loss": 0.0525, "step": 2660 }, { "epoch": 0.34219212139397315, "grad_norm": 0.181640625, "learning_rate": 9.299144968048018e-05, "loss": 0.053, "step": 2661 }, { "epoch": 0.34232071670453085, "grad_norm": 0.2001953125, "learning_rate": 9.298628845915882e-05, "loss": 0.0585, "step": 2662 }, { "epoch": 0.3424493120150885, "grad_norm": 0.203125, "learning_rate": 9.298112548146796e-05, "loss": 0.0553, "step": 2663 }, { "epoch": 0.3425779073256462, "grad_norm": 0.1875, "learning_rate": 9.297596074761856e-05, "loss": 0.0519, "step": 2664 }, { "epoch": 0.3427065026362039, "grad_norm": 0.203125, "learning_rate": 9.297079425782168e-05, "loss": 0.0644, "step": 2665 }, { "epoch": 0.34283509794676154, "grad_norm": 0.1904296875, "learning_rate": 9.296562601228836e-05, "loss": 0.0555, "step": 2666 }, { "epoch": 0.34296369325731924, "grad_norm": 0.1767578125, "learning_rate": 9.296045601122981e-05, "loss": 0.055, "step": 2667 }, { "epoch": 0.3430922885678769, "grad_norm": 0.22265625, "learning_rate": 9.295528425485726e-05, "loss": 0.0613, "step": 2668 }, { "epoch": 0.3432208838784346, "grad_norm": 0.169921875, "learning_rate": 9.295011074338202e-05, "loss": 0.0459, "step": 2669 }, { "epoch": 0.3433494791889922, "grad_norm": 0.1865234375, "learning_rate": 9.294493547701545e-05, "loss": 0.0572, "step": 2670 }, { "epoch": 0.3434780744995499, "grad_norm": 0.1904296875, "learning_rate": 9.293975845596908e-05, "loss": 0.0551, "step": 2671 }, { "epoch": 0.34360666981010757, "grad_norm": 0.1865234375, "learning_rate": 9.293457968045434e-05, "loss": 0.0522, "step": 2672 }, { "epoch": 0.34373526512066527, "grad_norm": 0.1962890625, "learning_rate": 9.292939915068289e-05, "loss": 0.0545, "step": 2673 }, { "epoch": 0.34386386043122297, "grad_norm": 0.19140625, "learning_rate": 9.292421686686639e-05, "loss": 0.0461, "step": 2674 }, { "epoch": 0.3439924557417806, "grad_norm": 0.1689453125, "learning_rate": 9.291903282921655e-05, "loss": 0.0423, "step": 2675 }, { "epoch": 0.3441210510523383, "grad_norm": 0.17578125, "learning_rate": 9.291384703794523e-05, "loss": 0.0496, "step": 2676 }, { "epoch": 0.34424964636289596, "grad_norm": 0.1875, "learning_rate": 9.29086594932643e-05, "loss": 0.0498, "step": 2677 }, { "epoch": 0.34437824167345366, "grad_norm": 0.201171875, "learning_rate": 9.290347019538571e-05, "loss": 0.054, "step": 2678 }, { "epoch": 0.3445068369840113, "grad_norm": 0.1767578125, "learning_rate": 9.28982791445215e-05, "loss": 0.049, "step": 2679 }, { "epoch": 0.344635432294569, "grad_norm": 0.203125, "learning_rate": 9.289308634088374e-05, "loss": 0.0565, "step": 2680 }, { "epoch": 0.34476402760512664, "grad_norm": 0.18359375, "learning_rate": 9.288789178468464e-05, "loss": 0.0516, "step": 2681 }, { "epoch": 0.34489262291568434, "grad_norm": 0.1884765625, "learning_rate": 9.288269547613642e-05, "loss": 0.0466, "step": 2682 }, { "epoch": 0.34502121822624204, "grad_norm": 0.1875, "learning_rate": 9.287749741545141e-05, "loss": 0.0479, "step": 2683 }, { "epoch": 0.3451498135367997, "grad_norm": 0.201171875, "learning_rate": 9.287229760284198e-05, "loss": 0.0462, "step": 2684 }, { "epoch": 0.3452784088473574, "grad_norm": 0.189453125, "learning_rate": 9.286709603852059e-05, "loss": 0.0476, "step": 2685 }, { "epoch": 0.34540700415791503, "grad_norm": 0.1953125, "learning_rate": 9.28618927226998e-05, "loss": 0.0516, "step": 2686 }, { "epoch": 0.34553559946847273, "grad_norm": 0.1806640625, "learning_rate": 9.285668765559217e-05, "loss": 0.0491, "step": 2687 }, { "epoch": 0.3456641947790304, "grad_norm": 0.2138671875, "learning_rate": 9.28514808374104e-05, "loss": 0.0646, "step": 2688 }, { "epoch": 0.3457927900895881, "grad_norm": 0.2001953125, "learning_rate": 9.284627226836722e-05, "loss": 0.0609, "step": 2689 }, { "epoch": 0.3459213854001457, "grad_norm": 0.1748046875, "learning_rate": 9.284106194867545e-05, "loss": 0.0428, "step": 2690 }, { "epoch": 0.3460499807107034, "grad_norm": 0.205078125, "learning_rate": 9.283584987854799e-05, "loss": 0.0584, "step": 2691 }, { "epoch": 0.34617857602126106, "grad_norm": 0.1669921875, "learning_rate": 9.283063605819779e-05, "loss": 0.0452, "step": 2692 }, { "epoch": 0.34630717133181876, "grad_norm": 0.1826171875, "learning_rate": 9.282542048783786e-05, "loss": 0.0455, "step": 2693 }, { "epoch": 0.34643576664237646, "grad_norm": 0.189453125, "learning_rate": 9.282020316768134e-05, "loss": 0.0488, "step": 2694 }, { "epoch": 0.3465643619529341, "grad_norm": 0.1796875, "learning_rate": 9.281498409794138e-05, "loss": 0.0486, "step": 2695 }, { "epoch": 0.3466929572634918, "grad_norm": 0.1904296875, "learning_rate": 9.280976327883122e-05, "loss": 0.0524, "step": 2696 }, { "epoch": 0.34682155257404945, "grad_norm": 0.181640625, "learning_rate": 9.280454071056419e-05, "loss": 0.0494, "step": 2697 }, { "epoch": 0.34695014788460715, "grad_norm": 0.1796875, "learning_rate": 9.279931639335369e-05, "loss": 0.048, "step": 2698 }, { "epoch": 0.3470787431951648, "grad_norm": 0.18359375, "learning_rate": 9.279409032741316e-05, "loss": 0.05, "step": 2699 }, { "epoch": 0.3472073385057225, "grad_norm": 0.19140625, "learning_rate": 9.278886251295611e-05, "loss": 0.0523, "step": 2700 }, { "epoch": 0.34733593381628014, "grad_norm": 0.2001953125, "learning_rate": 9.27836329501962e-05, "loss": 0.0518, "step": 2701 }, { "epoch": 0.34746452912683784, "grad_norm": 0.1708984375, "learning_rate": 9.277840163934705e-05, "loss": 0.0439, "step": 2702 }, { "epoch": 0.34759312443739554, "grad_norm": 0.193359375, "learning_rate": 9.277316858062244e-05, "loss": 0.0533, "step": 2703 }, { "epoch": 0.3477217197479532, "grad_norm": 0.173828125, "learning_rate": 9.276793377423616e-05, "loss": 0.0415, "step": 2704 }, { "epoch": 0.3478503150585109, "grad_norm": 0.2158203125, "learning_rate": 9.276269722040212e-05, "loss": 0.054, "step": 2705 }, { "epoch": 0.34797891036906853, "grad_norm": 0.1953125, "learning_rate": 9.275745891933426e-05, "loss": 0.0546, "step": 2706 }, { "epoch": 0.34810750567962623, "grad_norm": 0.193359375, "learning_rate": 9.275221887124663e-05, "loss": 0.0535, "step": 2707 }, { "epoch": 0.34823610099018387, "grad_norm": 0.380859375, "learning_rate": 9.274697707635332e-05, "loss": 0.0558, "step": 2708 }, { "epoch": 0.34836469630074157, "grad_norm": 0.1884765625, "learning_rate": 9.274173353486852e-05, "loss": 0.0473, "step": 2709 }, { "epoch": 0.3484932916112992, "grad_norm": 0.203125, "learning_rate": 9.273648824700645e-05, "loss": 0.0531, "step": 2710 }, { "epoch": 0.3486218869218569, "grad_norm": 0.216796875, "learning_rate": 9.273124121298144e-05, "loss": 0.0543, "step": 2711 }, { "epoch": 0.3487504822324146, "grad_norm": 0.1708984375, "learning_rate": 9.272599243300788e-05, "loss": 0.043, "step": 2712 }, { "epoch": 0.34887907754297226, "grad_norm": 0.189453125, "learning_rate": 9.272074190730021e-05, "loss": 0.0402, "step": 2713 }, { "epoch": 0.34900767285352996, "grad_norm": 0.19921875, "learning_rate": 9.271548963607299e-05, "loss": 0.0453, "step": 2714 }, { "epoch": 0.3491362681640876, "grad_norm": 0.1689453125, "learning_rate": 9.27102356195408e-05, "loss": 0.0437, "step": 2715 }, { "epoch": 0.3492648634746453, "grad_norm": 0.1875, "learning_rate": 9.270497985791833e-05, "loss": 0.0499, "step": 2716 }, { "epoch": 0.34939345878520295, "grad_norm": 0.2099609375, "learning_rate": 9.26997223514203e-05, "loss": 0.0467, "step": 2717 }, { "epoch": 0.34952205409576065, "grad_norm": 0.19921875, "learning_rate": 9.269446310026157e-05, "loss": 0.0517, "step": 2718 }, { "epoch": 0.3496506494063183, "grad_norm": 0.189453125, "learning_rate": 9.268920210465696e-05, "loss": 0.0531, "step": 2719 }, { "epoch": 0.349779244716876, "grad_norm": 0.1875, "learning_rate": 9.268393936482149e-05, "loss": 0.0526, "step": 2720 }, { "epoch": 0.3499078400274337, "grad_norm": 0.1796875, "learning_rate": 9.267867488097016e-05, "loss": 0.0463, "step": 2721 }, { "epoch": 0.35003643533799134, "grad_norm": 0.2138671875, "learning_rate": 9.267340865331807e-05, "loss": 0.0621, "step": 2722 }, { "epoch": 0.35016503064854904, "grad_norm": 0.1787109375, "learning_rate": 9.26681406820804e-05, "loss": 0.0502, "step": 2723 }, { "epoch": 0.3502936259591067, "grad_norm": 0.203125, "learning_rate": 9.26628709674724e-05, "loss": 0.051, "step": 2724 }, { "epoch": 0.3504222212696644, "grad_norm": 0.1875, "learning_rate": 9.265759950970935e-05, "loss": 0.0508, "step": 2725 }, { "epoch": 0.350550816580222, "grad_norm": 0.177734375, "learning_rate": 9.265232630900666e-05, "loss": 0.0431, "step": 2726 }, { "epoch": 0.3506794118907797, "grad_norm": 0.1923828125, "learning_rate": 9.264705136557981e-05, "loss": 0.046, "step": 2727 }, { "epoch": 0.35080800720133737, "grad_norm": 0.2158203125, "learning_rate": 9.26417746796443e-05, "loss": 0.0612, "step": 2728 }, { "epoch": 0.35093660251189507, "grad_norm": 0.1611328125, "learning_rate": 9.263649625141573e-05, "loss": 0.0387, "step": 2729 }, { "epoch": 0.35106519782245277, "grad_norm": 0.1591796875, "learning_rate": 9.263121608110977e-05, "loss": 0.039, "step": 2730 }, { "epoch": 0.3511937931330104, "grad_norm": 0.189453125, "learning_rate": 9.262593416894217e-05, "loss": 0.0482, "step": 2731 }, { "epoch": 0.3513223884435681, "grad_norm": 0.19921875, "learning_rate": 9.262065051512873e-05, "loss": 0.0481, "step": 2732 }, { "epoch": 0.35145098375412576, "grad_norm": 0.2236328125, "learning_rate": 9.261536511988536e-05, "loss": 0.0717, "step": 2733 }, { "epoch": 0.35157957906468346, "grad_norm": 0.205078125, "learning_rate": 9.2610077983428e-05, "loss": 0.0585, "step": 2734 }, { "epoch": 0.3517081743752411, "grad_norm": 0.17578125, "learning_rate": 9.260478910597266e-05, "loss": 0.0498, "step": 2735 }, { "epoch": 0.3518367696857988, "grad_norm": 0.21875, "learning_rate": 9.259949848773546e-05, "loss": 0.058, "step": 2736 }, { "epoch": 0.35196536499635644, "grad_norm": 0.2109375, "learning_rate": 9.259420612893254e-05, "loss": 0.0501, "step": 2737 }, { "epoch": 0.35209396030691414, "grad_norm": 0.19921875, "learning_rate": 9.258891202978018e-05, "loss": 0.0527, "step": 2738 }, { "epoch": 0.3522225556174718, "grad_norm": 0.2099609375, "learning_rate": 9.258361619049467e-05, "loss": 0.0463, "step": 2739 }, { "epoch": 0.3523511509280295, "grad_norm": 0.1669921875, "learning_rate": 9.257831861129239e-05, "loss": 0.0371, "step": 2740 }, { "epoch": 0.3524797462385872, "grad_norm": 0.181640625, "learning_rate": 9.25730192923898e-05, "loss": 0.0452, "step": 2741 }, { "epoch": 0.35260834154914483, "grad_norm": 0.201171875, "learning_rate": 9.256771823400341e-05, "loss": 0.0556, "step": 2742 }, { "epoch": 0.35273693685970253, "grad_norm": 0.21875, "learning_rate": 9.256241543634983e-05, "loss": 0.0613, "step": 2743 }, { "epoch": 0.3528655321702602, "grad_norm": 0.1796875, "learning_rate": 9.255711089964571e-05, "loss": 0.0407, "step": 2744 }, { "epoch": 0.3529941274808179, "grad_norm": 0.1650390625, "learning_rate": 9.25518046241078e-05, "loss": 0.044, "step": 2745 }, { "epoch": 0.3531227227913755, "grad_norm": 0.173828125, "learning_rate": 9.25464966099529e-05, "loss": 0.0452, "step": 2746 }, { "epoch": 0.3532513181019332, "grad_norm": 0.1943359375, "learning_rate": 9.25411868573979e-05, "loss": 0.0536, "step": 2747 }, { "epoch": 0.35337991341249086, "grad_norm": 0.1904296875, "learning_rate": 9.253587536665975e-05, "loss": 0.0489, "step": 2748 }, { "epoch": 0.35350850872304856, "grad_norm": 0.1865234375, "learning_rate": 9.253056213795548e-05, "loss": 0.0559, "step": 2749 }, { "epoch": 0.35363710403360626, "grad_norm": 0.1796875, "learning_rate": 9.252524717150215e-05, "loss": 0.0474, "step": 2750 }, { "epoch": 0.3537656993441639, "grad_norm": 0.1982421875, "learning_rate": 9.251993046751695e-05, "loss": 0.0452, "step": 2751 }, { "epoch": 0.3538942946547216, "grad_norm": 0.2109375, "learning_rate": 9.251461202621709e-05, "loss": 0.0565, "step": 2752 }, { "epoch": 0.35402288996527925, "grad_norm": 0.1904296875, "learning_rate": 9.250929184781991e-05, "loss": 0.0508, "step": 2753 }, { "epoch": 0.35415148527583695, "grad_norm": 0.2275390625, "learning_rate": 9.250396993254276e-05, "loss": 0.0578, "step": 2754 }, { "epoch": 0.3542800805863946, "grad_norm": 0.1787109375, "learning_rate": 9.24986462806031e-05, "loss": 0.0453, "step": 2755 }, { "epoch": 0.3544086758969523, "grad_norm": 0.1875, "learning_rate": 9.249332089221842e-05, "loss": 0.0479, "step": 2756 }, { "epoch": 0.35453727120750994, "grad_norm": 0.1787109375, "learning_rate": 9.248799376760636e-05, "loss": 0.0447, "step": 2757 }, { "epoch": 0.35466586651806764, "grad_norm": 0.18359375, "learning_rate": 9.248266490698455e-05, "loss": 0.0536, "step": 2758 }, { "epoch": 0.35479446182862534, "grad_norm": 0.197265625, "learning_rate": 9.247733431057071e-05, "loss": 0.0558, "step": 2759 }, { "epoch": 0.354923057139183, "grad_norm": 0.158203125, "learning_rate": 9.247200197858267e-05, "loss": 0.0405, "step": 2760 }, { "epoch": 0.3550516524497407, "grad_norm": 0.185546875, "learning_rate": 9.246666791123829e-05, "loss": 0.0508, "step": 2761 }, { "epoch": 0.35518024776029833, "grad_norm": 0.1962890625, "learning_rate": 9.246133210875549e-05, "loss": 0.0533, "step": 2762 }, { "epoch": 0.35530884307085603, "grad_norm": 0.1728515625, "learning_rate": 9.245599457135233e-05, "loss": 0.0409, "step": 2763 }, { "epoch": 0.3554374383814137, "grad_norm": 0.203125, "learning_rate": 9.245065529924686e-05, "loss": 0.0561, "step": 2764 }, { "epoch": 0.3555660336919714, "grad_norm": 0.232421875, "learning_rate": 9.244531429265725e-05, "loss": 0.0585, "step": 2765 }, { "epoch": 0.355694629002529, "grad_norm": 0.2138671875, "learning_rate": 9.243997155180172e-05, "loss": 0.0572, "step": 2766 }, { "epoch": 0.3558232243130867, "grad_norm": 0.1982421875, "learning_rate": 9.243462707689859e-05, "loss": 0.0496, "step": 2767 }, { "epoch": 0.3559518196236444, "grad_norm": 0.2021484375, "learning_rate": 9.24292808681662e-05, "loss": 0.0568, "step": 2768 }, { "epoch": 0.35608041493420206, "grad_norm": 0.1689453125, "learning_rate": 9.242393292582301e-05, "loss": 0.0473, "step": 2769 }, { "epoch": 0.35620901024475976, "grad_norm": 0.1806640625, "learning_rate": 9.241858325008752e-05, "loss": 0.0463, "step": 2770 }, { "epoch": 0.3563376055553174, "grad_norm": 0.201171875, "learning_rate": 9.241323184117832e-05, "loss": 0.0558, "step": 2771 }, { "epoch": 0.3564662008658751, "grad_norm": 0.169921875, "learning_rate": 9.240787869931405e-05, "loss": 0.0449, "step": 2772 }, { "epoch": 0.35659479617643275, "grad_norm": 0.1923828125, "learning_rate": 9.240252382471344e-05, "loss": 0.049, "step": 2773 }, { "epoch": 0.35672339148699045, "grad_norm": 0.1982421875, "learning_rate": 9.239716721759528e-05, "loss": 0.0637, "step": 2774 }, { "epoch": 0.3568519867975481, "grad_norm": 0.240234375, "learning_rate": 9.239180887817846e-05, "loss": 0.0578, "step": 2775 }, { "epoch": 0.3569805821081058, "grad_norm": 0.193359375, "learning_rate": 9.238644880668187e-05, "loss": 0.048, "step": 2776 }, { "epoch": 0.3571091774186635, "grad_norm": 0.185546875, "learning_rate": 9.238108700332456e-05, "loss": 0.0498, "step": 2777 }, { "epoch": 0.35723777272922114, "grad_norm": 0.1884765625, "learning_rate": 9.237572346832557e-05, "loss": 0.0499, "step": 2778 }, { "epoch": 0.35736636803977884, "grad_norm": 0.2001953125, "learning_rate": 9.237035820190409e-05, "loss": 0.0577, "step": 2779 }, { "epoch": 0.3574949633503365, "grad_norm": 0.2255859375, "learning_rate": 9.23649912042793e-05, "loss": 0.0522, "step": 2780 }, { "epoch": 0.3576235586608942, "grad_norm": 0.1806640625, "learning_rate": 9.23596224756705e-05, "loss": 0.0463, "step": 2781 }, { "epoch": 0.3577521539714518, "grad_norm": 0.1767578125, "learning_rate": 9.235425201629706e-05, "loss": 0.0434, "step": 2782 }, { "epoch": 0.3578807492820095, "grad_norm": 0.2138671875, "learning_rate": 9.234887982637838e-05, "loss": 0.0618, "step": 2783 }, { "epoch": 0.35800934459256717, "grad_norm": 0.22265625, "learning_rate": 9.234350590613402e-05, "loss": 0.0657, "step": 2784 }, { "epoch": 0.35813793990312487, "grad_norm": 0.2177734375, "learning_rate": 9.23381302557835e-05, "loss": 0.0587, "step": 2785 }, { "epoch": 0.35826653521368257, "grad_norm": 0.201171875, "learning_rate": 9.233275287554648e-05, "loss": 0.0584, "step": 2786 }, { "epoch": 0.3583951305242402, "grad_norm": 0.17578125, "learning_rate": 9.232737376564269e-05, "loss": 0.0461, "step": 2787 }, { "epoch": 0.3585237258347979, "grad_norm": 0.197265625, "learning_rate": 9.232199292629189e-05, "loss": 0.0506, "step": 2788 }, { "epoch": 0.35865232114535556, "grad_norm": 0.197265625, "learning_rate": 9.231661035771393e-05, "loss": 0.0497, "step": 2789 }, { "epoch": 0.35878091645591326, "grad_norm": 0.181640625, "learning_rate": 9.231122606012876e-05, "loss": 0.0436, "step": 2790 }, { "epoch": 0.3589095117664709, "grad_norm": 0.1748046875, "learning_rate": 9.230584003375635e-05, "loss": 0.0429, "step": 2791 }, { "epoch": 0.3590381070770286, "grad_norm": 0.18359375, "learning_rate": 9.230045227881681e-05, "loss": 0.0533, "step": 2792 }, { "epoch": 0.35916670238758625, "grad_norm": 0.1904296875, "learning_rate": 9.229506279553022e-05, "loss": 0.047, "step": 2793 }, { "epoch": 0.35929529769814395, "grad_norm": 0.1953125, "learning_rate": 9.228967158411683e-05, "loss": 0.0575, "step": 2794 }, { "epoch": 0.3594238930087016, "grad_norm": 0.1796875, "learning_rate": 9.228427864479689e-05, "loss": 0.0428, "step": 2795 }, { "epoch": 0.3595524883192593, "grad_norm": 0.189453125, "learning_rate": 9.227888397779079e-05, "loss": 0.0439, "step": 2796 }, { "epoch": 0.359681083629817, "grad_norm": 0.2138671875, "learning_rate": 9.227348758331891e-05, "loss": 0.0577, "step": 2797 }, { "epoch": 0.35980967894037463, "grad_norm": 0.185546875, "learning_rate": 9.226808946160175e-05, "loss": 0.0463, "step": 2798 }, { "epoch": 0.35993827425093233, "grad_norm": 0.201171875, "learning_rate": 9.226268961285989e-05, "loss": 0.0444, "step": 2799 }, { "epoch": 0.36006686956149, "grad_norm": 0.2119140625, "learning_rate": 9.225728803731392e-05, "loss": 0.0536, "step": 2800 }, { "epoch": 0.3601954648720477, "grad_norm": 0.1884765625, "learning_rate": 9.22518847351846e-05, "loss": 0.0516, "step": 2801 }, { "epoch": 0.3603240601826053, "grad_norm": 0.19140625, "learning_rate": 9.224647970669265e-05, "loss": 0.0582, "step": 2802 }, { "epoch": 0.360452655493163, "grad_norm": 0.1865234375, "learning_rate": 9.224107295205894e-05, "loss": 0.0541, "step": 2803 }, { "epoch": 0.36058125080372067, "grad_norm": 0.232421875, "learning_rate": 9.223566447150438e-05, "loss": 0.0489, "step": 2804 }, { "epoch": 0.36070984611427837, "grad_norm": 0.19140625, "learning_rate": 9.223025426524996e-05, "loss": 0.0484, "step": 2805 }, { "epoch": 0.36083844142483606, "grad_norm": 0.1923828125, "learning_rate": 9.222484233351672e-05, "loss": 0.0519, "step": 2806 }, { "epoch": 0.3609670367353937, "grad_norm": 0.197265625, "learning_rate": 9.22194286765258e-05, "loss": 0.0544, "step": 2807 }, { "epoch": 0.3610956320459514, "grad_norm": 0.2158203125, "learning_rate": 9.221401329449836e-05, "loss": 0.055, "step": 2808 }, { "epoch": 0.36122422735650905, "grad_norm": 0.1796875, "learning_rate": 9.220859618765573e-05, "loss": 0.0491, "step": 2809 }, { "epoch": 0.36135282266706675, "grad_norm": 0.1767578125, "learning_rate": 9.22031773562192e-05, "loss": 0.0462, "step": 2810 }, { "epoch": 0.3614814179776244, "grad_norm": 0.19140625, "learning_rate": 9.219775680041018e-05, "loss": 0.0546, "step": 2811 }, { "epoch": 0.3616100132881821, "grad_norm": 0.1943359375, "learning_rate": 9.219233452045015e-05, "loss": 0.0458, "step": 2812 }, { "epoch": 0.36173860859873974, "grad_norm": 0.189453125, "learning_rate": 9.218691051656068e-05, "loss": 0.0541, "step": 2813 }, { "epoch": 0.36186720390929744, "grad_norm": 0.193359375, "learning_rate": 9.218148478896336e-05, "loss": 0.0441, "step": 2814 }, { "epoch": 0.36199579921985514, "grad_norm": 0.2255859375, "learning_rate": 9.217605733787991e-05, "loss": 0.048, "step": 2815 }, { "epoch": 0.3621243945304128, "grad_norm": 0.189453125, "learning_rate": 9.217062816353205e-05, "loss": 0.0452, "step": 2816 }, { "epoch": 0.3622529898409705, "grad_norm": 0.193359375, "learning_rate": 9.216519726614164e-05, "loss": 0.0555, "step": 2817 }, { "epoch": 0.36238158515152813, "grad_norm": 0.1875, "learning_rate": 9.215976464593055e-05, "loss": 0.0524, "step": 2818 }, { "epoch": 0.36251018046208583, "grad_norm": 0.2138671875, "learning_rate": 9.215433030312079e-05, "loss": 0.0603, "step": 2819 }, { "epoch": 0.3626387757726435, "grad_norm": 0.185546875, "learning_rate": 9.214889423793438e-05, "loss": 0.0591, "step": 2820 }, { "epoch": 0.3627673710832012, "grad_norm": 0.1787109375, "learning_rate": 9.214345645059343e-05, "loss": 0.0487, "step": 2821 }, { "epoch": 0.3628959663937588, "grad_norm": 0.22265625, "learning_rate": 9.213801694132012e-05, "loss": 0.0503, "step": 2822 }, { "epoch": 0.3630245617043165, "grad_norm": 0.2001953125, "learning_rate": 9.213257571033672e-05, "loss": 0.0518, "step": 2823 }, { "epoch": 0.3631531570148742, "grad_norm": 0.2255859375, "learning_rate": 9.212713275786553e-05, "loss": 0.0532, "step": 2824 }, { "epoch": 0.36328175232543186, "grad_norm": 0.169921875, "learning_rate": 9.212168808412895e-05, "loss": 0.0446, "step": 2825 }, { "epoch": 0.36341034763598956, "grad_norm": 0.171875, "learning_rate": 9.211624168934945e-05, "loss": 0.046, "step": 2826 }, { "epoch": 0.3635389429465472, "grad_norm": 0.193359375, "learning_rate": 9.211079357374956e-05, "loss": 0.0665, "step": 2827 }, { "epoch": 0.3636675382571049, "grad_norm": 0.181640625, "learning_rate": 9.210534373755186e-05, "loss": 0.0468, "step": 2828 }, { "epoch": 0.36379613356766255, "grad_norm": 0.1865234375, "learning_rate": 9.209989218097908e-05, "loss": 0.0484, "step": 2829 }, { "epoch": 0.36392472887822025, "grad_norm": 0.2060546875, "learning_rate": 9.20944389042539e-05, "loss": 0.0516, "step": 2830 }, { "epoch": 0.3640533241887779, "grad_norm": 0.2001953125, "learning_rate": 9.208898390759918e-05, "loss": 0.0563, "step": 2831 }, { "epoch": 0.3641819194993356, "grad_norm": 0.1826171875, "learning_rate": 9.208352719123779e-05, "loss": 0.0467, "step": 2832 }, { "epoch": 0.3643105148098933, "grad_norm": 0.189453125, "learning_rate": 9.207806875539266e-05, "loss": 0.0473, "step": 2833 }, { "epoch": 0.36443911012045094, "grad_norm": 0.2060546875, "learning_rate": 9.207260860028686e-05, "loss": 0.0525, "step": 2834 }, { "epoch": 0.36456770543100864, "grad_norm": 0.19140625, "learning_rate": 9.206714672614347e-05, "loss": 0.0381, "step": 2835 }, { "epoch": 0.3646963007415663, "grad_norm": 0.212890625, "learning_rate": 9.206168313318564e-05, "loss": 0.0493, "step": 2836 }, { "epoch": 0.364824896052124, "grad_norm": 0.18359375, "learning_rate": 9.205621782163662e-05, "loss": 0.0498, "step": 2837 }, { "epoch": 0.3649534913626816, "grad_norm": 0.18359375, "learning_rate": 9.20507507917197e-05, "loss": 0.0454, "step": 2838 }, { "epoch": 0.3650820866732393, "grad_norm": 0.1884765625, "learning_rate": 9.204528204365829e-05, "loss": 0.0517, "step": 2839 }, { "epoch": 0.36521068198379697, "grad_norm": 0.1806640625, "learning_rate": 9.203981157767581e-05, "loss": 0.0427, "step": 2840 }, { "epoch": 0.36533927729435467, "grad_norm": 0.1943359375, "learning_rate": 9.203433939399577e-05, "loss": 0.055, "step": 2841 }, { "epoch": 0.3654678726049123, "grad_norm": 0.181640625, "learning_rate": 9.202886549284178e-05, "loss": 0.0589, "step": 2842 }, { "epoch": 0.36559646791547, "grad_norm": 0.1865234375, "learning_rate": 9.202338987443749e-05, "loss": 0.051, "step": 2843 }, { "epoch": 0.3657250632260277, "grad_norm": 0.2021484375, "learning_rate": 9.201791253900662e-05, "loss": 0.0572, "step": 2844 }, { "epoch": 0.36585365853658536, "grad_norm": 0.1689453125, "learning_rate": 9.201243348677298e-05, "loss": 0.0386, "step": 2845 }, { "epoch": 0.36598225384714306, "grad_norm": 0.166015625, "learning_rate": 9.20069527179604e-05, "loss": 0.0423, "step": 2846 }, { "epoch": 0.3661108491577007, "grad_norm": 0.1953125, "learning_rate": 9.200147023279288e-05, "loss": 0.0544, "step": 2847 }, { "epoch": 0.3662394444682584, "grad_norm": 0.2158203125, "learning_rate": 9.199598603149437e-05, "loss": 0.0513, "step": 2848 }, { "epoch": 0.36636803977881605, "grad_norm": 0.1640625, "learning_rate": 9.1990500114289e-05, "loss": 0.0455, "step": 2849 }, { "epoch": 0.36649663508937375, "grad_norm": 0.16796875, "learning_rate": 9.198501248140088e-05, "loss": 0.0392, "step": 2850 }, { "epoch": 0.3666252303999314, "grad_norm": 0.201171875, "learning_rate": 9.197952313305424e-05, "loss": 0.057, "step": 2851 }, { "epoch": 0.3667538257104891, "grad_norm": 0.20703125, "learning_rate": 9.197403206947338e-05, "loss": 0.0573, "step": 2852 }, { "epoch": 0.3668824210210468, "grad_norm": 0.197265625, "learning_rate": 9.196853929088262e-05, "loss": 0.048, "step": 2853 }, { "epoch": 0.36701101633160443, "grad_norm": 0.220703125, "learning_rate": 9.196304479750642e-05, "loss": 0.0665, "step": 2854 }, { "epoch": 0.36713961164216213, "grad_norm": 0.2001953125, "learning_rate": 9.19575485895693e-05, "loss": 0.0438, "step": 2855 }, { "epoch": 0.3672682069527198, "grad_norm": 0.2060546875, "learning_rate": 9.195205066729577e-05, "loss": 0.0691, "step": 2856 }, { "epoch": 0.3673968022632775, "grad_norm": 0.1689453125, "learning_rate": 9.194655103091052e-05, "loss": 0.0422, "step": 2857 }, { "epoch": 0.3675253975738351, "grad_norm": 0.1923828125, "learning_rate": 9.194104968063823e-05, "loss": 0.0544, "step": 2858 }, { "epoch": 0.3676539928843928, "grad_norm": 0.228515625, "learning_rate": 9.193554661670369e-05, "loss": 0.0527, "step": 2859 }, { "epoch": 0.36778258819495047, "grad_norm": 0.1865234375, "learning_rate": 9.193004183933175e-05, "loss": 0.0505, "step": 2860 }, { "epoch": 0.36791118350550817, "grad_norm": 0.1875, "learning_rate": 9.192453534874733e-05, "loss": 0.05, "step": 2861 }, { "epoch": 0.36803977881606587, "grad_norm": 0.181640625, "learning_rate": 9.191902714517542e-05, "loss": 0.0456, "step": 2862 }, { "epoch": 0.3681683741266235, "grad_norm": 0.1806640625, "learning_rate": 9.191351722884105e-05, "loss": 0.0509, "step": 2863 }, { "epoch": 0.3682969694371812, "grad_norm": 0.2099609375, "learning_rate": 9.19080055999694e-05, "loss": 0.0609, "step": 2864 }, { "epoch": 0.36842556474773885, "grad_norm": 0.2041015625, "learning_rate": 9.190249225878562e-05, "loss": 0.0556, "step": 2865 }, { "epoch": 0.36855416005829655, "grad_norm": 0.173828125, "learning_rate": 9.1896977205515e-05, "loss": 0.0422, "step": 2866 }, { "epoch": 0.3686827553688542, "grad_norm": 0.228515625, "learning_rate": 9.189146044038289e-05, "loss": 0.0528, "step": 2867 }, { "epoch": 0.3688113506794119, "grad_norm": 0.2197265625, "learning_rate": 9.188594196361469e-05, "loss": 0.0571, "step": 2868 }, { "epoch": 0.36893994598996954, "grad_norm": 0.1962890625, "learning_rate": 9.188042177543586e-05, "loss": 0.053, "step": 2869 }, { "epoch": 0.36906854130052724, "grad_norm": 0.1875, "learning_rate": 9.187489987607197e-05, "loss": 0.0515, "step": 2870 }, { "epoch": 0.36919713661108494, "grad_norm": 0.1806640625, "learning_rate": 9.186937626574863e-05, "loss": 0.0472, "step": 2871 }, { "epoch": 0.3693257319216426, "grad_norm": 0.1748046875, "learning_rate": 9.186385094469153e-05, "loss": 0.0483, "step": 2872 }, { "epoch": 0.3694543272322003, "grad_norm": 0.1796875, "learning_rate": 9.185832391312644e-05, "loss": 0.0454, "step": 2873 }, { "epoch": 0.36958292254275793, "grad_norm": 0.17578125, "learning_rate": 9.185279517127916e-05, "loss": 0.0447, "step": 2874 }, { "epoch": 0.36971151785331563, "grad_norm": 0.2060546875, "learning_rate": 9.184726471937561e-05, "loss": 0.0578, "step": 2875 }, { "epoch": 0.3698401131638733, "grad_norm": 0.1875, "learning_rate": 9.184173255764176e-05, "loss": 0.0513, "step": 2876 }, { "epoch": 0.369968708474431, "grad_norm": 0.19140625, "learning_rate": 9.183619868630364e-05, "loss": 0.0556, "step": 2877 }, { "epoch": 0.3700973037849886, "grad_norm": 0.173828125, "learning_rate": 9.183066310558735e-05, "loss": 0.0447, "step": 2878 }, { "epoch": 0.3702258990955463, "grad_norm": 0.1845703125, "learning_rate": 9.182512581571907e-05, "loss": 0.0454, "step": 2879 }, { "epoch": 0.370354494406104, "grad_norm": 0.1884765625, "learning_rate": 9.181958681692506e-05, "loss": 0.0472, "step": 2880 }, { "epoch": 0.37048308971666166, "grad_norm": 0.2060546875, "learning_rate": 9.181404610943163e-05, "loss": 0.0518, "step": 2881 }, { "epoch": 0.37061168502721936, "grad_norm": 0.1845703125, "learning_rate": 9.180850369346516e-05, "loss": 0.046, "step": 2882 }, { "epoch": 0.370740280337777, "grad_norm": 0.1806640625, "learning_rate": 9.180295956925211e-05, "loss": 0.0517, "step": 2883 }, { "epoch": 0.3708688756483347, "grad_norm": 0.16796875, "learning_rate": 9.179741373701903e-05, "loss": 0.0417, "step": 2884 }, { "epoch": 0.37099747095889235, "grad_norm": 0.19921875, "learning_rate": 9.179186619699247e-05, "loss": 0.0456, "step": 2885 }, { "epoch": 0.37112606626945005, "grad_norm": 0.1767578125, "learning_rate": 9.178631694939913e-05, "loss": 0.0441, "step": 2886 }, { "epoch": 0.3712546615800077, "grad_norm": 0.2353515625, "learning_rate": 9.178076599446574e-05, "loss": 0.0585, "step": 2887 }, { "epoch": 0.3713832568905654, "grad_norm": 0.177734375, "learning_rate": 9.17752133324191e-05, "loss": 0.044, "step": 2888 }, { "epoch": 0.37151185220112304, "grad_norm": 0.197265625, "learning_rate": 9.176965896348608e-05, "loss": 0.0479, "step": 2889 }, { "epoch": 0.37164044751168074, "grad_norm": 0.205078125, "learning_rate": 9.176410288789364e-05, "loss": 0.0623, "step": 2890 }, { "epoch": 0.37176904282223844, "grad_norm": 0.197265625, "learning_rate": 9.17585451058688e-05, "loss": 0.0491, "step": 2891 }, { "epoch": 0.3718976381327961, "grad_norm": 0.2109375, "learning_rate": 9.175298561763861e-05, "loss": 0.0533, "step": 2892 }, { "epoch": 0.3720262334433538, "grad_norm": 0.19140625, "learning_rate": 9.174742442343027e-05, "loss": 0.0506, "step": 2893 }, { "epoch": 0.3721548287539114, "grad_norm": 0.203125, "learning_rate": 9.174186152347095e-05, "loss": 0.0618, "step": 2894 }, { "epoch": 0.3722834240644691, "grad_norm": 0.201171875, "learning_rate": 9.173629691798799e-05, "loss": 0.0605, "step": 2895 }, { "epoch": 0.37241201937502677, "grad_norm": 0.19921875, "learning_rate": 9.173073060720874e-05, "loss": 0.0553, "step": 2896 }, { "epoch": 0.37254061468558447, "grad_norm": 0.1708984375, "learning_rate": 9.172516259136063e-05, "loss": 0.0412, "step": 2897 }, { "epoch": 0.3726692099961421, "grad_norm": 0.2138671875, "learning_rate": 9.171959287067116e-05, "loss": 0.0512, "step": 2898 }, { "epoch": 0.3727978053066998, "grad_norm": 0.1865234375, "learning_rate": 9.17140214453679e-05, "loss": 0.0478, "step": 2899 }, { "epoch": 0.3729264006172575, "grad_norm": 0.216796875, "learning_rate": 9.17084483156785e-05, "loss": 0.051, "step": 2900 }, { "epoch": 0.37305499592781516, "grad_norm": 0.201171875, "learning_rate": 9.170287348183066e-05, "loss": 0.0598, "step": 2901 }, { "epoch": 0.37318359123837286, "grad_norm": 0.208984375, "learning_rate": 9.169729694405218e-05, "loss": 0.0489, "step": 2902 }, { "epoch": 0.3733121865489305, "grad_norm": 0.169921875, "learning_rate": 9.16917187025709e-05, "loss": 0.0449, "step": 2903 }, { "epoch": 0.3734407818594882, "grad_norm": 0.1728515625, "learning_rate": 9.168613875761474e-05, "loss": 0.0517, "step": 2904 }, { "epoch": 0.37356937717004585, "grad_norm": 0.1923828125, "learning_rate": 9.168055710941169e-05, "loss": 0.0462, "step": 2905 }, { "epoch": 0.37369797248060355, "grad_norm": 0.1728515625, "learning_rate": 9.167497375818981e-05, "loss": 0.041, "step": 2906 }, { "epoch": 0.3738265677911612, "grad_norm": 0.1669921875, "learning_rate": 9.166938870417722e-05, "loss": 0.0447, "step": 2907 }, { "epoch": 0.3739551631017189, "grad_norm": 0.189453125, "learning_rate": 9.166380194760215e-05, "loss": 0.0531, "step": 2908 }, { "epoch": 0.3740837584122766, "grad_norm": 0.193359375, "learning_rate": 9.165821348869284e-05, "loss": 0.05, "step": 2909 }, { "epoch": 0.37421235372283423, "grad_norm": 0.201171875, "learning_rate": 9.165262332767761e-05, "loss": 0.0508, "step": 2910 }, { "epoch": 0.37434094903339193, "grad_norm": 0.1796875, "learning_rate": 9.164703146478492e-05, "loss": 0.0503, "step": 2911 }, { "epoch": 0.3744695443439496, "grad_norm": 0.2080078125, "learning_rate": 9.164143790024319e-05, "loss": 0.0503, "step": 2912 }, { "epoch": 0.3745981396545073, "grad_norm": 0.2353515625, "learning_rate": 9.163584263428102e-05, "loss": 0.059, "step": 2913 }, { "epoch": 0.3747267349650649, "grad_norm": 0.181640625, "learning_rate": 9.1630245667127e-05, "loss": 0.048, "step": 2914 }, { "epoch": 0.3748553302756226, "grad_norm": 0.19140625, "learning_rate": 9.162464699900982e-05, "loss": 0.0489, "step": 2915 }, { "epoch": 0.37498392558618027, "grad_norm": 0.1806640625, "learning_rate": 9.161904663015821e-05, "loss": 0.0507, "step": 2916 }, { "epoch": 0.37511252089673797, "grad_norm": 0.1845703125, "learning_rate": 9.161344456080105e-05, "loss": 0.0499, "step": 2917 }, { "epoch": 0.37524111620729567, "grad_norm": 0.1796875, "learning_rate": 9.160784079116718e-05, "loss": 0.0437, "step": 2918 }, { "epoch": 0.3753697115178533, "grad_norm": 0.2109375, "learning_rate": 9.160223532148557e-05, "loss": 0.058, "step": 2919 }, { "epoch": 0.375498306828411, "grad_norm": 0.1748046875, "learning_rate": 9.159662815198528e-05, "loss": 0.049, "step": 2920 }, { "epoch": 0.37562690213896865, "grad_norm": 0.16796875, "learning_rate": 9.159101928289541e-05, "loss": 0.0459, "step": 2921 }, { "epoch": 0.37575549744952635, "grad_norm": 0.18359375, "learning_rate": 9.158540871444513e-05, "loss": 0.0548, "step": 2922 }, { "epoch": 0.375884092760084, "grad_norm": 0.19921875, "learning_rate": 9.157979644686365e-05, "loss": 0.0446, "step": 2923 }, { "epoch": 0.3760126880706417, "grad_norm": 0.185546875, "learning_rate": 9.15741824803803e-05, "loss": 0.0461, "step": 2924 }, { "epoch": 0.37614128338119934, "grad_norm": 0.1845703125, "learning_rate": 9.156856681522448e-05, "loss": 0.0513, "step": 2925 }, { "epoch": 0.37626987869175704, "grad_norm": 0.16015625, "learning_rate": 9.15629494516256e-05, "loss": 0.0415, "step": 2926 }, { "epoch": 0.37639847400231474, "grad_norm": 0.1748046875, "learning_rate": 9.155733038981322e-05, "loss": 0.0426, "step": 2927 }, { "epoch": 0.3765270693128724, "grad_norm": 0.197265625, "learning_rate": 9.155170963001692e-05, "loss": 0.0555, "step": 2928 }, { "epoch": 0.3766556646234301, "grad_norm": 0.1953125, "learning_rate": 9.154608717246632e-05, "loss": 0.0617, "step": 2929 }, { "epoch": 0.37678425993398773, "grad_norm": 0.189453125, "learning_rate": 9.154046301739119e-05, "loss": 0.0469, "step": 2930 }, { "epoch": 0.37691285524454543, "grad_norm": 0.205078125, "learning_rate": 9.153483716502131e-05, "loss": 0.0489, "step": 2931 }, { "epoch": 0.3770414505551031, "grad_norm": 0.201171875, "learning_rate": 9.152920961558655e-05, "loss": 0.057, "step": 2932 }, { "epoch": 0.3771700458656608, "grad_norm": 0.177734375, "learning_rate": 9.152358036931683e-05, "loss": 0.0422, "step": 2933 }, { "epoch": 0.3772986411762184, "grad_norm": 0.1767578125, "learning_rate": 9.151794942644218e-05, "loss": 0.0491, "step": 2934 }, { "epoch": 0.3774272364867761, "grad_norm": 0.1826171875, "learning_rate": 9.151231678719266e-05, "loss": 0.0382, "step": 2935 }, { "epoch": 0.37755583179733376, "grad_norm": 0.193359375, "learning_rate": 9.15066824517984e-05, "loss": 0.0576, "step": 2936 }, { "epoch": 0.37768442710789146, "grad_norm": 0.18359375, "learning_rate": 9.150104642048963e-05, "loss": 0.0514, "step": 2937 }, { "epoch": 0.37781302241844916, "grad_norm": 0.1953125, "learning_rate": 9.149540869349664e-05, "loss": 0.0435, "step": 2938 }, { "epoch": 0.3779416177290068, "grad_norm": 0.181640625, "learning_rate": 9.148976927104975e-05, "loss": 0.0428, "step": 2939 }, { "epoch": 0.3780702130395645, "grad_norm": 0.2021484375, "learning_rate": 9.148412815337941e-05, "loss": 0.0594, "step": 2940 }, { "epoch": 0.37819880835012215, "grad_norm": 0.2158203125, "learning_rate": 9.14784853407161e-05, "loss": 0.0526, "step": 2941 }, { "epoch": 0.37832740366067985, "grad_norm": 0.173828125, "learning_rate": 9.147284083329036e-05, "loss": 0.0462, "step": 2942 }, { "epoch": 0.3784559989712375, "grad_norm": 0.203125, "learning_rate": 9.146719463133284e-05, "loss": 0.0544, "step": 2943 }, { "epoch": 0.3785845942817952, "grad_norm": 0.1943359375, "learning_rate": 9.146154673507426e-05, "loss": 0.0463, "step": 2944 }, { "epoch": 0.37871318959235284, "grad_norm": 0.1689453125, "learning_rate": 9.145589714474532e-05, "loss": 0.0416, "step": 2945 }, { "epoch": 0.37884178490291054, "grad_norm": 0.193359375, "learning_rate": 9.14502458605769e-05, "loss": 0.0577, "step": 2946 }, { "epoch": 0.37897038021346824, "grad_norm": 0.197265625, "learning_rate": 9.144459288279992e-05, "loss": 0.0443, "step": 2947 }, { "epoch": 0.3790989755240259, "grad_norm": 0.189453125, "learning_rate": 9.143893821164531e-05, "loss": 0.0505, "step": 2948 }, { "epoch": 0.3792275708345836, "grad_norm": 0.1845703125, "learning_rate": 9.143328184734415e-05, "loss": 0.0441, "step": 2949 }, { "epoch": 0.3793561661451412, "grad_norm": 0.232421875, "learning_rate": 9.142762379012753e-05, "loss": 0.0437, "step": 2950 }, { "epoch": 0.3794847614556989, "grad_norm": 0.224609375, "learning_rate": 9.142196404022665e-05, "loss": 0.0518, "step": 2951 }, { "epoch": 0.37961335676625657, "grad_norm": 0.1923828125, "learning_rate": 9.141630259787275e-05, "loss": 0.0503, "step": 2952 }, { "epoch": 0.37974195207681427, "grad_norm": 0.1953125, "learning_rate": 9.141063946329713e-05, "loss": 0.0558, "step": 2953 }, { "epoch": 0.3798705473873719, "grad_norm": 0.1787109375, "learning_rate": 9.140497463673122e-05, "loss": 0.0464, "step": 2954 }, { "epoch": 0.3799991426979296, "grad_norm": 0.1875, "learning_rate": 9.139930811840645e-05, "loss": 0.0474, "step": 2955 }, { "epoch": 0.3801277380084873, "grad_norm": 0.2060546875, "learning_rate": 9.139363990855437e-05, "loss": 0.0478, "step": 2956 }, { "epoch": 0.38025633331904496, "grad_norm": 0.216796875, "learning_rate": 9.138797000740655e-05, "loss": 0.0532, "step": 2957 }, { "epoch": 0.38038492862960266, "grad_norm": 0.1845703125, "learning_rate": 9.138229841519465e-05, "loss": 0.0481, "step": 2958 }, { "epoch": 0.3805135239401603, "grad_norm": 0.1953125, "learning_rate": 9.137662513215045e-05, "loss": 0.0514, "step": 2959 }, { "epoch": 0.380642119250718, "grad_norm": 0.203125, "learning_rate": 9.137095015850572e-05, "loss": 0.0516, "step": 2960 }, { "epoch": 0.38077071456127565, "grad_norm": 0.1865234375, "learning_rate": 9.136527349449233e-05, "loss": 0.0476, "step": 2961 }, { "epoch": 0.38089930987183335, "grad_norm": 0.2294921875, "learning_rate": 9.135959514034223e-05, "loss": 0.0569, "step": 2962 }, { "epoch": 0.381027905182391, "grad_norm": 0.19140625, "learning_rate": 9.135391509628743e-05, "loss": 0.0496, "step": 2963 }, { "epoch": 0.3811565004929487, "grad_norm": 0.18359375, "learning_rate": 9.134823336256002e-05, "loss": 0.0511, "step": 2964 }, { "epoch": 0.3812850958035064, "grad_norm": 0.169921875, "learning_rate": 9.134254993939213e-05, "loss": 0.0389, "step": 2965 }, { "epoch": 0.38141369111406404, "grad_norm": 0.169921875, "learning_rate": 9.1336864827016e-05, "loss": 0.0457, "step": 2966 }, { "epoch": 0.38154228642462173, "grad_norm": 0.1884765625, "learning_rate": 9.133117802566388e-05, "loss": 0.0501, "step": 2967 }, { "epoch": 0.3816708817351794, "grad_norm": 0.1767578125, "learning_rate": 9.13254895355682e-05, "loss": 0.0438, "step": 2968 }, { "epoch": 0.3817994770457371, "grad_norm": 0.21484375, "learning_rate": 9.131979935696129e-05, "loss": 0.0671, "step": 2969 }, { "epoch": 0.3819280723562947, "grad_norm": 0.2490234375, "learning_rate": 9.131410749007569e-05, "loss": 0.0756, "step": 2970 }, { "epoch": 0.3820566676668524, "grad_norm": 0.2138671875, "learning_rate": 9.1308413935144e-05, "loss": 0.0547, "step": 2971 }, { "epoch": 0.38218526297741007, "grad_norm": 0.17578125, "learning_rate": 9.13027186923988e-05, "loss": 0.0418, "step": 2972 }, { "epoch": 0.38231385828796777, "grad_norm": 0.201171875, "learning_rate": 9.129702176207279e-05, "loss": 0.0548, "step": 2973 }, { "epoch": 0.38244245359852547, "grad_norm": 0.19140625, "learning_rate": 9.129132314439876e-05, "loss": 0.0483, "step": 2974 }, { "epoch": 0.3825710489090831, "grad_norm": 0.1884765625, "learning_rate": 9.128562283960955e-05, "loss": 0.053, "step": 2975 }, { "epoch": 0.3826996442196408, "grad_norm": 0.2041015625, "learning_rate": 9.127992084793807e-05, "loss": 0.0531, "step": 2976 }, { "epoch": 0.38282823953019846, "grad_norm": 0.19140625, "learning_rate": 9.127421716961728e-05, "loss": 0.0591, "step": 2977 }, { "epoch": 0.38295683484075616, "grad_norm": 0.162109375, "learning_rate": 9.126851180488024e-05, "loss": 0.0431, "step": 2978 }, { "epoch": 0.3830854301513138, "grad_norm": 0.169921875, "learning_rate": 9.126280475396005e-05, "loss": 0.0411, "step": 2979 }, { "epoch": 0.3832140254618715, "grad_norm": 0.2021484375, "learning_rate": 9.125709601708992e-05, "loss": 0.0578, "step": 2980 }, { "epoch": 0.38334262077242914, "grad_norm": 0.267578125, "learning_rate": 9.125138559450308e-05, "loss": 0.0546, "step": 2981 }, { "epoch": 0.38347121608298684, "grad_norm": 0.2080078125, "learning_rate": 9.124567348643285e-05, "loss": 0.0556, "step": 2982 }, { "epoch": 0.38359981139354454, "grad_norm": 0.1962890625, "learning_rate": 9.123995969311263e-05, "loss": 0.052, "step": 2983 }, { "epoch": 0.3837284067041022, "grad_norm": 0.1904296875, "learning_rate": 9.123424421477587e-05, "loss": 0.051, "step": 2984 }, { "epoch": 0.3838570020146599, "grad_norm": 0.203125, "learning_rate": 9.122852705165612e-05, "loss": 0.0552, "step": 2985 }, { "epoch": 0.38398559732521753, "grad_norm": 0.2001953125, "learning_rate": 9.122280820398695e-05, "loss": 0.0567, "step": 2986 }, { "epoch": 0.38411419263577523, "grad_norm": 0.2001953125, "learning_rate": 9.121708767200203e-05, "loss": 0.0613, "step": 2987 }, { "epoch": 0.3842427879463329, "grad_norm": 0.1630859375, "learning_rate": 9.121136545593509e-05, "loss": 0.0439, "step": 2988 }, { "epoch": 0.3843713832568906, "grad_norm": 0.2041015625, "learning_rate": 9.120564155601996e-05, "loss": 0.0608, "step": 2989 }, { "epoch": 0.3844999785674482, "grad_norm": 0.1689453125, "learning_rate": 9.119991597249049e-05, "loss": 0.0351, "step": 2990 }, { "epoch": 0.3846285738780059, "grad_norm": 0.2060546875, "learning_rate": 9.119418870558063e-05, "loss": 0.0569, "step": 2991 }, { "epoch": 0.38475716918856356, "grad_norm": 0.2734375, "learning_rate": 9.118845975552438e-05, "loss": 0.0511, "step": 2992 }, { "epoch": 0.38488576449912126, "grad_norm": 0.2021484375, "learning_rate": 9.118272912255584e-05, "loss": 0.0568, "step": 2993 }, { "epoch": 0.38501435980967896, "grad_norm": 0.2431640625, "learning_rate": 9.11769968069091e-05, "loss": 0.0465, "step": 2994 }, { "epoch": 0.3851429551202366, "grad_norm": 0.205078125, "learning_rate": 9.117126280881845e-05, "loss": 0.0584, "step": 2995 }, { "epoch": 0.3852715504307943, "grad_norm": 0.1611328125, "learning_rate": 9.116552712851811e-05, "loss": 0.0433, "step": 2996 }, { "epoch": 0.38540014574135195, "grad_norm": 0.228515625, "learning_rate": 9.115978976624249e-05, "loss": 0.0579, "step": 2997 }, { "epoch": 0.38552874105190965, "grad_norm": 0.1845703125, "learning_rate": 9.115405072222598e-05, "loss": 0.0516, "step": 2998 }, { "epoch": 0.3856573363624673, "grad_norm": 0.185546875, "learning_rate": 9.114830999670307e-05, "loss": 0.0528, "step": 2999 }, { "epoch": 0.385785931673025, "grad_norm": 0.2021484375, "learning_rate": 9.114256758990833e-05, "loss": 0.0595, "step": 3000 }, { "epoch": 0.385785931673025, "eval_loss": 0.04979272559285164, "eval_runtime": 1043.9942, "eval_samples_per_second": 94.087, "eval_steps_per_second": 1.176, "step": 3000 }, { "epoch": 0.38591452698358264, "grad_norm": 0.21484375, "learning_rate": 9.113682350207637e-05, "loss": 0.0488, "step": 3001 }, { "epoch": 0.38604312229414034, "grad_norm": 0.20703125, "learning_rate": 9.113107773344192e-05, "loss": 0.0539, "step": 3002 }, { "epoch": 0.38617171760469804, "grad_norm": 0.2099609375, "learning_rate": 9.11253302842397e-05, "loss": 0.05, "step": 3003 }, { "epoch": 0.3863003129152557, "grad_norm": 0.1865234375, "learning_rate": 9.111958115470461e-05, "loss": 0.0439, "step": 3004 }, { "epoch": 0.3864289082258134, "grad_norm": 0.1943359375, "learning_rate": 9.111383034507148e-05, "loss": 0.0518, "step": 3005 }, { "epoch": 0.386557503536371, "grad_norm": 0.19140625, "learning_rate": 9.110807785557531e-05, "loss": 0.0478, "step": 3006 }, { "epoch": 0.3866860988469287, "grad_norm": 0.203125, "learning_rate": 9.110232368645115e-05, "loss": 0.057, "step": 3007 }, { "epoch": 0.38681469415748637, "grad_norm": 0.2060546875, "learning_rate": 9.10965678379341e-05, "loss": 0.0617, "step": 3008 }, { "epoch": 0.38694328946804407, "grad_norm": 0.166015625, "learning_rate": 9.109081031025935e-05, "loss": 0.0387, "step": 3009 }, { "epoch": 0.3870718847786017, "grad_norm": 0.1953125, "learning_rate": 9.108505110366213e-05, "loss": 0.0465, "step": 3010 }, { "epoch": 0.3872004800891594, "grad_norm": 0.1845703125, "learning_rate": 9.107929021837776e-05, "loss": 0.0527, "step": 3011 }, { "epoch": 0.3873290753997171, "grad_norm": 0.19140625, "learning_rate": 9.10735276546416e-05, "loss": 0.0591, "step": 3012 }, { "epoch": 0.38745767071027476, "grad_norm": 0.2021484375, "learning_rate": 9.106776341268917e-05, "loss": 0.052, "step": 3013 }, { "epoch": 0.38758626602083246, "grad_norm": 0.1923828125, "learning_rate": 9.10619974927559e-05, "loss": 0.0495, "step": 3014 }, { "epoch": 0.3877148613313901, "grad_norm": 0.220703125, "learning_rate": 9.105622989507745e-05, "loss": 0.0656, "step": 3015 }, { "epoch": 0.3878434566419478, "grad_norm": 0.193359375, "learning_rate": 9.105046061988944e-05, "loss": 0.0563, "step": 3016 }, { "epoch": 0.38797205195250545, "grad_norm": 0.166015625, "learning_rate": 9.104468966742761e-05, "loss": 0.0395, "step": 3017 }, { "epoch": 0.38810064726306315, "grad_norm": 0.19921875, "learning_rate": 9.103891703792775e-05, "loss": 0.0505, "step": 3018 }, { "epoch": 0.3882292425736208, "grad_norm": 0.193359375, "learning_rate": 9.103314273162571e-05, "loss": 0.0509, "step": 3019 }, { "epoch": 0.3883578378841785, "grad_norm": 0.1708984375, "learning_rate": 9.102736674875744e-05, "loss": 0.0456, "step": 3020 }, { "epoch": 0.3884864331947362, "grad_norm": 0.193359375, "learning_rate": 9.102158908955896e-05, "loss": 0.0522, "step": 3021 }, { "epoch": 0.38861502850529384, "grad_norm": 0.2412109375, "learning_rate": 9.10158097542663e-05, "loss": 0.0504, "step": 3022 }, { "epoch": 0.38874362381585154, "grad_norm": 0.1923828125, "learning_rate": 9.101002874311559e-05, "loss": 0.0587, "step": 3023 }, { "epoch": 0.3888722191264092, "grad_norm": 0.1982421875, "learning_rate": 9.100424605634306e-05, "loss": 0.0542, "step": 3024 }, { "epoch": 0.3890008144369669, "grad_norm": 0.1923828125, "learning_rate": 9.0998461694185e-05, "loss": 0.0534, "step": 3025 }, { "epoch": 0.3891294097475245, "grad_norm": 0.1875, "learning_rate": 9.099267565687771e-05, "loss": 0.0563, "step": 3026 }, { "epoch": 0.3892580050580822, "grad_norm": 0.208984375, "learning_rate": 9.098688794465763e-05, "loss": 0.057, "step": 3027 }, { "epoch": 0.38938660036863987, "grad_norm": 0.1728515625, "learning_rate": 9.098109855776125e-05, "loss": 0.0515, "step": 3028 }, { "epoch": 0.38951519567919757, "grad_norm": 0.1865234375, "learning_rate": 9.097530749642507e-05, "loss": 0.0474, "step": 3029 }, { "epoch": 0.38964379098975527, "grad_norm": 0.189453125, "learning_rate": 9.096951476088575e-05, "loss": 0.0524, "step": 3030 }, { "epoch": 0.3897723863003129, "grad_norm": 0.1826171875, "learning_rate": 9.096372035137995e-05, "loss": 0.0506, "step": 3031 }, { "epoch": 0.3899009816108706, "grad_norm": 0.185546875, "learning_rate": 9.095792426814445e-05, "loss": 0.0555, "step": 3032 }, { "epoch": 0.39002957692142826, "grad_norm": 0.1513671875, "learning_rate": 9.095212651141603e-05, "loss": 0.0379, "step": 3033 }, { "epoch": 0.39015817223198596, "grad_norm": 0.2041015625, "learning_rate": 9.094632708143162e-05, "loss": 0.0528, "step": 3034 }, { "epoch": 0.3902867675425436, "grad_norm": 0.1884765625, "learning_rate": 9.094052597842815e-05, "loss": 0.0526, "step": 3035 }, { "epoch": 0.3904153628531013, "grad_norm": 0.1650390625, "learning_rate": 9.093472320264266e-05, "loss": 0.0422, "step": 3036 }, { "epoch": 0.39054395816365894, "grad_norm": 0.154296875, "learning_rate": 9.092891875431225e-05, "loss": 0.0337, "step": 3037 }, { "epoch": 0.39067255347421664, "grad_norm": 0.2353515625, "learning_rate": 9.092311263367406e-05, "loss": 0.051, "step": 3038 }, { "epoch": 0.3908011487847743, "grad_norm": 0.203125, "learning_rate": 9.091730484096535e-05, "loss": 0.0538, "step": 3039 }, { "epoch": 0.390929744095332, "grad_norm": 0.16796875, "learning_rate": 9.091149537642339e-05, "loss": 0.0436, "step": 3040 }, { "epoch": 0.3910583394058897, "grad_norm": 0.2001953125, "learning_rate": 9.090568424028558e-05, "loss": 0.0621, "step": 3041 }, { "epoch": 0.39118693471644733, "grad_norm": 0.1728515625, "learning_rate": 9.089987143278933e-05, "loss": 0.0364, "step": 3042 }, { "epoch": 0.39131553002700503, "grad_norm": 0.18359375, "learning_rate": 9.089405695417216e-05, "loss": 0.0498, "step": 3043 }, { "epoch": 0.3914441253375627, "grad_norm": 0.1875, "learning_rate": 9.088824080467163e-05, "loss": 0.053, "step": 3044 }, { "epoch": 0.3915727206481204, "grad_norm": 0.208984375, "learning_rate": 9.088242298452539e-05, "loss": 0.0565, "step": 3045 }, { "epoch": 0.391701315958678, "grad_norm": 0.1767578125, "learning_rate": 9.087660349397116e-05, "loss": 0.0484, "step": 3046 }, { "epoch": 0.3918299112692357, "grad_norm": 0.1884765625, "learning_rate": 9.087078233324669e-05, "loss": 0.0461, "step": 3047 }, { "epoch": 0.39195850657979336, "grad_norm": 0.1806640625, "learning_rate": 9.086495950258984e-05, "loss": 0.0456, "step": 3048 }, { "epoch": 0.39208710189035106, "grad_norm": 0.1748046875, "learning_rate": 9.085913500223853e-05, "loss": 0.0512, "step": 3049 }, { "epoch": 0.39221569720090876, "grad_norm": 0.16015625, "learning_rate": 9.085330883243074e-05, "loss": 0.0435, "step": 3050 }, { "epoch": 0.3923442925114664, "grad_norm": 0.1962890625, "learning_rate": 9.084748099340452e-05, "loss": 0.0537, "step": 3051 }, { "epoch": 0.3924728878220241, "grad_norm": 0.1904296875, "learning_rate": 9.084165148539798e-05, "loss": 0.0533, "step": 3052 }, { "epoch": 0.39260148313258175, "grad_norm": 0.1904296875, "learning_rate": 9.083582030864932e-05, "loss": 0.0523, "step": 3053 }, { "epoch": 0.39273007844313945, "grad_norm": 0.1826171875, "learning_rate": 9.082998746339677e-05, "loss": 0.0497, "step": 3054 }, { "epoch": 0.3928586737536971, "grad_norm": 0.193359375, "learning_rate": 9.08241529498787e-05, "loss": 0.0481, "step": 3055 }, { "epoch": 0.3929872690642548, "grad_norm": 0.1845703125, "learning_rate": 9.081831676833344e-05, "loss": 0.0445, "step": 3056 }, { "epoch": 0.39311586437481244, "grad_norm": 0.17578125, "learning_rate": 9.08124789189995e-05, "loss": 0.0426, "step": 3057 }, { "epoch": 0.39324445968537014, "grad_norm": 0.1865234375, "learning_rate": 9.080663940211539e-05, "loss": 0.0514, "step": 3058 }, { "epoch": 0.39337305499592784, "grad_norm": 0.193359375, "learning_rate": 9.080079821791971e-05, "loss": 0.0509, "step": 3059 }, { "epoch": 0.3935016503064855, "grad_norm": 0.1708984375, "learning_rate": 9.079495536665112e-05, "loss": 0.045, "step": 3060 }, { "epoch": 0.3936302456170432, "grad_norm": 0.166015625, "learning_rate": 9.078911084854833e-05, "loss": 0.0444, "step": 3061 }, { "epoch": 0.39375884092760083, "grad_norm": 0.2197265625, "learning_rate": 9.078326466385019e-05, "loss": 0.0622, "step": 3062 }, { "epoch": 0.39388743623815853, "grad_norm": 0.1689453125, "learning_rate": 9.077741681279553e-05, "loss": 0.0389, "step": 3063 }, { "epoch": 0.3940160315487162, "grad_norm": 0.1923828125, "learning_rate": 9.07715672956233e-05, "loss": 0.0515, "step": 3064 }, { "epoch": 0.39414462685927387, "grad_norm": 0.2216796875, "learning_rate": 9.07657161125725e-05, "loss": 0.0754, "step": 3065 }, { "epoch": 0.3942732221698315, "grad_norm": 0.181640625, "learning_rate": 9.075986326388219e-05, "loss": 0.0522, "step": 3066 }, { "epoch": 0.3944018174803892, "grad_norm": 0.185546875, "learning_rate": 9.075400874979155e-05, "loss": 0.0516, "step": 3067 }, { "epoch": 0.3945304127909469, "grad_norm": 0.220703125, "learning_rate": 9.074815257053974e-05, "loss": 0.0522, "step": 3068 }, { "epoch": 0.39465900810150456, "grad_norm": 0.1884765625, "learning_rate": 9.074229472636607e-05, "loss": 0.0458, "step": 3069 }, { "epoch": 0.39478760341206226, "grad_norm": 0.171875, "learning_rate": 9.073643521750988e-05, "loss": 0.0432, "step": 3070 }, { "epoch": 0.3949161987226199, "grad_norm": 0.1796875, "learning_rate": 9.073057404421056e-05, "loss": 0.0466, "step": 3071 }, { "epoch": 0.3950447940331776, "grad_norm": 0.169921875, "learning_rate": 9.072471120670763e-05, "loss": 0.0433, "step": 3072 }, { "epoch": 0.39517338934373525, "grad_norm": 0.1767578125, "learning_rate": 9.07188467052406e-05, "loss": 0.0403, "step": 3073 }, { "epoch": 0.39530198465429295, "grad_norm": 0.1923828125, "learning_rate": 9.071298054004911e-05, "loss": 0.0437, "step": 3074 }, { "epoch": 0.3954305799648506, "grad_norm": 0.1669921875, "learning_rate": 9.070711271137283e-05, "loss": 0.0386, "step": 3075 }, { "epoch": 0.3955591752754083, "grad_norm": 0.2158203125, "learning_rate": 9.070124321945152e-05, "loss": 0.0477, "step": 3076 }, { "epoch": 0.395687770585966, "grad_norm": 0.216796875, "learning_rate": 9.069537206452501e-05, "loss": 0.0444, "step": 3077 }, { "epoch": 0.39581636589652364, "grad_norm": 0.1748046875, "learning_rate": 9.068949924683316e-05, "loss": 0.0477, "step": 3078 }, { "epoch": 0.39594496120708134, "grad_norm": 0.2001953125, "learning_rate": 9.068362476661596e-05, "loss": 0.0588, "step": 3079 }, { "epoch": 0.396073556517639, "grad_norm": 0.1953125, "learning_rate": 9.067774862411342e-05, "loss": 0.0443, "step": 3080 }, { "epoch": 0.3962021518281967, "grad_norm": 0.1923828125, "learning_rate": 9.067187081956564e-05, "loss": 0.0527, "step": 3081 }, { "epoch": 0.3963307471387543, "grad_norm": 0.17578125, "learning_rate": 9.066599135321276e-05, "loss": 0.0415, "step": 3082 }, { "epoch": 0.396459342449312, "grad_norm": 0.185546875, "learning_rate": 9.066011022529504e-05, "loss": 0.0507, "step": 3083 }, { "epoch": 0.39658793775986967, "grad_norm": 0.1982421875, "learning_rate": 9.065422743605273e-05, "loss": 0.0563, "step": 3084 }, { "epoch": 0.39671653307042737, "grad_norm": 0.2333984375, "learning_rate": 9.064834298572624e-05, "loss": 0.0527, "step": 3085 }, { "epoch": 0.396845128380985, "grad_norm": 0.2080078125, "learning_rate": 9.064245687455597e-05, "loss": 0.0504, "step": 3086 }, { "epoch": 0.3969737236915427, "grad_norm": 0.2021484375, "learning_rate": 9.063656910278244e-05, "loss": 0.0538, "step": 3087 }, { "epoch": 0.3971023190021004, "grad_norm": 0.201171875, "learning_rate": 9.06306796706462e-05, "loss": 0.0548, "step": 3088 }, { "epoch": 0.39723091431265806, "grad_norm": 0.2001953125, "learning_rate": 9.062478857838791e-05, "loss": 0.0471, "step": 3089 }, { "epoch": 0.39735950962321576, "grad_norm": 0.21484375, "learning_rate": 9.061889582624826e-05, "loss": 0.063, "step": 3090 }, { "epoch": 0.3974881049337734, "grad_norm": 0.1787109375, "learning_rate": 9.0613001414468e-05, "loss": 0.0485, "step": 3091 }, { "epoch": 0.3976167002443311, "grad_norm": 0.2080078125, "learning_rate": 9.0607105343288e-05, "loss": 0.0555, "step": 3092 }, { "epoch": 0.39774529555488874, "grad_norm": 0.20703125, "learning_rate": 9.060120761294914e-05, "loss": 0.0512, "step": 3093 }, { "epoch": 0.39787389086544644, "grad_norm": 0.1904296875, "learning_rate": 9.059530822369243e-05, "loss": 0.0509, "step": 3094 }, { "epoch": 0.3980024861760041, "grad_norm": 0.181640625, "learning_rate": 9.058940717575887e-05, "loss": 0.0466, "step": 3095 }, { "epoch": 0.3981310814865618, "grad_norm": 0.205078125, "learning_rate": 9.058350446938961e-05, "loss": 0.0502, "step": 3096 }, { "epoch": 0.3982596767971195, "grad_norm": 0.1767578125, "learning_rate": 9.057760010482581e-05, "loss": 0.0462, "step": 3097 }, { "epoch": 0.39838827210767713, "grad_norm": 0.177734375, "learning_rate": 9.05716940823087e-05, "loss": 0.0493, "step": 3098 }, { "epoch": 0.39851686741823483, "grad_norm": 0.1884765625, "learning_rate": 9.056578640207962e-05, "loss": 0.048, "step": 3099 }, { "epoch": 0.3986454627287925, "grad_norm": 0.19140625, "learning_rate": 9.055987706437992e-05, "loss": 0.0489, "step": 3100 }, { "epoch": 0.3987740580393502, "grad_norm": 0.193359375, "learning_rate": 9.055396606945108e-05, "loss": 0.0495, "step": 3101 }, { "epoch": 0.3989026533499078, "grad_norm": 0.201171875, "learning_rate": 9.054805341753458e-05, "loss": 0.0489, "step": 3102 }, { "epoch": 0.3990312486604655, "grad_norm": 0.1806640625, "learning_rate": 9.054213910887206e-05, "loss": 0.0481, "step": 3103 }, { "epoch": 0.39915984397102316, "grad_norm": 0.18359375, "learning_rate": 9.053622314370512e-05, "loss": 0.0469, "step": 3104 }, { "epoch": 0.39928843928158086, "grad_norm": 0.203125, "learning_rate": 9.05303055222755e-05, "loss": 0.048, "step": 3105 }, { "epoch": 0.39941703459213856, "grad_norm": 0.150390625, "learning_rate": 9.0524386244825e-05, "loss": 0.0356, "step": 3106 }, { "epoch": 0.3995456299026962, "grad_norm": 0.193359375, "learning_rate": 9.051846531159545e-05, "loss": 0.0573, "step": 3107 }, { "epoch": 0.3996742252132539, "grad_norm": 0.17578125, "learning_rate": 9.051254272282876e-05, "loss": 0.0481, "step": 3108 }, { "epoch": 0.39980282052381155, "grad_norm": 0.2177734375, "learning_rate": 9.050661847876698e-05, "loss": 0.0524, "step": 3109 }, { "epoch": 0.39993141583436925, "grad_norm": 0.2021484375, "learning_rate": 9.050069257965211e-05, "loss": 0.0508, "step": 3110 }, { "epoch": 0.4000600111449269, "grad_norm": 0.1865234375, "learning_rate": 9.049476502572629e-05, "loss": 0.0584, "step": 3111 }, { "epoch": 0.4001886064554846, "grad_norm": 0.1806640625, "learning_rate": 9.048883581723173e-05, "loss": 0.0459, "step": 3112 }, { "epoch": 0.40031720176604224, "grad_norm": 0.189453125, "learning_rate": 9.048290495441068e-05, "loss": 0.0493, "step": 3113 }, { "epoch": 0.40044579707659994, "grad_norm": 0.181640625, "learning_rate": 9.047697243750546e-05, "loss": 0.0415, "step": 3114 }, { "epoch": 0.40057439238715764, "grad_norm": 0.185546875, "learning_rate": 9.047103826675846e-05, "loss": 0.0533, "step": 3115 }, { "epoch": 0.4007029876977153, "grad_norm": 0.1953125, "learning_rate": 9.046510244241219e-05, "loss": 0.0439, "step": 3116 }, { "epoch": 0.400831583008273, "grad_norm": 0.171875, "learning_rate": 9.045916496470912e-05, "loss": 0.0442, "step": 3117 }, { "epoch": 0.40096017831883063, "grad_norm": 0.2080078125, "learning_rate": 9.045322583389188e-05, "loss": 0.0467, "step": 3118 }, { "epoch": 0.40108877362938833, "grad_norm": 0.220703125, "learning_rate": 9.044728505020315e-05, "loss": 0.0572, "step": 3119 }, { "epoch": 0.401217368939946, "grad_norm": 0.1943359375, "learning_rate": 9.044134261388561e-05, "loss": 0.0507, "step": 3120 }, { "epoch": 0.4013459642505037, "grad_norm": 0.19140625, "learning_rate": 9.043539852518213e-05, "loss": 0.0544, "step": 3121 }, { "epoch": 0.4014745595610613, "grad_norm": 0.1787109375, "learning_rate": 9.042945278433552e-05, "loss": 0.0477, "step": 3122 }, { "epoch": 0.401603154871619, "grad_norm": 0.2060546875, "learning_rate": 9.042350539158875e-05, "loss": 0.0524, "step": 3123 }, { "epoch": 0.4017317501821767, "grad_norm": 0.1826171875, "learning_rate": 9.04175563471848e-05, "loss": 0.0456, "step": 3124 }, { "epoch": 0.40186034549273436, "grad_norm": 0.21875, "learning_rate": 9.041160565136679e-05, "loss": 0.0658, "step": 3125 }, { "epoch": 0.40198894080329206, "grad_norm": 0.1826171875, "learning_rate": 9.040565330437779e-05, "loss": 0.0478, "step": 3126 }, { "epoch": 0.4021175361138497, "grad_norm": 0.18359375, "learning_rate": 9.039969930646105e-05, "loss": 0.0491, "step": 3127 }, { "epoch": 0.4022461314244074, "grad_norm": 0.1650390625, "learning_rate": 9.039374365785983e-05, "loss": 0.0467, "step": 3128 }, { "epoch": 0.40237472673496505, "grad_norm": 0.1650390625, "learning_rate": 9.038778635881749e-05, "loss": 0.0426, "step": 3129 }, { "epoch": 0.40250332204552275, "grad_norm": 0.23828125, "learning_rate": 9.038182740957739e-05, "loss": 0.0528, "step": 3130 }, { "epoch": 0.4026319173560804, "grad_norm": 0.18359375, "learning_rate": 9.037586681038306e-05, "loss": 0.0498, "step": 3131 }, { "epoch": 0.4027605126666381, "grad_norm": 0.2060546875, "learning_rate": 9.0369904561478e-05, "loss": 0.0574, "step": 3132 }, { "epoch": 0.40288910797719574, "grad_norm": 0.1767578125, "learning_rate": 9.036394066310585e-05, "loss": 0.0502, "step": 3133 }, { "epoch": 0.40301770328775344, "grad_norm": 0.1689453125, "learning_rate": 9.03579751155103e-05, "loss": 0.0447, "step": 3134 }, { "epoch": 0.40314629859831114, "grad_norm": 0.177734375, "learning_rate": 9.035200791893506e-05, "loss": 0.0428, "step": 3135 }, { "epoch": 0.4032748939088688, "grad_norm": 0.20703125, "learning_rate": 9.034603907362395e-05, "loss": 0.0524, "step": 3136 }, { "epoch": 0.4034034892194265, "grad_norm": 0.173828125, "learning_rate": 9.034006857982086e-05, "loss": 0.0443, "step": 3137 }, { "epoch": 0.4035320845299841, "grad_norm": 0.1865234375, "learning_rate": 9.033409643776975e-05, "loss": 0.0564, "step": 3138 }, { "epoch": 0.4036606798405418, "grad_norm": 0.1845703125, "learning_rate": 9.03281226477146e-05, "loss": 0.0514, "step": 3139 }, { "epoch": 0.40378927515109947, "grad_norm": 0.1806640625, "learning_rate": 9.032214720989951e-05, "loss": 0.0415, "step": 3140 }, { "epoch": 0.40391787046165717, "grad_norm": 0.1787109375, "learning_rate": 9.031617012456864e-05, "loss": 0.0469, "step": 3141 }, { "epoch": 0.4040464657722148, "grad_norm": 0.203125, "learning_rate": 9.03101913919662e-05, "loss": 0.0515, "step": 3142 }, { "epoch": 0.4041750610827725, "grad_norm": 0.19140625, "learning_rate": 9.030421101233646e-05, "loss": 0.0497, "step": 3143 }, { "epoch": 0.4043036563933302, "grad_norm": 0.16796875, "learning_rate": 9.029822898592379e-05, "loss": 0.0434, "step": 3144 }, { "epoch": 0.40443225170388786, "grad_norm": 0.2158203125, "learning_rate": 9.02922453129726e-05, "loss": 0.0633, "step": 3145 }, { "epoch": 0.40456084701444556, "grad_norm": 0.1708984375, "learning_rate": 9.028625999372738e-05, "loss": 0.0393, "step": 3146 }, { "epoch": 0.4046894423250032, "grad_norm": 0.2060546875, "learning_rate": 9.028027302843268e-05, "loss": 0.0537, "step": 3147 }, { "epoch": 0.4048180376355609, "grad_norm": 0.193359375, "learning_rate": 9.027428441733309e-05, "loss": 0.0533, "step": 3148 }, { "epoch": 0.40494663294611855, "grad_norm": 0.19140625, "learning_rate": 9.026829416067337e-05, "loss": 0.0394, "step": 3149 }, { "epoch": 0.40507522825667625, "grad_norm": 0.17578125, "learning_rate": 9.026230225869821e-05, "loss": 0.0479, "step": 3150 }, { "epoch": 0.4052038235672339, "grad_norm": 0.2099609375, "learning_rate": 9.025630871165247e-05, "loss": 0.0529, "step": 3151 }, { "epoch": 0.4053324188777916, "grad_norm": 0.208984375, "learning_rate": 9.025031351978102e-05, "loss": 0.0513, "step": 3152 }, { "epoch": 0.4054610141883493, "grad_norm": 0.1748046875, "learning_rate": 9.024431668332882e-05, "loss": 0.0514, "step": 3153 }, { "epoch": 0.40558960949890693, "grad_norm": 0.201171875, "learning_rate": 9.02383182025409e-05, "loss": 0.0474, "step": 3154 }, { "epoch": 0.40571820480946463, "grad_norm": 0.2001953125, "learning_rate": 9.023231807766233e-05, "loss": 0.0524, "step": 3155 }, { "epoch": 0.4058468001200223, "grad_norm": 0.2060546875, "learning_rate": 9.02263163089383e-05, "loss": 0.0513, "step": 3156 }, { "epoch": 0.40597539543058, "grad_norm": 0.1552734375, "learning_rate": 9.022031289661401e-05, "loss": 0.0376, "step": 3157 }, { "epoch": 0.4061039907411376, "grad_norm": 0.1953125, "learning_rate": 9.021430784093476e-05, "loss": 0.0524, "step": 3158 }, { "epoch": 0.4062325860516953, "grad_norm": 0.1806640625, "learning_rate": 9.020830114214592e-05, "loss": 0.0531, "step": 3159 }, { "epoch": 0.40636118136225297, "grad_norm": 0.1884765625, "learning_rate": 9.02022928004929e-05, "loss": 0.0457, "step": 3160 }, { "epoch": 0.40648977667281067, "grad_norm": 0.177734375, "learning_rate": 9.01962828162212e-05, "loss": 0.0479, "step": 3161 }, { "epoch": 0.40661837198336837, "grad_norm": 0.19921875, "learning_rate": 9.019027118957639e-05, "loss": 0.0554, "step": 3162 }, { "epoch": 0.406746967293926, "grad_norm": 0.2119140625, "learning_rate": 9.018425792080409e-05, "loss": 0.0546, "step": 3163 }, { "epoch": 0.4068755626044837, "grad_norm": 0.2041015625, "learning_rate": 9.017824301014998e-05, "loss": 0.0493, "step": 3164 }, { "epoch": 0.40700415791504135, "grad_norm": 0.189453125, "learning_rate": 9.017222645785986e-05, "loss": 0.0498, "step": 3165 }, { "epoch": 0.40713275322559905, "grad_norm": 0.177734375, "learning_rate": 9.016620826417952e-05, "loss": 0.0417, "step": 3166 }, { "epoch": 0.4072613485361567, "grad_norm": 0.1943359375, "learning_rate": 9.016018842935489e-05, "loss": 0.0491, "step": 3167 }, { "epoch": 0.4073899438467144, "grad_norm": 0.1875, "learning_rate": 9.01541669536319e-05, "loss": 0.0447, "step": 3168 }, { "epoch": 0.40751853915727204, "grad_norm": 0.1806640625, "learning_rate": 9.01481438372566e-05, "loss": 0.0517, "step": 3169 }, { "epoch": 0.40764713446782974, "grad_norm": 0.203125, "learning_rate": 9.01421190804751e-05, "loss": 0.0518, "step": 3170 }, { "epoch": 0.40777572977838744, "grad_norm": 0.19921875, "learning_rate": 9.013609268353353e-05, "loss": 0.0529, "step": 3171 }, { "epoch": 0.4079043250889451, "grad_norm": 0.1865234375, "learning_rate": 9.013006464667815e-05, "loss": 0.0538, "step": 3172 }, { "epoch": 0.4080329203995028, "grad_norm": 0.2021484375, "learning_rate": 9.012403497015525e-05, "loss": 0.0461, "step": 3173 }, { "epoch": 0.40816151571006043, "grad_norm": 0.1826171875, "learning_rate": 9.011800365421119e-05, "loss": 0.0499, "step": 3174 }, { "epoch": 0.40829011102061813, "grad_norm": 0.193359375, "learning_rate": 9.011197069909241e-05, "loss": 0.05, "step": 3175 }, { "epoch": 0.4084187063311758, "grad_norm": 0.17578125, "learning_rate": 9.010593610504541e-05, "loss": 0.0428, "step": 3176 }, { "epoch": 0.4085473016417335, "grad_norm": 0.1708984375, "learning_rate": 9.009989987231676e-05, "loss": 0.0456, "step": 3177 }, { "epoch": 0.4086758969522911, "grad_norm": 0.173828125, "learning_rate": 9.009386200115308e-05, "loss": 0.0431, "step": 3178 }, { "epoch": 0.4088044922628488, "grad_norm": 0.197265625, "learning_rate": 9.008782249180107e-05, "loss": 0.0476, "step": 3179 }, { "epoch": 0.4089330875734065, "grad_norm": 0.1787109375, "learning_rate": 9.008178134450751e-05, "loss": 0.0479, "step": 3180 }, { "epoch": 0.40906168288396416, "grad_norm": 0.1982421875, "learning_rate": 9.007573855951923e-05, "loss": 0.0538, "step": 3181 }, { "epoch": 0.40919027819452186, "grad_norm": 0.1767578125, "learning_rate": 9.006969413708313e-05, "loss": 0.0463, "step": 3182 }, { "epoch": 0.4093188735050795, "grad_norm": 0.2236328125, "learning_rate": 9.006364807744618e-05, "loss": 0.0477, "step": 3183 }, { "epoch": 0.4094474688156372, "grad_norm": 0.1982421875, "learning_rate": 9.00576003808554e-05, "loss": 0.0461, "step": 3184 }, { "epoch": 0.40957606412619485, "grad_norm": 0.1650390625, "learning_rate": 9.005155104755792e-05, "loss": 0.0387, "step": 3185 }, { "epoch": 0.40970465943675255, "grad_norm": 0.181640625, "learning_rate": 9.004550007780089e-05, "loss": 0.049, "step": 3186 }, { "epoch": 0.4098332547473102, "grad_norm": 0.1826171875, "learning_rate": 9.003944747183156e-05, "loss": 0.0426, "step": 3187 }, { "epoch": 0.4099618500578679, "grad_norm": 0.1884765625, "learning_rate": 9.00333932298972e-05, "loss": 0.0484, "step": 3188 }, { "epoch": 0.41009044536842554, "grad_norm": 0.2021484375, "learning_rate": 9.00273373522452e-05, "loss": 0.0538, "step": 3189 }, { "epoch": 0.41021904067898324, "grad_norm": 0.1953125, "learning_rate": 9.002127983912301e-05, "loss": 0.0589, "step": 3190 }, { "epoch": 0.41034763598954094, "grad_norm": 0.208984375, "learning_rate": 9.001522069077812e-05, "loss": 0.0573, "step": 3191 }, { "epoch": 0.4104762313000986, "grad_norm": 0.2109375, "learning_rate": 9.000915990745808e-05, "loss": 0.0556, "step": 3192 }, { "epoch": 0.4106048266106563, "grad_norm": 0.1708984375, "learning_rate": 9.000309748941057e-05, "loss": 0.0389, "step": 3193 }, { "epoch": 0.4107334219212139, "grad_norm": 0.16796875, "learning_rate": 8.999703343688327e-05, "loss": 0.046, "step": 3194 }, { "epoch": 0.4108620172317716, "grad_norm": 0.177734375, "learning_rate": 8.999096775012392e-05, "loss": 0.0484, "step": 3195 }, { "epoch": 0.41099061254232927, "grad_norm": 0.1982421875, "learning_rate": 8.998490042938041e-05, "loss": 0.056, "step": 3196 }, { "epoch": 0.41111920785288697, "grad_norm": 0.224609375, "learning_rate": 8.997883147490061e-05, "loss": 0.0614, "step": 3197 }, { "epoch": 0.4112478031634446, "grad_norm": 0.1904296875, "learning_rate": 8.99727608869325e-05, "loss": 0.0536, "step": 3198 }, { "epoch": 0.4113763984740023, "grad_norm": 0.1923828125, "learning_rate": 8.996668866572414e-05, "loss": 0.0525, "step": 3199 }, { "epoch": 0.41150499378456, "grad_norm": 0.185546875, "learning_rate": 8.996061481152358e-05, "loss": 0.0472, "step": 3200 }, { "epoch": 0.41163358909511766, "grad_norm": 0.201171875, "learning_rate": 8.995453932457903e-05, "loss": 0.0494, "step": 3201 }, { "epoch": 0.41176218440567536, "grad_norm": 0.2099609375, "learning_rate": 8.994846220513872e-05, "loss": 0.0624, "step": 3202 }, { "epoch": 0.411890779716233, "grad_norm": 0.1689453125, "learning_rate": 8.994238345345094e-05, "loss": 0.0405, "step": 3203 }, { "epoch": 0.4120193750267907, "grad_norm": 0.1943359375, "learning_rate": 8.993630306976411e-05, "loss": 0.0516, "step": 3204 }, { "epoch": 0.41214797033734835, "grad_norm": 0.1787109375, "learning_rate": 8.99302210543266e-05, "loss": 0.044, "step": 3205 }, { "epoch": 0.41227656564790605, "grad_norm": 0.20703125, "learning_rate": 8.992413740738696e-05, "loss": 0.0658, "step": 3206 }, { "epoch": 0.4124051609584637, "grad_norm": 0.1806640625, "learning_rate": 8.991805212919373e-05, "loss": 0.0457, "step": 3207 }, { "epoch": 0.4125337562690214, "grad_norm": 0.1865234375, "learning_rate": 8.991196521999558e-05, "loss": 0.0508, "step": 3208 }, { "epoch": 0.4126623515795791, "grad_norm": 0.2041015625, "learning_rate": 8.990587668004118e-05, "loss": 0.0514, "step": 3209 }, { "epoch": 0.41279094689013673, "grad_norm": 0.2109375, "learning_rate": 8.989978650957934e-05, "loss": 0.0548, "step": 3210 }, { "epoch": 0.41291954220069443, "grad_norm": 0.240234375, "learning_rate": 8.989369470885885e-05, "loss": 0.0589, "step": 3211 }, { "epoch": 0.4130481375112521, "grad_norm": 0.1982421875, "learning_rate": 8.988760127812865e-05, "loss": 0.0537, "step": 3212 }, { "epoch": 0.4131767328218098, "grad_norm": 0.201171875, "learning_rate": 8.98815062176377e-05, "loss": 0.0561, "step": 3213 }, { "epoch": 0.4133053281323674, "grad_norm": 0.193359375, "learning_rate": 8.987540952763505e-05, "loss": 0.0432, "step": 3214 }, { "epoch": 0.4134339234429251, "grad_norm": 0.203125, "learning_rate": 8.986931120836976e-05, "loss": 0.0559, "step": 3215 }, { "epoch": 0.41356251875348277, "grad_norm": 0.2109375, "learning_rate": 8.986321126009105e-05, "loss": 0.0516, "step": 3216 }, { "epoch": 0.41369111406404047, "grad_norm": 0.2099609375, "learning_rate": 8.985710968304813e-05, "loss": 0.0529, "step": 3217 }, { "epoch": 0.41381970937459817, "grad_norm": 0.17578125, "learning_rate": 8.985100647749031e-05, "loss": 0.0437, "step": 3218 }, { "epoch": 0.4139483046851558, "grad_norm": 0.193359375, "learning_rate": 8.984490164366697e-05, "loss": 0.0468, "step": 3219 }, { "epoch": 0.4140768999957135, "grad_norm": 0.169921875, "learning_rate": 8.983879518182753e-05, "loss": 0.0391, "step": 3220 }, { "epoch": 0.41420549530627115, "grad_norm": 0.1806640625, "learning_rate": 8.98326870922215e-05, "loss": 0.0464, "step": 3221 }, { "epoch": 0.41433409061682885, "grad_norm": 0.2109375, "learning_rate": 8.982657737509845e-05, "loss": 0.0628, "step": 3222 }, { "epoch": 0.4144626859273865, "grad_norm": 0.1806640625, "learning_rate": 8.9820466030708e-05, "loss": 0.0519, "step": 3223 }, { "epoch": 0.4145912812379442, "grad_norm": 0.1923828125, "learning_rate": 8.981435305929989e-05, "loss": 0.053, "step": 3224 }, { "epoch": 0.41471987654850184, "grad_norm": 0.177734375, "learning_rate": 8.980823846112384e-05, "loss": 0.0463, "step": 3225 }, { "epoch": 0.41484847185905954, "grad_norm": 0.1826171875, "learning_rate": 8.980212223642974e-05, "loss": 0.0464, "step": 3226 }, { "epoch": 0.41497706716961724, "grad_norm": 0.2060546875, "learning_rate": 8.979600438546743e-05, "loss": 0.0461, "step": 3227 }, { "epoch": 0.4151056624801749, "grad_norm": 0.2080078125, "learning_rate": 8.978988490848694e-05, "loss": 0.0431, "step": 3228 }, { "epoch": 0.4152342577907326, "grad_norm": 0.1962890625, "learning_rate": 8.978376380573827e-05, "loss": 0.0582, "step": 3229 }, { "epoch": 0.41536285310129023, "grad_norm": 0.1904296875, "learning_rate": 8.977764107747151e-05, "loss": 0.0486, "step": 3230 }, { "epoch": 0.41549144841184793, "grad_norm": 0.201171875, "learning_rate": 8.977151672393686e-05, "loss": 0.0423, "step": 3231 }, { "epoch": 0.4156200437224056, "grad_norm": 0.2001953125, "learning_rate": 8.976539074538453e-05, "loss": 0.0579, "step": 3232 }, { "epoch": 0.4157486390329633, "grad_norm": 0.1826171875, "learning_rate": 8.975926314206484e-05, "loss": 0.0481, "step": 3233 }, { "epoch": 0.4158772343435209, "grad_norm": 0.1845703125, "learning_rate": 8.975313391422813e-05, "loss": 0.0463, "step": 3234 }, { "epoch": 0.4160058296540786, "grad_norm": 0.234375, "learning_rate": 8.974700306212487e-05, "loss": 0.0476, "step": 3235 }, { "epoch": 0.41613442496463626, "grad_norm": 0.18359375, "learning_rate": 8.974087058600551e-05, "loss": 0.0435, "step": 3236 }, { "epoch": 0.41626302027519396, "grad_norm": 0.1533203125, "learning_rate": 8.973473648612067e-05, "loss": 0.0395, "step": 3237 }, { "epoch": 0.41639161558575166, "grad_norm": 0.1962890625, "learning_rate": 8.972860076272093e-05, "loss": 0.0601, "step": 3238 }, { "epoch": 0.4165202108963093, "grad_norm": 0.1826171875, "learning_rate": 8.972246341605704e-05, "loss": 0.0449, "step": 3239 }, { "epoch": 0.416648806206867, "grad_norm": 0.1953125, "learning_rate": 8.971632444637972e-05, "loss": 0.048, "step": 3240 }, { "epoch": 0.41677740151742465, "grad_norm": 0.1826171875, "learning_rate": 8.971018385393984e-05, "loss": 0.0503, "step": 3241 }, { "epoch": 0.41690599682798235, "grad_norm": 0.171875, "learning_rate": 8.970404163898826e-05, "loss": 0.0408, "step": 3242 }, { "epoch": 0.41703459213854, "grad_norm": 0.201171875, "learning_rate": 8.969789780177596e-05, "loss": 0.0505, "step": 3243 }, { "epoch": 0.4171631874490977, "grad_norm": 0.181640625, "learning_rate": 8.969175234255398e-05, "loss": 0.0515, "step": 3244 }, { "epoch": 0.41729178275965534, "grad_norm": 0.19140625, "learning_rate": 8.968560526157341e-05, "loss": 0.0515, "step": 3245 }, { "epoch": 0.41742037807021304, "grad_norm": 0.1806640625, "learning_rate": 8.967945655908541e-05, "loss": 0.0474, "step": 3246 }, { "epoch": 0.41754897338077074, "grad_norm": 0.1748046875, "learning_rate": 8.967330623534121e-05, "loss": 0.0487, "step": 3247 }, { "epoch": 0.4176775686913284, "grad_norm": 0.185546875, "learning_rate": 8.96671542905921e-05, "loss": 0.0547, "step": 3248 }, { "epoch": 0.4178061640018861, "grad_norm": 0.1884765625, "learning_rate": 8.966100072508943e-05, "loss": 0.0504, "step": 3249 }, { "epoch": 0.4179347593124437, "grad_norm": 0.1875, "learning_rate": 8.965484553908465e-05, "loss": 0.0525, "step": 3250 }, { "epoch": 0.4180633546230014, "grad_norm": 0.1953125, "learning_rate": 8.964868873282925e-05, "loss": 0.052, "step": 3251 }, { "epoch": 0.41819194993355907, "grad_norm": 0.189453125, "learning_rate": 8.964253030657481e-05, "loss": 0.0469, "step": 3252 }, { "epoch": 0.41832054524411677, "grad_norm": 0.197265625, "learning_rate": 8.963637026057291e-05, "loss": 0.0485, "step": 3253 }, { "epoch": 0.4184491405546744, "grad_norm": 0.203125, "learning_rate": 8.963020859507528e-05, "loss": 0.053, "step": 3254 }, { "epoch": 0.4185777358652321, "grad_norm": 0.1728515625, "learning_rate": 8.962404531033363e-05, "loss": 0.0442, "step": 3255 }, { "epoch": 0.4187063311757898, "grad_norm": 0.1845703125, "learning_rate": 8.961788040659984e-05, "loss": 0.0537, "step": 3256 }, { "epoch": 0.41883492648634746, "grad_norm": 0.1767578125, "learning_rate": 8.961171388412578e-05, "loss": 0.0512, "step": 3257 }, { "epoch": 0.41896352179690516, "grad_norm": 0.181640625, "learning_rate": 8.96055457431634e-05, "loss": 0.0488, "step": 3258 }, { "epoch": 0.4190921171074628, "grad_norm": 0.177734375, "learning_rate": 8.959937598396474e-05, "loss": 0.0438, "step": 3259 }, { "epoch": 0.4192207124180205, "grad_norm": 0.1962890625, "learning_rate": 8.959320460678187e-05, "loss": 0.0447, "step": 3260 }, { "epoch": 0.41934930772857815, "grad_norm": 0.1611328125, "learning_rate": 8.958703161186694e-05, "loss": 0.0404, "step": 3261 }, { "epoch": 0.41947790303913585, "grad_norm": 0.193359375, "learning_rate": 8.958085699947221e-05, "loss": 0.0573, "step": 3262 }, { "epoch": 0.4196064983496935, "grad_norm": 0.224609375, "learning_rate": 8.957468076984992e-05, "loss": 0.0555, "step": 3263 }, { "epoch": 0.4197350936602512, "grad_norm": 0.2001953125, "learning_rate": 8.956850292325245e-05, "loss": 0.0564, "step": 3264 }, { "epoch": 0.4198636889708089, "grad_norm": 0.1865234375, "learning_rate": 8.956232345993223e-05, "loss": 0.0524, "step": 3265 }, { "epoch": 0.41999228428136653, "grad_norm": 0.177734375, "learning_rate": 8.955614238014172e-05, "loss": 0.0505, "step": 3266 }, { "epoch": 0.42012087959192423, "grad_norm": 0.1884765625, "learning_rate": 8.954995968413346e-05, "loss": 0.0474, "step": 3267 }, { "epoch": 0.4202494749024819, "grad_norm": 0.17578125, "learning_rate": 8.954377537216012e-05, "loss": 0.0518, "step": 3268 }, { "epoch": 0.4203780702130396, "grad_norm": 0.1884765625, "learning_rate": 8.953758944447434e-05, "loss": 0.0491, "step": 3269 }, { "epoch": 0.4205066655235972, "grad_norm": 0.19140625, "learning_rate": 8.953140190132889e-05, "loss": 0.0553, "step": 3270 }, { "epoch": 0.4206352608341549, "grad_norm": 0.171875, "learning_rate": 8.952521274297658e-05, "loss": 0.0519, "step": 3271 }, { "epoch": 0.42076385614471257, "grad_norm": 0.177734375, "learning_rate": 8.951902196967029e-05, "loss": 0.046, "step": 3272 }, { "epoch": 0.42089245145527027, "grad_norm": 0.181640625, "learning_rate": 8.951282958166297e-05, "loss": 0.0477, "step": 3273 }, { "epoch": 0.42102104676582797, "grad_norm": 0.1748046875, "learning_rate": 8.950663557920764e-05, "loss": 0.0378, "step": 3274 }, { "epoch": 0.4211496420763856, "grad_norm": 0.1689453125, "learning_rate": 8.950043996255735e-05, "loss": 0.0459, "step": 3275 }, { "epoch": 0.4212782373869433, "grad_norm": 0.203125, "learning_rate": 8.949424273196527e-05, "loss": 0.0528, "step": 3276 }, { "epoch": 0.42140683269750095, "grad_norm": 0.2001953125, "learning_rate": 8.948804388768463e-05, "loss": 0.0578, "step": 3277 }, { "epoch": 0.42153542800805865, "grad_norm": 0.171875, "learning_rate": 8.948184342996868e-05, "loss": 0.0448, "step": 3278 }, { "epoch": 0.4216640233186163, "grad_norm": 0.1953125, "learning_rate": 8.947564135907076e-05, "loss": 0.0616, "step": 3279 }, { "epoch": 0.421792618629174, "grad_norm": 0.193359375, "learning_rate": 8.94694376752443e-05, "loss": 0.0549, "step": 3280 }, { "epoch": 0.42192121393973164, "grad_norm": 0.173828125, "learning_rate": 8.946323237874276e-05, "loss": 0.0493, "step": 3281 }, { "epoch": 0.42204980925028934, "grad_norm": 0.181640625, "learning_rate": 8.945702546981969e-05, "loss": 0.0494, "step": 3282 }, { "epoch": 0.422178404560847, "grad_norm": 0.2001953125, "learning_rate": 8.945081694872868e-05, "loss": 0.0509, "step": 3283 }, { "epoch": 0.4223069998714047, "grad_norm": 0.1826171875, "learning_rate": 8.944460681572344e-05, "loss": 0.0475, "step": 3284 }, { "epoch": 0.4224355951819624, "grad_norm": 0.181640625, "learning_rate": 8.943839507105766e-05, "loss": 0.0563, "step": 3285 }, { "epoch": 0.42256419049252003, "grad_norm": 0.2333984375, "learning_rate": 8.943218171498519e-05, "loss": 0.0543, "step": 3286 }, { "epoch": 0.42269278580307773, "grad_norm": 0.1904296875, "learning_rate": 8.942596674775987e-05, "loss": 0.0515, "step": 3287 }, { "epoch": 0.4228213811136354, "grad_norm": 0.1640625, "learning_rate": 8.941975016963564e-05, "loss": 0.0422, "step": 3288 }, { "epoch": 0.4229499764241931, "grad_norm": 0.19140625, "learning_rate": 8.941353198086652e-05, "loss": 0.0514, "step": 3289 }, { "epoch": 0.4230785717347507, "grad_norm": 0.166015625, "learning_rate": 8.940731218170657e-05, "loss": 0.0417, "step": 3290 }, { "epoch": 0.4232071670453084, "grad_norm": 0.189453125, "learning_rate": 8.94010907724099e-05, "loss": 0.0522, "step": 3291 }, { "epoch": 0.42333576235586606, "grad_norm": 0.1787109375, "learning_rate": 8.939486775323075e-05, "loss": 0.0476, "step": 3292 }, { "epoch": 0.42346435766642376, "grad_norm": 0.1767578125, "learning_rate": 8.938864312442335e-05, "loss": 0.0411, "step": 3293 }, { "epoch": 0.42359295297698146, "grad_norm": 0.1875, "learning_rate": 8.938241688624206e-05, "loss": 0.0433, "step": 3294 }, { "epoch": 0.4237215482875391, "grad_norm": 0.1845703125, "learning_rate": 8.937618903894125e-05, "loss": 0.0428, "step": 3295 }, { "epoch": 0.4238501435980968, "grad_norm": 0.1865234375, "learning_rate": 8.936995958277541e-05, "loss": 0.0522, "step": 3296 }, { "epoch": 0.42397873890865445, "grad_norm": 0.1748046875, "learning_rate": 8.936372851799904e-05, "loss": 0.0468, "step": 3297 }, { "epoch": 0.42410733421921215, "grad_norm": 0.2060546875, "learning_rate": 8.935749584486674e-05, "loss": 0.0623, "step": 3298 }, { "epoch": 0.4242359295297698, "grad_norm": 0.185546875, "learning_rate": 8.93512615636332e-05, "loss": 0.0457, "step": 3299 }, { "epoch": 0.4243645248403275, "grad_norm": 0.2255859375, "learning_rate": 8.934502567455312e-05, "loss": 0.0468, "step": 3300 }, { "epoch": 0.42449312015088514, "grad_norm": 0.24609375, "learning_rate": 8.933878817788128e-05, "loss": 0.0646, "step": 3301 }, { "epoch": 0.42462171546144284, "grad_norm": 0.1669921875, "learning_rate": 8.933254907387257e-05, "loss": 0.0427, "step": 3302 }, { "epoch": 0.42475031077200054, "grad_norm": 0.1669921875, "learning_rate": 8.932630836278186e-05, "loss": 0.0415, "step": 3303 }, { "epoch": 0.4248789060825582, "grad_norm": 0.18359375, "learning_rate": 8.93200660448642e-05, "loss": 0.0511, "step": 3304 }, { "epoch": 0.4250075013931159, "grad_norm": 0.1845703125, "learning_rate": 8.93138221203746e-05, "loss": 0.0522, "step": 3305 }, { "epoch": 0.4251360967036735, "grad_norm": 0.2099609375, "learning_rate": 8.930757658956821e-05, "loss": 0.0613, "step": 3306 }, { "epoch": 0.4252646920142312, "grad_norm": 0.1953125, "learning_rate": 8.930132945270019e-05, "loss": 0.0523, "step": 3307 }, { "epoch": 0.42539328732478887, "grad_norm": 0.1669921875, "learning_rate": 8.92950807100258e-05, "loss": 0.0463, "step": 3308 }, { "epoch": 0.42552188263534657, "grad_norm": 0.173828125, "learning_rate": 8.928883036180036e-05, "loss": 0.0499, "step": 3309 }, { "epoch": 0.4256504779459042, "grad_norm": 0.1875, "learning_rate": 8.928257840827924e-05, "loss": 0.0454, "step": 3310 }, { "epoch": 0.4257790732564619, "grad_norm": 0.1630859375, "learning_rate": 8.92763248497179e-05, "loss": 0.0438, "step": 3311 }, { "epoch": 0.4259076685670196, "grad_norm": 0.177734375, "learning_rate": 8.927006968637186e-05, "loss": 0.0526, "step": 3312 }, { "epoch": 0.42603626387757726, "grad_norm": 0.1787109375, "learning_rate": 8.926381291849668e-05, "loss": 0.0535, "step": 3313 }, { "epoch": 0.42616485918813496, "grad_norm": 0.2109375, "learning_rate": 8.925755454634801e-05, "loss": 0.0533, "step": 3314 }, { "epoch": 0.4262934544986926, "grad_norm": 0.1875, "learning_rate": 8.925129457018158e-05, "loss": 0.0521, "step": 3315 }, { "epoch": 0.4264220498092503, "grad_norm": 0.169921875, "learning_rate": 8.924503299025312e-05, "loss": 0.0464, "step": 3316 }, { "epoch": 0.42655064511980795, "grad_norm": 0.18359375, "learning_rate": 8.923876980681851e-05, "loss": 0.0539, "step": 3317 }, { "epoch": 0.42667924043036565, "grad_norm": 0.193359375, "learning_rate": 8.923250502013365e-05, "loss": 0.0468, "step": 3318 }, { "epoch": 0.4268078357409233, "grad_norm": 0.1865234375, "learning_rate": 8.92262386304545e-05, "loss": 0.0443, "step": 3319 }, { "epoch": 0.426936431051481, "grad_norm": 0.1845703125, "learning_rate": 8.921997063803711e-05, "loss": 0.0561, "step": 3320 }, { "epoch": 0.4270650263620387, "grad_norm": 0.1728515625, "learning_rate": 8.921370104313756e-05, "loss": 0.0459, "step": 3321 }, { "epoch": 0.42719362167259634, "grad_norm": 0.1767578125, "learning_rate": 8.920742984601204e-05, "loss": 0.0455, "step": 3322 }, { "epoch": 0.42732221698315404, "grad_norm": 0.1748046875, "learning_rate": 8.92011570469168e-05, "loss": 0.0535, "step": 3323 }, { "epoch": 0.4274508122937117, "grad_norm": 0.1748046875, "learning_rate": 8.91948826461081e-05, "loss": 0.0498, "step": 3324 }, { "epoch": 0.4275794076042694, "grad_norm": 0.20703125, "learning_rate": 8.918860664384232e-05, "loss": 0.0592, "step": 3325 }, { "epoch": 0.427708002914827, "grad_norm": 0.1953125, "learning_rate": 8.91823290403759e-05, "loss": 0.046, "step": 3326 }, { "epoch": 0.4278365982253847, "grad_norm": 0.1748046875, "learning_rate": 8.917604983596533e-05, "loss": 0.042, "step": 3327 }, { "epoch": 0.42796519353594237, "grad_norm": 0.1787109375, "learning_rate": 8.916976903086717e-05, "loss": 0.0494, "step": 3328 }, { "epoch": 0.42809378884650007, "grad_norm": 0.19921875, "learning_rate": 8.916348662533805e-05, "loss": 0.0576, "step": 3329 }, { "epoch": 0.4282223841570577, "grad_norm": 0.1953125, "learning_rate": 8.915720261963463e-05, "loss": 0.0526, "step": 3330 }, { "epoch": 0.4283509794676154, "grad_norm": 0.1865234375, "learning_rate": 8.915091701401373e-05, "loss": 0.0513, "step": 3331 }, { "epoch": 0.4284795747781731, "grad_norm": 0.181640625, "learning_rate": 8.91446298087321e-05, "loss": 0.0538, "step": 3332 }, { "epoch": 0.42860817008873076, "grad_norm": 0.1962890625, "learning_rate": 8.913834100404668e-05, "loss": 0.0443, "step": 3333 }, { "epoch": 0.42873676539928846, "grad_norm": 0.1787109375, "learning_rate": 8.913205060021442e-05, "loss": 0.0517, "step": 3334 }, { "epoch": 0.4288653607098461, "grad_norm": 0.201171875, "learning_rate": 8.912575859749233e-05, "loss": 0.0543, "step": 3335 }, { "epoch": 0.4289939560204038, "grad_norm": 0.205078125, "learning_rate": 8.911946499613747e-05, "loss": 0.0554, "step": 3336 }, { "epoch": 0.42912255133096144, "grad_norm": 0.189453125, "learning_rate": 8.911316979640701e-05, "loss": 0.0505, "step": 3337 }, { "epoch": 0.42925114664151914, "grad_norm": 0.2177734375, "learning_rate": 8.910687299855818e-05, "loss": 0.0532, "step": 3338 }, { "epoch": 0.4293797419520768, "grad_norm": 0.171875, "learning_rate": 8.910057460284824e-05, "loss": 0.0437, "step": 3339 }, { "epoch": 0.4295083372626345, "grad_norm": 0.1728515625, "learning_rate": 8.909427460953452e-05, "loss": 0.041, "step": 3340 }, { "epoch": 0.4296369325731922, "grad_norm": 0.20703125, "learning_rate": 8.908797301887448e-05, "loss": 0.0529, "step": 3341 }, { "epoch": 0.42976552788374983, "grad_norm": 0.1826171875, "learning_rate": 8.908166983112555e-05, "loss": 0.0438, "step": 3342 }, { "epoch": 0.42989412319430753, "grad_norm": 0.2119140625, "learning_rate": 8.907536504654528e-05, "loss": 0.0517, "step": 3343 }, { "epoch": 0.4300227185048652, "grad_norm": 0.203125, "learning_rate": 8.906905866539128e-05, "loss": 0.0549, "step": 3344 }, { "epoch": 0.4301513138154229, "grad_norm": 0.1953125, "learning_rate": 8.906275068792123e-05, "loss": 0.0528, "step": 3345 }, { "epoch": 0.4302799091259805, "grad_norm": 0.1875, "learning_rate": 8.905644111439287e-05, "loss": 0.0516, "step": 3346 }, { "epoch": 0.4304085044365382, "grad_norm": 0.203125, "learning_rate": 8.905012994506398e-05, "loss": 0.0466, "step": 3347 }, { "epoch": 0.43053709974709586, "grad_norm": 0.18359375, "learning_rate": 8.904381718019243e-05, "loss": 0.0498, "step": 3348 }, { "epoch": 0.43066569505765356, "grad_norm": 0.203125, "learning_rate": 8.903750282003617e-05, "loss": 0.0448, "step": 3349 }, { "epoch": 0.43079429036821126, "grad_norm": 0.1845703125, "learning_rate": 8.903118686485319e-05, "loss": 0.0472, "step": 3350 }, { "epoch": 0.4309228856787689, "grad_norm": 0.169921875, "learning_rate": 8.902486931490155e-05, "loss": 0.0438, "step": 3351 }, { "epoch": 0.4310514809893266, "grad_norm": 0.169921875, "learning_rate": 8.901855017043939e-05, "loss": 0.0449, "step": 3352 }, { "epoch": 0.43118007629988425, "grad_norm": 0.1953125, "learning_rate": 8.901222943172487e-05, "loss": 0.0483, "step": 3353 }, { "epoch": 0.43130867161044195, "grad_norm": 0.166015625, "learning_rate": 8.900590709901628e-05, "loss": 0.0448, "step": 3354 }, { "epoch": 0.4314372669209996, "grad_norm": 0.2099609375, "learning_rate": 8.899958317257193e-05, "loss": 0.0577, "step": 3355 }, { "epoch": 0.4315658622315573, "grad_norm": 0.1806640625, "learning_rate": 8.899325765265021e-05, "loss": 0.047, "step": 3356 }, { "epoch": 0.43169445754211494, "grad_norm": 0.1884765625, "learning_rate": 8.898693053950959e-05, "loss": 0.0459, "step": 3357 }, { "epoch": 0.43182305285267264, "grad_norm": 0.2236328125, "learning_rate": 8.898060183340855e-05, "loss": 0.0522, "step": 3358 }, { "epoch": 0.43195164816323034, "grad_norm": 0.1787109375, "learning_rate": 8.897427153460572e-05, "loss": 0.042, "step": 3359 }, { "epoch": 0.432080243473788, "grad_norm": 0.1875, "learning_rate": 8.896793964335969e-05, "loss": 0.0476, "step": 3360 }, { "epoch": 0.4322088387843457, "grad_norm": 0.2412109375, "learning_rate": 8.896160615992923e-05, "loss": 0.059, "step": 3361 }, { "epoch": 0.43233743409490333, "grad_norm": 0.1513671875, "learning_rate": 8.895527108457309e-05, "loss": 0.0422, "step": 3362 }, { "epoch": 0.432466029405461, "grad_norm": 0.1787109375, "learning_rate": 8.894893441755012e-05, "loss": 0.0516, "step": 3363 }, { "epoch": 0.43259462471601867, "grad_norm": 0.1904296875, "learning_rate": 8.894259615911924e-05, "loss": 0.0566, "step": 3364 }, { "epoch": 0.43272322002657637, "grad_norm": 0.1796875, "learning_rate": 8.89362563095394e-05, "loss": 0.044, "step": 3365 }, { "epoch": 0.432851815337134, "grad_norm": 0.173828125, "learning_rate": 8.892991486906968e-05, "loss": 0.0474, "step": 3366 }, { "epoch": 0.4329804106476917, "grad_norm": 0.1845703125, "learning_rate": 8.892357183796914e-05, "loss": 0.0497, "step": 3367 }, { "epoch": 0.4331090059582494, "grad_norm": 0.2109375, "learning_rate": 8.891722721649696e-05, "loss": 0.0527, "step": 3368 }, { "epoch": 0.43323760126880706, "grad_norm": 0.1796875, "learning_rate": 8.891088100491238e-05, "loss": 0.0458, "step": 3369 }, { "epoch": 0.43336619657936476, "grad_norm": 0.18359375, "learning_rate": 8.890453320347469e-05, "loss": 0.0475, "step": 3370 }, { "epoch": 0.4334947918899224, "grad_norm": 0.1767578125, "learning_rate": 8.889818381244327e-05, "loss": 0.0425, "step": 3371 }, { "epoch": 0.4336233872004801, "grad_norm": 0.1796875, "learning_rate": 8.889183283207754e-05, "loss": 0.0476, "step": 3372 }, { "epoch": 0.43375198251103775, "grad_norm": 0.1904296875, "learning_rate": 8.888548026263699e-05, "loss": 0.0459, "step": 3373 }, { "epoch": 0.43388057782159545, "grad_norm": 0.1953125, "learning_rate": 8.887912610438118e-05, "loss": 0.05, "step": 3374 }, { "epoch": 0.4340091731321531, "grad_norm": 0.2021484375, "learning_rate": 8.887277035756974e-05, "loss": 0.0525, "step": 3375 }, { "epoch": 0.4341377684427108, "grad_norm": 0.20703125, "learning_rate": 8.886641302246235e-05, "loss": 0.0459, "step": 3376 }, { "epoch": 0.4342663637532685, "grad_norm": 0.208984375, "learning_rate": 8.886005409931879e-05, "loss": 0.0491, "step": 3377 }, { "epoch": 0.43439495906382614, "grad_norm": 0.2041015625, "learning_rate": 8.885369358839883e-05, "loss": 0.0539, "step": 3378 }, { "epoch": 0.43452355437438384, "grad_norm": 0.20703125, "learning_rate": 8.884733148996238e-05, "loss": 0.0559, "step": 3379 }, { "epoch": 0.4346521496849415, "grad_norm": 0.177734375, "learning_rate": 8.88409678042694e-05, "loss": 0.0458, "step": 3380 }, { "epoch": 0.4347807449954992, "grad_norm": 0.1953125, "learning_rate": 8.883460253157988e-05, "loss": 0.0567, "step": 3381 }, { "epoch": 0.4349093403060568, "grad_norm": 0.1591796875, "learning_rate": 8.882823567215391e-05, "loss": 0.0392, "step": 3382 }, { "epoch": 0.4350379356166145, "grad_norm": 0.1845703125, "learning_rate": 8.882186722625164e-05, "loss": 0.0561, "step": 3383 }, { "epoch": 0.43516653092717217, "grad_norm": 0.1806640625, "learning_rate": 8.881549719413325e-05, "loss": 0.0548, "step": 3384 }, { "epoch": 0.43529512623772987, "grad_norm": 0.1708984375, "learning_rate": 8.880912557605903e-05, "loss": 0.0443, "step": 3385 }, { "epoch": 0.4354237215482875, "grad_norm": 0.1875, "learning_rate": 8.880275237228932e-05, "loss": 0.0582, "step": 3386 }, { "epoch": 0.4355523168588452, "grad_norm": 0.1845703125, "learning_rate": 8.87963775830845e-05, "loss": 0.0514, "step": 3387 }, { "epoch": 0.4356809121694029, "grad_norm": 0.177734375, "learning_rate": 8.879000120870506e-05, "loss": 0.0461, "step": 3388 }, { "epoch": 0.43580950747996056, "grad_norm": 0.2041015625, "learning_rate": 8.878362324941155e-05, "loss": 0.0536, "step": 3389 }, { "epoch": 0.43593810279051826, "grad_norm": 0.16015625, "learning_rate": 8.877724370546452e-05, "loss": 0.0423, "step": 3390 }, { "epoch": 0.4360666981010759, "grad_norm": 0.197265625, "learning_rate": 8.877086257712466e-05, "loss": 0.051, "step": 3391 }, { "epoch": 0.4361952934116336, "grad_norm": 0.20703125, "learning_rate": 8.876447986465269e-05, "loss": 0.052, "step": 3392 }, { "epoch": 0.43632388872219124, "grad_norm": 0.201171875, "learning_rate": 8.875809556830936e-05, "loss": 0.0504, "step": 3393 }, { "epoch": 0.43645248403274894, "grad_norm": 0.2158203125, "learning_rate": 8.87517096883556e-05, "loss": 0.0585, "step": 3394 }, { "epoch": 0.4365810793433066, "grad_norm": 0.166015625, "learning_rate": 8.874532222505228e-05, "loss": 0.0463, "step": 3395 }, { "epoch": 0.4367096746538643, "grad_norm": 0.1552734375, "learning_rate": 8.87389331786604e-05, "loss": 0.0386, "step": 3396 }, { "epoch": 0.436838269964422, "grad_norm": 0.2275390625, "learning_rate": 8.8732542549441e-05, "loss": 0.053, "step": 3397 }, { "epoch": 0.43696686527497963, "grad_norm": 0.158203125, "learning_rate": 8.872615033765521e-05, "loss": 0.0405, "step": 3398 }, { "epoch": 0.43709546058553733, "grad_norm": 0.193359375, "learning_rate": 8.871975654356418e-05, "loss": 0.0559, "step": 3399 }, { "epoch": 0.437224055896095, "grad_norm": 0.173828125, "learning_rate": 8.871336116742918e-05, "loss": 0.0454, "step": 3400 }, { "epoch": 0.4373526512066527, "grad_norm": 0.203125, "learning_rate": 8.87069642095115e-05, "loss": 0.0528, "step": 3401 }, { "epoch": 0.4374812465172103, "grad_norm": 0.169921875, "learning_rate": 8.870056567007252e-05, "loss": 0.0455, "step": 3402 }, { "epoch": 0.437609841827768, "grad_norm": 0.1884765625, "learning_rate": 8.869416554937369e-05, "loss": 0.0545, "step": 3403 }, { "epoch": 0.43773843713832566, "grad_norm": 0.193359375, "learning_rate": 8.868776384767648e-05, "loss": 0.0498, "step": 3404 }, { "epoch": 0.43786703244888336, "grad_norm": 0.203125, "learning_rate": 8.868136056524247e-05, "loss": 0.0502, "step": 3405 }, { "epoch": 0.43799562775944106, "grad_norm": 0.17578125, "learning_rate": 8.867495570233331e-05, "loss": 0.0442, "step": 3406 }, { "epoch": 0.4381242230699987, "grad_norm": 0.1875, "learning_rate": 8.866854925921069e-05, "loss": 0.0469, "step": 3407 }, { "epoch": 0.4382528183805564, "grad_norm": 0.201171875, "learning_rate": 8.866214123613634e-05, "loss": 0.0526, "step": 3408 }, { "epoch": 0.43838141369111405, "grad_norm": 0.1826171875, "learning_rate": 8.86557316333721e-05, "loss": 0.0541, "step": 3409 }, { "epoch": 0.43851000900167175, "grad_norm": 0.1962890625, "learning_rate": 8.864932045117987e-05, "loss": 0.0506, "step": 3410 }, { "epoch": 0.4386386043122294, "grad_norm": 0.1767578125, "learning_rate": 8.86429076898216e-05, "loss": 0.0484, "step": 3411 }, { "epoch": 0.4387671996227871, "grad_norm": 0.2041015625, "learning_rate": 8.863649334955931e-05, "loss": 0.0595, "step": 3412 }, { "epoch": 0.43889579493334474, "grad_norm": 0.1865234375, "learning_rate": 8.863007743065505e-05, "loss": 0.055, "step": 3413 }, { "epoch": 0.43902439024390244, "grad_norm": 0.1875, "learning_rate": 8.8623659933371e-05, "loss": 0.047, "step": 3414 }, { "epoch": 0.43915298555446014, "grad_norm": 0.1875, "learning_rate": 8.861724085796939e-05, "loss": 0.0528, "step": 3415 }, { "epoch": 0.4392815808650178, "grad_norm": 0.1748046875, "learning_rate": 8.861082020471245e-05, "loss": 0.0472, "step": 3416 }, { "epoch": 0.4394101761755755, "grad_norm": 0.1787109375, "learning_rate": 8.860439797386256e-05, "loss": 0.0495, "step": 3417 }, { "epoch": 0.43953877148613313, "grad_norm": 0.1845703125, "learning_rate": 8.859797416568207e-05, "loss": 0.047, "step": 3418 }, { "epoch": 0.43966736679669083, "grad_norm": 0.1708984375, "learning_rate": 8.859154878043352e-05, "loss": 0.0495, "step": 3419 }, { "epoch": 0.4397959621072485, "grad_norm": 0.1826171875, "learning_rate": 8.858512181837939e-05, "loss": 0.0481, "step": 3420 }, { "epoch": 0.4399245574178062, "grad_norm": 0.1787109375, "learning_rate": 8.85786932797823e-05, "loss": 0.0537, "step": 3421 }, { "epoch": 0.4400531527283638, "grad_norm": 0.1962890625, "learning_rate": 8.857226316490493e-05, "loss": 0.0479, "step": 3422 }, { "epoch": 0.4401817480389215, "grad_norm": 0.1845703125, "learning_rate": 8.856583147400995e-05, "loss": 0.0547, "step": 3423 }, { "epoch": 0.4403103433494792, "grad_norm": 0.189453125, "learning_rate": 8.855939820736021e-05, "loss": 0.0469, "step": 3424 }, { "epoch": 0.44043893866003686, "grad_norm": 0.1572265625, "learning_rate": 8.855296336521855e-05, "loss": 0.041, "step": 3425 }, { "epoch": 0.44056753397059456, "grad_norm": 0.1884765625, "learning_rate": 8.854652694784787e-05, "loss": 0.0542, "step": 3426 }, { "epoch": 0.4406961292811522, "grad_norm": 0.1904296875, "learning_rate": 8.854008895551117e-05, "loss": 0.0454, "step": 3427 }, { "epoch": 0.4408247245917099, "grad_norm": 0.17578125, "learning_rate": 8.853364938847149e-05, "loss": 0.0491, "step": 3428 }, { "epoch": 0.44095331990226755, "grad_norm": 0.171875, "learning_rate": 8.852720824699197e-05, "loss": 0.0504, "step": 3429 }, { "epoch": 0.44108191521282525, "grad_norm": 0.2021484375, "learning_rate": 8.852076553133575e-05, "loss": 0.0508, "step": 3430 }, { "epoch": 0.4412105105233829, "grad_norm": 0.18359375, "learning_rate": 8.851432124176611e-05, "loss": 0.0445, "step": 3431 }, { "epoch": 0.4413391058339406, "grad_norm": 0.1552734375, "learning_rate": 8.850787537854631e-05, "loss": 0.0336, "step": 3432 }, { "epoch": 0.44146770114449824, "grad_norm": 0.1962890625, "learning_rate": 8.850142794193976e-05, "loss": 0.0418, "step": 3433 }, { "epoch": 0.44159629645505594, "grad_norm": 0.166015625, "learning_rate": 8.849497893220988e-05, "loss": 0.0461, "step": 3434 }, { "epoch": 0.44172489176561364, "grad_norm": 0.1884765625, "learning_rate": 8.848852834962018e-05, "loss": 0.0528, "step": 3435 }, { "epoch": 0.4418534870761713, "grad_norm": 0.189453125, "learning_rate": 8.848207619443421e-05, "loss": 0.0505, "step": 3436 }, { "epoch": 0.441982082386729, "grad_norm": 0.2041015625, "learning_rate": 8.847562246691559e-05, "loss": 0.0459, "step": 3437 }, { "epoch": 0.4421106776972866, "grad_norm": 0.1923828125, "learning_rate": 8.846916716732802e-05, "loss": 0.0496, "step": 3438 }, { "epoch": 0.4422392730078443, "grad_norm": 0.2001953125, "learning_rate": 8.846271029593526e-05, "loss": 0.0453, "step": 3439 }, { "epoch": 0.44236786831840197, "grad_norm": 0.1669921875, "learning_rate": 8.845625185300114e-05, "loss": 0.0493, "step": 3440 }, { "epoch": 0.44249646362895967, "grad_norm": 0.1787109375, "learning_rate": 8.844979183878952e-05, "loss": 0.0454, "step": 3441 }, { "epoch": 0.4426250589395173, "grad_norm": 0.17578125, "learning_rate": 8.844333025356437e-05, "loss": 0.0465, "step": 3442 }, { "epoch": 0.442753654250075, "grad_norm": 0.1796875, "learning_rate": 8.84368670975897e-05, "loss": 0.0461, "step": 3443 }, { "epoch": 0.4428822495606327, "grad_norm": 0.1796875, "learning_rate": 8.843040237112956e-05, "loss": 0.0461, "step": 3444 }, { "epoch": 0.44301084487119036, "grad_norm": 0.1591796875, "learning_rate": 8.842393607444814e-05, "loss": 0.0387, "step": 3445 }, { "epoch": 0.44313944018174806, "grad_norm": 0.1982421875, "learning_rate": 8.84174682078096e-05, "loss": 0.047, "step": 3446 }, { "epoch": 0.4432680354923057, "grad_norm": 0.19921875, "learning_rate": 8.841099877147823e-05, "loss": 0.054, "step": 3447 }, { "epoch": 0.4433966308028634, "grad_norm": 0.1669921875, "learning_rate": 8.840452776571836e-05, "loss": 0.0489, "step": 3448 }, { "epoch": 0.44352522611342104, "grad_norm": 0.1767578125, "learning_rate": 8.83980551907944e-05, "loss": 0.0427, "step": 3449 }, { "epoch": 0.44365382142397874, "grad_norm": 0.1865234375, "learning_rate": 8.839158104697079e-05, "loss": 0.047, "step": 3450 }, { "epoch": 0.4437824167345364, "grad_norm": 0.181640625, "learning_rate": 8.838510533451207e-05, "loss": 0.0442, "step": 3451 }, { "epoch": 0.4439110120450941, "grad_norm": 0.1787109375, "learning_rate": 8.837862805368283e-05, "loss": 0.0541, "step": 3452 }, { "epoch": 0.4440396073556518, "grad_norm": 0.2041015625, "learning_rate": 8.837214920474772e-05, "loss": 0.0458, "step": 3453 }, { "epoch": 0.44416820266620943, "grad_norm": 0.177734375, "learning_rate": 8.836566878797145e-05, "loss": 0.0442, "step": 3454 }, { "epoch": 0.44429679797676713, "grad_norm": 0.17578125, "learning_rate": 8.835918680361882e-05, "loss": 0.0437, "step": 3455 }, { "epoch": 0.4444253932873248, "grad_norm": 0.1748046875, "learning_rate": 8.835270325195466e-05, "loss": 0.0416, "step": 3456 }, { "epoch": 0.4445539885978825, "grad_norm": 0.17578125, "learning_rate": 8.834621813324391e-05, "loss": 0.0448, "step": 3457 }, { "epoch": 0.4446825839084401, "grad_norm": 0.21484375, "learning_rate": 8.83397314477515e-05, "loss": 0.0546, "step": 3458 }, { "epoch": 0.4448111792189978, "grad_norm": 0.185546875, "learning_rate": 8.83332431957425e-05, "loss": 0.0476, "step": 3459 }, { "epoch": 0.44493977452955547, "grad_norm": 0.189453125, "learning_rate": 8.8326753377482e-05, "loss": 0.0543, "step": 3460 }, { "epoch": 0.44506836984011316, "grad_norm": 0.18359375, "learning_rate": 8.832026199323515e-05, "loss": 0.0505, "step": 3461 }, { "epoch": 0.44519696515067086, "grad_norm": 0.1943359375, "learning_rate": 8.831376904326723e-05, "loss": 0.0496, "step": 3462 }, { "epoch": 0.4453255604612285, "grad_norm": 0.1650390625, "learning_rate": 8.830727452784349e-05, "loss": 0.0385, "step": 3463 }, { "epoch": 0.4454541557717862, "grad_norm": 0.1884765625, "learning_rate": 8.83007784472293e-05, "loss": 0.0459, "step": 3464 }, { "epoch": 0.44558275108234385, "grad_norm": 0.189453125, "learning_rate": 8.829428080169009e-05, "loss": 0.0489, "step": 3465 }, { "epoch": 0.44571134639290155, "grad_norm": 0.166015625, "learning_rate": 8.828778159149136e-05, "loss": 0.0387, "step": 3466 }, { "epoch": 0.4458399417034592, "grad_norm": 0.19140625, "learning_rate": 8.828128081689862e-05, "loss": 0.0478, "step": 3467 }, { "epoch": 0.4459685370140169, "grad_norm": 0.16796875, "learning_rate": 8.827477847817749e-05, "loss": 0.051, "step": 3468 }, { "epoch": 0.44609713232457454, "grad_norm": 0.2060546875, "learning_rate": 8.826827457559368e-05, "loss": 0.0529, "step": 3469 }, { "epoch": 0.44622572763513224, "grad_norm": 0.1884765625, "learning_rate": 8.826176910941292e-05, "loss": 0.049, "step": 3470 }, { "epoch": 0.44635432294568994, "grad_norm": 0.1787109375, "learning_rate": 8.825526207990101e-05, "loss": 0.0421, "step": 3471 }, { "epoch": 0.4464829182562476, "grad_norm": 0.17578125, "learning_rate": 8.824875348732382e-05, "loss": 0.0439, "step": 3472 }, { "epoch": 0.4466115135668053, "grad_norm": 0.2236328125, "learning_rate": 8.824224333194726e-05, "loss": 0.0468, "step": 3473 }, { "epoch": 0.44674010887736293, "grad_norm": 0.1826171875, "learning_rate": 8.823573161403738e-05, "loss": 0.0498, "step": 3474 }, { "epoch": 0.44686870418792063, "grad_norm": 0.1748046875, "learning_rate": 8.82292183338602e-05, "loss": 0.0482, "step": 3475 }, { "epoch": 0.4469972994984783, "grad_norm": 0.2060546875, "learning_rate": 8.822270349168187e-05, "loss": 0.0587, "step": 3476 }, { "epoch": 0.447125894809036, "grad_norm": 0.189453125, "learning_rate": 8.821618708776856e-05, "loss": 0.0482, "step": 3477 }, { "epoch": 0.4472544901195936, "grad_norm": 0.1884765625, "learning_rate": 8.820966912238651e-05, "loss": 0.048, "step": 3478 }, { "epoch": 0.4473830854301513, "grad_norm": 0.20703125, "learning_rate": 8.820314959580206e-05, "loss": 0.0502, "step": 3479 }, { "epoch": 0.44751168074070896, "grad_norm": 0.1982421875, "learning_rate": 8.819662850828161e-05, "loss": 0.0599, "step": 3480 }, { "epoch": 0.44764027605126666, "grad_norm": 0.18359375, "learning_rate": 8.819010586009156e-05, "loss": 0.0441, "step": 3481 }, { "epoch": 0.44776887136182436, "grad_norm": 0.2001953125, "learning_rate": 8.818358165149844e-05, "loss": 0.052, "step": 3482 }, { "epoch": 0.447897466672382, "grad_norm": 0.1875, "learning_rate": 8.817705588276882e-05, "loss": 0.0457, "step": 3483 }, { "epoch": 0.4480260619829397, "grad_norm": 0.169921875, "learning_rate": 8.817052855416934e-05, "loss": 0.0404, "step": 3484 }, { "epoch": 0.44815465729349735, "grad_norm": 0.1875, "learning_rate": 8.81639996659667e-05, "loss": 0.0421, "step": 3485 }, { "epoch": 0.44828325260405505, "grad_norm": 0.1953125, "learning_rate": 8.815746921842762e-05, "loss": 0.0532, "step": 3486 }, { "epoch": 0.4484118479146127, "grad_norm": 0.1826171875, "learning_rate": 8.815093721181899e-05, "loss": 0.0468, "step": 3487 }, { "epoch": 0.4485404432251704, "grad_norm": 0.177734375, "learning_rate": 8.814440364640767e-05, "loss": 0.0511, "step": 3488 }, { "epoch": 0.44866903853572804, "grad_norm": 0.2001953125, "learning_rate": 8.813786852246062e-05, "loss": 0.06, "step": 3489 }, { "epoch": 0.44879763384628574, "grad_norm": 0.173828125, "learning_rate": 8.813133184024484e-05, "loss": 0.0467, "step": 3490 }, { "epoch": 0.44892622915684344, "grad_norm": 0.1787109375, "learning_rate": 8.812479360002743e-05, "loss": 0.0561, "step": 3491 }, { "epoch": 0.4490548244674011, "grad_norm": 0.2451171875, "learning_rate": 8.811825380207555e-05, "loss": 0.0529, "step": 3492 }, { "epoch": 0.4491834197779588, "grad_norm": 0.16796875, "learning_rate": 8.811171244665635e-05, "loss": 0.0349, "step": 3493 }, { "epoch": 0.4493120150885164, "grad_norm": 0.1884765625, "learning_rate": 8.810516953403717e-05, "loss": 0.0477, "step": 3494 }, { "epoch": 0.4494406103990741, "grad_norm": 0.1845703125, "learning_rate": 8.80986250644853e-05, "loss": 0.0467, "step": 3495 }, { "epoch": 0.44956920570963177, "grad_norm": 0.177734375, "learning_rate": 8.809207903826817e-05, "loss": 0.047, "step": 3496 }, { "epoch": 0.44969780102018947, "grad_norm": 0.1953125, "learning_rate": 8.808553145565323e-05, "loss": 0.0507, "step": 3497 }, { "epoch": 0.4498263963307471, "grad_norm": 0.1875, "learning_rate": 8.807898231690798e-05, "loss": 0.0503, "step": 3498 }, { "epoch": 0.4499549916413048, "grad_norm": 0.2001953125, "learning_rate": 8.807243162230005e-05, "loss": 0.0599, "step": 3499 }, { "epoch": 0.4500835869518625, "grad_norm": 0.173828125, "learning_rate": 8.806587937209709e-05, "loss": 0.0449, "step": 3500 }, { "epoch": 0.4500835869518625, "eval_loss": 0.04734672233462334, "eval_runtime": 1043.2922, "eval_samples_per_second": 94.15, "eval_steps_per_second": 1.177, "step": 3500 }, { "epoch": 0.45021218226242016, "grad_norm": 0.166015625, "learning_rate": 8.80593255665668e-05, "loss": 0.0493, "step": 3501 }, { "epoch": 0.45034077757297786, "grad_norm": 0.158203125, "learning_rate": 8.805277020597696e-05, "loss": 0.0379, "step": 3502 }, { "epoch": 0.4504693728835355, "grad_norm": 0.173828125, "learning_rate": 8.804621329059543e-05, "loss": 0.043, "step": 3503 }, { "epoch": 0.4505979681940932, "grad_norm": 0.1875, "learning_rate": 8.80396548206901e-05, "loss": 0.0494, "step": 3504 }, { "epoch": 0.45072656350465085, "grad_norm": 0.1767578125, "learning_rate": 8.803309479652896e-05, "loss": 0.0455, "step": 3505 }, { "epoch": 0.45085515881520855, "grad_norm": 0.1728515625, "learning_rate": 8.802653321838004e-05, "loss": 0.0443, "step": 3506 }, { "epoch": 0.4509837541257662, "grad_norm": 0.1904296875, "learning_rate": 8.801997008651142e-05, "loss": 0.0529, "step": 3507 }, { "epoch": 0.4511123494363239, "grad_norm": 0.1875, "learning_rate": 8.801340540119127e-05, "loss": 0.0538, "step": 3508 }, { "epoch": 0.4512409447468816, "grad_norm": 0.1787109375, "learning_rate": 8.800683916268784e-05, "loss": 0.0465, "step": 3509 }, { "epoch": 0.45136954005743923, "grad_norm": 0.1728515625, "learning_rate": 8.800027137126941e-05, "loss": 0.0465, "step": 3510 }, { "epoch": 0.45149813536799693, "grad_norm": 0.1865234375, "learning_rate": 8.79937020272043e-05, "loss": 0.0576, "step": 3511 }, { "epoch": 0.4516267306785546, "grad_norm": 0.189453125, "learning_rate": 8.798713113076094e-05, "loss": 0.0505, "step": 3512 }, { "epoch": 0.4517553259891123, "grad_norm": 0.20703125, "learning_rate": 8.798055868220783e-05, "loss": 0.0477, "step": 3513 }, { "epoch": 0.4518839212996699, "grad_norm": 0.189453125, "learning_rate": 8.797398468181351e-05, "loss": 0.0571, "step": 3514 }, { "epoch": 0.4520125166102276, "grad_norm": 0.171875, "learning_rate": 8.796740912984657e-05, "loss": 0.0516, "step": 3515 }, { "epoch": 0.45214111192078527, "grad_norm": 0.197265625, "learning_rate": 8.796083202657568e-05, "loss": 0.0549, "step": 3516 }, { "epoch": 0.45226970723134297, "grad_norm": 0.1689453125, "learning_rate": 8.795425337226957e-05, "loss": 0.0465, "step": 3517 }, { "epoch": 0.45239830254190067, "grad_norm": 0.1845703125, "learning_rate": 8.794767316719705e-05, "loss": 0.0516, "step": 3518 }, { "epoch": 0.4525268978524583, "grad_norm": 0.1728515625, "learning_rate": 8.794109141162698e-05, "loss": 0.0475, "step": 3519 }, { "epoch": 0.452655493163016, "grad_norm": 0.181640625, "learning_rate": 8.793450810582829e-05, "loss": 0.0421, "step": 3520 }, { "epoch": 0.45278408847357365, "grad_norm": 0.1826171875, "learning_rate": 8.792792325006993e-05, "loss": 0.0431, "step": 3521 }, { "epoch": 0.45291268378413135, "grad_norm": 0.1787109375, "learning_rate": 8.792133684462098e-05, "loss": 0.0432, "step": 3522 }, { "epoch": 0.453041279094689, "grad_norm": 0.1689453125, "learning_rate": 8.791474888975055e-05, "loss": 0.0402, "step": 3523 }, { "epoch": 0.4531698744052467, "grad_norm": 0.1748046875, "learning_rate": 8.790815938572779e-05, "loss": 0.05, "step": 3524 }, { "epoch": 0.45329846971580434, "grad_norm": 0.193359375, "learning_rate": 8.790156833282198e-05, "loss": 0.0525, "step": 3525 }, { "epoch": 0.45342706502636204, "grad_norm": 0.1845703125, "learning_rate": 8.789497573130239e-05, "loss": 0.048, "step": 3526 }, { "epoch": 0.4535556603369197, "grad_norm": 0.193359375, "learning_rate": 8.788838158143841e-05, "loss": 0.0494, "step": 3527 }, { "epoch": 0.4536842556474774, "grad_norm": 0.1943359375, "learning_rate": 8.788178588349947e-05, "loss": 0.062, "step": 3528 }, { "epoch": 0.4538128509580351, "grad_norm": 0.173828125, "learning_rate": 8.787518863775503e-05, "loss": 0.0473, "step": 3529 }, { "epoch": 0.45394144626859273, "grad_norm": 0.1748046875, "learning_rate": 8.786858984447467e-05, "loss": 0.045, "step": 3530 }, { "epoch": 0.45407004157915043, "grad_norm": 0.181640625, "learning_rate": 8.7861989503928e-05, "loss": 0.0462, "step": 3531 }, { "epoch": 0.4541986368897081, "grad_norm": 0.15234375, "learning_rate": 8.785538761638473e-05, "loss": 0.0359, "step": 3532 }, { "epoch": 0.4543272322002658, "grad_norm": 0.162109375, "learning_rate": 8.784878418211458e-05, "loss": 0.0408, "step": 3533 }, { "epoch": 0.4544558275108234, "grad_norm": 0.1630859375, "learning_rate": 8.784217920138735e-05, "loss": 0.0375, "step": 3534 }, { "epoch": 0.4545844228213811, "grad_norm": 0.1796875, "learning_rate": 8.783557267447291e-05, "loss": 0.0522, "step": 3535 }, { "epoch": 0.45471301813193876, "grad_norm": 0.1953125, "learning_rate": 8.782896460164122e-05, "loss": 0.0437, "step": 3536 }, { "epoch": 0.45484161344249646, "grad_norm": 0.169921875, "learning_rate": 8.782235498316229e-05, "loss": 0.0345, "step": 3537 }, { "epoch": 0.45497020875305416, "grad_norm": 0.1767578125, "learning_rate": 8.781574381930613e-05, "loss": 0.0443, "step": 3538 }, { "epoch": 0.4550988040636118, "grad_norm": 0.1767578125, "learning_rate": 8.78091311103429e-05, "loss": 0.0455, "step": 3539 }, { "epoch": 0.4552273993741695, "grad_norm": 0.1826171875, "learning_rate": 8.780251685654279e-05, "loss": 0.0488, "step": 3540 }, { "epoch": 0.45535599468472715, "grad_norm": 0.1875, "learning_rate": 8.779590105817604e-05, "loss": 0.0532, "step": 3541 }, { "epoch": 0.45548458999528485, "grad_norm": 0.1611328125, "learning_rate": 8.778928371551294e-05, "loss": 0.0341, "step": 3542 }, { "epoch": 0.4556131853058425, "grad_norm": 0.2021484375, "learning_rate": 8.778266482882392e-05, "loss": 0.0472, "step": 3543 }, { "epoch": 0.4557417806164002, "grad_norm": 0.1787109375, "learning_rate": 8.777604439837938e-05, "loss": 0.0481, "step": 3544 }, { "epoch": 0.45587037592695784, "grad_norm": 0.1748046875, "learning_rate": 8.776942242444984e-05, "loss": 0.0467, "step": 3545 }, { "epoch": 0.45599897123751554, "grad_norm": 0.2041015625, "learning_rate": 8.776279890730585e-05, "loss": 0.0438, "step": 3546 }, { "epoch": 0.45612756654807324, "grad_norm": 0.181640625, "learning_rate": 8.775617384721805e-05, "loss": 0.0541, "step": 3547 }, { "epoch": 0.4562561618586309, "grad_norm": 0.1875, "learning_rate": 8.774954724445714e-05, "loss": 0.0573, "step": 3548 }, { "epoch": 0.4563847571691886, "grad_norm": 0.1884765625, "learning_rate": 8.774291909929387e-05, "loss": 0.0504, "step": 3549 }, { "epoch": 0.4565133524797462, "grad_norm": 0.1796875, "learning_rate": 8.773628941199903e-05, "loss": 0.046, "step": 3550 }, { "epoch": 0.4566419477903039, "grad_norm": 0.17578125, "learning_rate": 8.772965818284355e-05, "loss": 0.0447, "step": 3551 }, { "epoch": 0.45677054310086157, "grad_norm": 0.1767578125, "learning_rate": 8.772302541209834e-05, "loss": 0.0468, "step": 3552 }, { "epoch": 0.45689913841141927, "grad_norm": 0.1689453125, "learning_rate": 8.771639110003444e-05, "loss": 0.0417, "step": 3553 }, { "epoch": 0.4570277337219769, "grad_norm": 0.1630859375, "learning_rate": 8.770975524692287e-05, "loss": 0.0397, "step": 3554 }, { "epoch": 0.4571563290325346, "grad_norm": 0.16796875, "learning_rate": 8.770311785303481e-05, "loss": 0.0457, "step": 3555 }, { "epoch": 0.4572849243430923, "grad_norm": 0.1728515625, "learning_rate": 8.769647891864143e-05, "loss": 0.0436, "step": 3556 }, { "epoch": 0.45741351965364996, "grad_norm": 0.177734375, "learning_rate": 8.7689838444014e-05, "loss": 0.0489, "step": 3557 }, { "epoch": 0.45754211496420766, "grad_norm": 0.1767578125, "learning_rate": 8.768319642942384e-05, "loss": 0.0482, "step": 3558 }, { "epoch": 0.4576707102747653, "grad_norm": 0.1796875, "learning_rate": 8.767655287514233e-05, "loss": 0.0492, "step": 3559 }, { "epoch": 0.457799305585323, "grad_norm": 0.2119140625, "learning_rate": 8.766990778144094e-05, "loss": 0.0559, "step": 3560 }, { "epoch": 0.45792790089588065, "grad_norm": 0.18359375, "learning_rate": 8.766326114859113e-05, "loss": 0.0509, "step": 3561 }, { "epoch": 0.45805649620643835, "grad_norm": 0.203125, "learning_rate": 8.765661297686453e-05, "loss": 0.0497, "step": 3562 }, { "epoch": 0.458185091516996, "grad_norm": 0.1826171875, "learning_rate": 8.764996326653274e-05, "loss": 0.0491, "step": 3563 }, { "epoch": 0.4583136868275537, "grad_norm": 0.177734375, "learning_rate": 8.764331201786749e-05, "loss": 0.0484, "step": 3564 }, { "epoch": 0.4584422821381114, "grad_norm": 0.1982421875, "learning_rate": 8.76366592311405e-05, "loss": 0.0528, "step": 3565 }, { "epoch": 0.45857087744866903, "grad_norm": 0.1943359375, "learning_rate": 8.763000490662364e-05, "loss": 0.044, "step": 3566 }, { "epoch": 0.45869947275922673, "grad_norm": 0.1748046875, "learning_rate": 8.762334904458875e-05, "loss": 0.051, "step": 3567 }, { "epoch": 0.4588280680697844, "grad_norm": 0.197265625, "learning_rate": 8.761669164530784e-05, "loss": 0.0527, "step": 3568 }, { "epoch": 0.4589566633803421, "grad_norm": 0.193359375, "learning_rate": 8.761003270905288e-05, "loss": 0.057, "step": 3569 }, { "epoch": 0.4590852586908997, "grad_norm": 0.1708984375, "learning_rate": 8.760337223609595e-05, "loss": 0.0523, "step": 3570 }, { "epoch": 0.4592138540014574, "grad_norm": 0.1943359375, "learning_rate": 8.75967102267092e-05, "loss": 0.0472, "step": 3571 }, { "epoch": 0.45934244931201507, "grad_norm": 0.1650390625, "learning_rate": 8.759004668116484e-05, "loss": 0.0434, "step": 3572 }, { "epoch": 0.45947104462257277, "grad_norm": 0.171875, "learning_rate": 8.758338159973512e-05, "loss": 0.0477, "step": 3573 }, { "epoch": 0.45959963993313047, "grad_norm": 0.2021484375, "learning_rate": 8.757671498269236e-05, "loss": 0.0474, "step": 3574 }, { "epoch": 0.4597282352436881, "grad_norm": 0.158203125, "learning_rate": 8.757004683030896e-05, "loss": 0.0423, "step": 3575 }, { "epoch": 0.4598568305542458, "grad_norm": 0.1787109375, "learning_rate": 8.756337714285738e-05, "loss": 0.0469, "step": 3576 }, { "epoch": 0.45998542586480345, "grad_norm": 0.1630859375, "learning_rate": 8.755670592061013e-05, "loss": 0.0378, "step": 3577 }, { "epoch": 0.46011402117536115, "grad_norm": 0.1953125, "learning_rate": 8.75500331638398e-05, "loss": 0.0542, "step": 3578 }, { "epoch": 0.4602426164859188, "grad_norm": 0.171875, "learning_rate": 8.7543358872819e-05, "loss": 0.0466, "step": 3579 }, { "epoch": 0.4603712117964765, "grad_norm": 0.171875, "learning_rate": 8.753668304782046e-05, "loss": 0.0456, "step": 3580 }, { "epoch": 0.46049980710703414, "grad_norm": 0.1669921875, "learning_rate": 8.753000568911694e-05, "loss": 0.0452, "step": 3581 }, { "epoch": 0.46062840241759184, "grad_norm": 0.1923828125, "learning_rate": 8.752332679698128e-05, "loss": 0.0499, "step": 3582 }, { "epoch": 0.4607569977281495, "grad_norm": 0.1865234375, "learning_rate": 8.751664637168635e-05, "loss": 0.0482, "step": 3583 }, { "epoch": 0.4608855930387072, "grad_norm": 0.1962890625, "learning_rate": 8.75099644135051e-05, "loss": 0.0538, "step": 3584 }, { "epoch": 0.4610141883492649, "grad_norm": 0.1787109375, "learning_rate": 8.750328092271058e-05, "loss": 0.0438, "step": 3585 }, { "epoch": 0.46114278365982253, "grad_norm": 0.201171875, "learning_rate": 8.749659589957585e-05, "loss": 0.0547, "step": 3586 }, { "epoch": 0.46127137897038023, "grad_norm": 0.181640625, "learning_rate": 8.748990934437404e-05, "loss": 0.0444, "step": 3587 }, { "epoch": 0.4613999742809379, "grad_norm": 0.1826171875, "learning_rate": 8.748322125737839e-05, "loss": 0.0516, "step": 3588 }, { "epoch": 0.4615285695914956, "grad_norm": 0.1982421875, "learning_rate": 8.747653163886214e-05, "loss": 0.0576, "step": 3589 }, { "epoch": 0.4616571649020532, "grad_norm": 0.181640625, "learning_rate": 8.74698404890986e-05, "loss": 0.048, "step": 3590 }, { "epoch": 0.4617857602126109, "grad_norm": 0.18359375, "learning_rate": 8.746314780836123e-05, "loss": 0.0457, "step": 3591 }, { "epoch": 0.46191435552316856, "grad_norm": 0.173828125, "learning_rate": 8.74564535969234e-05, "loss": 0.0408, "step": 3592 }, { "epoch": 0.46204295083372626, "grad_norm": 0.1787109375, "learning_rate": 8.74497578550587e-05, "loss": 0.0442, "step": 3593 }, { "epoch": 0.46217154614428396, "grad_norm": 0.197265625, "learning_rate": 8.744306058304068e-05, "loss": 0.0518, "step": 3594 }, { "epoch": 0.4623001414548416, "grad_norm": 0.1806640625, "learning_rate": 8.743636178114297e-05, "loss": 0.0493, "step": 3595 }, { "epoch": 0.4624287367653993, "grad_norm": 0.26171875, "learning_rate": 8.74296614496393e-05, "loss": 0.0387, "step": 3596 }, { "epoch": 0.46255733207595695, "grad_norm": 0.1875, "learning_rate": 8.742295958880343e-05, "loss": 0.0509, "step": 3597 }, { "epoch": 0.46268592738651465, "grad_norm": 0.1845703125, "learning_rate": 8.741625619890917e-05, "loss": 0.0517, "step": 3598 }, { "epoch": 0.4628145226970723, "grad_norm": 0.203125, "learning_rate": 8.740955128023043e-05, "loss": 0.057, "step": 3599 }, { "epoch": 0.46294311800763, "grad_norm": 0.193359375, "learning_rate": 8.740284483304117e-05, "loss": 0.057, "step": 3600 }, { "epoch": 0.46307171331818764, "grad_norm": 0.181640625, "learning_rate": 8.739613685761541e-05, "loss": 0.051, "step": 3601 }, { "epoch": 0.46320030862874534, "grad_norm": 0.1962890625, "learning_rate": 8.738942735422723e-05, "loss": 0.0565, "step": 3602 }, { "epoch": 0.46332890393930304, "grad_norm": 0.193359375, "learning_rate": 8.738271632315075e-05, "loss": 0.051, "step": 3603 }, { "epoch": 0.4634574992498607, "grad_norm": 0.1708984375, "learning_rate": 8.737600376466019e-05, "loss": 0.0464, "step": 3604 }, { "epoch": 0.4635860945604184, "grad_norm": 0.1943359375, "learning_rate": 8.736928967902982e-05, "loss": 0.058, "step": 3605 }, { "epoch": 0.463714689870976, "grad_norm": 0.1875, "learning_rate": 8.736257406653397e-05, "loss": 0.0516, "step": 3606 }, { "epoch": 0.4638432851815337, "grad_norm": 0.1787109375, "learning_rate": 8.7355856927447e-05, "loss": 0.0508, "step": 3607 }, { "epoch": 0.46397188049209137, "grad_norm": 0.1845703125, "learning_rate": 8.734913826204342e-05, "loss": 0.0519, "step": 3608 }, { "epoch": 0.46410047580264907, "grad_norm": 0.1767578125, "learning_rate": 8.734241807059773e-05, "loss": 0.0487, "step": 3609 }, { "epoch": 0.4642290711132067, "grad_norm": 0.1748046875, "learning_rate": 8.733569635338449e-05, "loss": 0.0431, "step": 3610 }, { "epoch": 0.4643576664237644, "grad_norm": 0.1982421875, "learning_rate": 8.732897311067834e-05, "loss": 0.0487, "step": 3611 }, { "epoch": 0.4644862617343221, "grad_norm": 0.2138671875, "learning_rate": 8.7322248342754e-05, "loss": 0.0593, "step": 3612 }, { "epoch": 0.46461485704487976, "grad_norm": 0.16796875, "learning_rate": 8.731552204988624e-05, "loss": 0.0482, "step": 3613 }, { "epoch": 0.46474345235543746, "grad_norm": 0.1865234375, "learning_rate": 8.730879423234988e-05, "loss": 0.0455, "step": 3614 }, { "epoch": 0.4648720476659951, "grad_norm": 0.1884765625, "learning_rate": 8.730206489041979e-05, "loss": 0.0504, "step": 3615 }, { "epoch": 0.4650006429765528, "grad_norm": 0.177734375, "learning_rate": 8.729533402437096e-05, "loss": 0.0453, "step": 3616 }, { "epoch": 0.46512923828711045, "grad_norm": 0.18359375, "learning_rate": 8.728860163447838e-05, "loss": 0.0505, "step": 3617 }, { "epoch": 0.46525783359766815, "grad_norm": 0.177734375, "learning_rate": 8.728186772101714e-05, "loss": 0.0507, "step": 3618 }, { "epoch": 0.4653864289082258, "grad_norm": 0.16796875, "learning_rate": 8.727513228426237e-05, "loss": 0.0413, "step": 3619 }, { "epoch": 0.4655150242187835, "grad_norm": 0.1806640625, "learning_rate": 8.72683953244893e-05, "loss": 0.0491, "step": 3620 }, { "epoch": 0.4656436195293412, "grad_norm": 0.1982421875, "learning_rate": 8.726165684197314e-05, "loss": 0.0559, "step": 3621 }, { "epoch": 0.46577221483989883, "grad_norm": 0.1806640625, "learning_rate": 8.725491683698927e-05, "loss": 0.044, "step": 3622 }, { "epoch": 0.46590081015045653, "grad_norm": 0.1708984375, "learning_rate": 8.724817530981304e-05, "loss": 0.0349, "step": 3623 }, { "epoch": 0.4660294054610142, "grad_norm": 0.177734375, "learning_rate": 8.724143226071992e-05, "loss": 0.0509, "step": 3624 }, { "epoch": 0.4661580007715719, "grad_norm": 0.232421875, "learning_rate": 8.723468768998544e-05, "loss": 0.047, "step": 3625 }, { "epoch": 0.4662865960821295, "grad_norm": 0.171875, "learning_rate": 8.722794159788515e-05, "loss": 0.0446, "step": 3626 }, { "epoch": 0.4664151913926872, "grad_norm": 0.1787109375, "learning_rate": 8.722119398469468e-05, "loss": 0.0454, "step": 3627 }, { "epoch": 0.46654378670324487, "grad_norm": 0.193359375, "learning_rate": 8.721444485068975e-05, "loss": 0.0548, "step": 3628 }, { "epoch": 0.46667238201380257, "grad_norm": 0.150390625, "learning_rate": 8.720769419614612e-05, "loss": 0.0355, "step": 3629 }, { "epoch": 0.4668009773243602, "grad_norm": 0.1728515625, "learning_rate": 8.720094202133961e-05, "loss": 0.0377, "step": 3630 }, { "epoch": 0.4669295726349179, "grad_norm": 0.1728515625, "learning_rate": 8.71941883265461e-05, "loss": 0.0451, "step": 3631 }, { "epoch": 0.4670581679454756, "grad_norm": 0.1669921875, "learning_rate": 8.718743311204156e-05, "loss": 0.0376, "step": 3632 }, { "epoch": 0.46718676325603326, "grad_norm": 0.1669921875, "learning_rate": 8.718067637810197e-05, "loss": 0.0374, "step": 3633 }, { "epoch": 0.46731535856659095, "grad_norm": 0.2021484375, "learning_rate": 8.717391812500339e-05, "loss": 0.0514, "step": 3634 }, { "epoch": 0.4674439538771486, "grad_norm": 0.1806640625, "learning_rate": 8.716715835302202e-05, "loss": 0.0452, "step": 3635 }, { "epoch": 0.4675725491877063, "grad_norm": 0.169921875, "learning_rate": 8.7160397062434e-05, "loss": 0.0435, "step": 3636 }, { "epoch": 0.46770114449826394, "grad_norm": 0.216796875, "learning_rate": 8.71536342535156e-05, "loss": 0.0595, "step": 3637 }, { "epoch": 0.46782973980882164, "grad_norm": 0.197265625, "learning_rate": 8.714686992654315e-05, "loss": 0.0488, "step": 3638 }, { "epoch": 0.4679583351193793, "grad_norm": 0.171875, "learning_rate": 8.714010408179303e-05, "loss": 0.0396, "step": 3639 }, { "epoch": 0.468086930429937, "grad_norm": 0.1796875, "learning_rate": 8.713333671954169e-05, "loss": 0.0483, "step": 3640 }, { "epoch": 0.4682155257404947, "grad_norm": 0.1806640625, "learning_rate": 8.712656784006561e-05, "loss": 0.0451, "step": 3641 }, { "epoch": 0.46834412105105233, "grad_norm": 0.1552734375, "learning_rate": 8.711979744364138e-05, "loss": 0.0372, "step": 3642 }, { "epoch": 0.46847271636161003, "grad_norm": 0.208984375, "learning_rate": 8.711302553054563e-05, "loss": 0.0503, "step": 3643 }, { "epoch": 0.4686013116721677, "grad_norm": 0.1904296875, "learning_rate": 8.710625210105508e-05, "loss": 0.0443, "step": 3644 }, { "epoch": 0.4687299069827254, "grad_norm": 0.2041015625, "learning_rate": 8.709947715544642e-05, "loss": 0.0532, "step": 3645 }, { "epoch": 0.468858502293283, "grad_norm": 0.203125, "learning_rate": 8.70927006939965e-05, "loss": 0.0504, "step": 3646 }, { "epoch": 0.4689870976038407, "grad_norm": 0.216796875, "learning_rate": 8.708592271698222e-05, "loss": 0.0535, "step": 3647 }, { "epoch": 0.46911569291439836, "grad_norm": 0.1748046875, "learning_rate": 8.707914322468049e-05, "loss": 0.0403, "step": 3648 }, { "epoch": 0.46924428822495606, "grad_norm": 0.19140625, "learning_rate": 8.70723622173683e-05, "loss": 0.047, "step": 3649 }, { "epoch": 0.46937288353551376, "grad_norm": 0.19921875, "learning_rate": 8.706557969532276e-05, "loss": 0.0503, "step": 3650 }, { "epoch": 0.4695014788460714, "grad_norm": 0.2177734375, "learning_rate": 8.705879565882098e-05, "loss": 0.0432, "step": 3651 }, { "epoch": 0.4696300741566291, "grad_norm": 0.1923828125, "learning_rate": 8.705201010814012e-05, "loss": 0.0478, "step": 3652 }, { "epoch": 0.46975866946718675, "grad_norm": 0.1767578125, "learning_rate": 8.704522304355745e-05, "loss": 0.0488, "step": 3653 }, { "epoch": 0.46988726477774445, "grad_norm": 0.171875, "learning_rate": 8.70384344653503e-05, "loss": 0.0458, "step": 3654 }, { "epoch": 0.4700158600883021, "grad_norm": 0.2001953125, "learning_rate": 8.703164437379601e-05, "loss": 0.057, "step": 3655 }, { "epoch": 0.4701444553988598, "grad_norm": 0.2060546875, "learning_rate": 8.702485276917202e-05, "loss": 0.0471, "step": 3656 }, { "epoch": 0.47027305070941744, "grad_norm": 0.2080078125, "learning_rate": 8.701805965175585e-05, "loss": 0.0498, "step": 3657 }, { "epoch": 0.47040164601997514, "grad_norm": 0.197265625, "learning_rate": 8.701126502182504e-05, "loss": 0.041, "step": 3658 }, { "epoch": 0.47053024133053284, "grad_norm": 0.19140625, "learning_rate": 8.700446887965721e-05, "loss": 0.0477, "step": 3659 }, { "epoch": 0.4706588366410905, "grad_norm": 0.197265625, "learning_rate": 8.699767122553004e-05, "loss": 0.0485, "step": 3660 }, { "epoch": 0.4707874319516482, "grad_norm": 0.1884765625, "learning_rate": 8.69908720597213e-05, "loss": 0.0467, "step": 3661 }, { "epoch": 0.4709160272622058, "grad_norm": 0.1962890625, "learning_rate": 8.698407138250878e-05, "loss": 0.0532, "step": 3662 }, { "epoch": 0.4710446225727635, "grad_norm": 0.181640625, "learning_rate": 8.697726919417034e-05, "loss": 0.0442, "step": 3663 }, { "epoch": 0.47117321788332117, "grad_norm": 0.193359375, "learning_rate": 8.697046549498391e-05, "loss": 0.0436, "step": 3664 }, { "epoch": 0.47130181319387887, "grad_norm": 0.1845703125, "learning_rate": 8.696366028522748e-05, "loss": 0.0498, "step": 3665 }, { "epoch": 0.4714304085044365, "grad_norm": 0.2041015625, "learning_rate": 8.695685356517913e-05, "loss": 0.0532, "step": 3666 }, { "epoch": 0.4715590038149942, "grad_norm": 0.16015625, "learning_rate": 8.695004533511695e-05, "loss": 0.0362, "step": 3667 }, { "epoch": 0.4716875991255519, "grad_norm": 0.1708984375, "learning_rate": 8.694323559531912e-05, "loss": 0.0429, "step": 3668 }, { "epoch": 0.47181619443610956, "grad_norm": 0.2041015625, "learning_rate": 8.693642434606386e-05, "loss": 0.0513, "step": 3669 }, { "epoch": 0.47194478974666726, "grad_norm": 0.1923828125, "learning_rate": 8.69296115876295e-05, "loss": 0.0519, "step": 3670 }, { "epoch": 0.4720733850572249, "grad_norm": 0.2314453125, "learning_rate": 8.69227973202944e-05, "loss": 0.0549, "step": 3671 }, { "epoch": 0.4722019803677826, "grad_norm": 0.197265625, "learning_rate": 8.691598154433696e-05, "loss": 0.0489, "step": 3672 }, { "epoch": 0.47233057567834025, "grad_norm": 0.1826171875, "learning_rate": 8.690916426003569e-05, "loss": 0.0439, "step": 3673 }, { "epoch": 0.47245917098889795, "grad_norm": 0.1875, "learning_rate": 8.69023454676691e-05, "loss": 0.046, "step": 3674 }, { "epoch": 0.4725877662994556, "grad_norm": 0.1943359375, "learning_rate": 8.689552516751584e-05, "loss": 0.0486, "step": 3675 }, { "epoch": 0.4727163616100133, "grad_norm": 0.1875, "learning_rate": 8.688870335985456e-05, "loss": 0.0439, "step": 3676 }, { "epoch": 0.47284495692057094, "grad_norm": 0.1962890625, "learning_rate": 8.688188004496398e-05, "loss": 0.0391, "step": 3677 }, { "epoch": 0.47297355223112864, "grad_norm": 0.2216796875, "learning_rate": 8.687505522312292e-05, "loss": 0.0574, "step": 3678 }, { "epoch": 0.47310214754168634, "grad_norm": 0.2099609375, "learning_rate": 8.68682288946102e-05, "loss": 0.0493, "step": 3679 }, { "epoch": 0.473230742852244, "grad_norm": 0.171875, "learning_rate": 8.686140105970475e-05, "loss": 0.0445, "step": 3680 }, { "epoch": 0.4733593381628017, "grad_norm": 0.1787109375, "learning_rate": 8.685457171868558e-05, "loss": 0.0443, "step": 3681 }, { "epoch": 0.4734879334733593, "grad_norm": 0.1904296875, "learning_rate": 8.684774087183169e-05, "loss": 0.0509, "step": 3682 }, { "epoch": 0.473616528783917, "grad_norm": 0.1884765625, "learning_rate": 8.684090851942218e-05, "loss": 0.044, "step": 3683 }, { "epoch": 0.47374512409447467, "grad_norm": 0.171875, "learning_rate": 8.683407466173623e-05, "loss": 0.0459, "step": 3684 }, { "epoch": 0.47387371940503237, "grad_norm": 0.1767578125, "learning_rate": 8.682723929905305e-05, "loss": 0.0428, "step": 3685 }, { "epoch": 0.47400231471559, "grad_norm": 0.1845703125, "learning_rate": 8.682040243165195e-05, "loss": 0.0475, "step": 3686 }, { "epoch": 0.4741309100261477, "grad_norm": 0.171875, "learning_rate": 8.681356405981224e-05, "loss": 0.0479, "step": 3687 }, { "epoch": 0.4742595053367054, "grad_norm": 0.1953125, "learning_rate": 8.680672418381335e-05, "loss": 0.0427, "step": 3688 }, { "epoch": 0.47438810064726306, "grad_norm": 0.23046875, "learning_rate": 8.679988280393475e-05, "loss": 0.0515, "step": 3689 }, { "epoch": 0.47451669595782076, "grad_norm": 0.2041015625, "learning_rate": 8.679303992045596e-05, "loss": 0.0601, "step": 3690 }, { "epoch": 0.4746452912683784, "grad_norm": 0.1787109375, "learning_rate": 8.678619553365659e-05, "loss": 0.0445, "step": 3691 }, { "epoch": 0.4747738865789361, "grad_norm": 0.1748046875, "learning_rate": 8.677934964381626e-05, "loss": 0.0488, "step": 3692 }, { "epoch": 0.47490248188949374, "grad_norm": 0.18359375, "learning_rate": 8.677250225121473e-05, "loss": 0.0451, "step": 3693 }, { "epoch": 0.47503107720005144, "grad_norm": 0.1904296875, "learning_rate": 8.676565335613174e-05, "loss": 0.0494, "step": 3694 }, { "epoch": 0.4751596725106091, "grad_norm": 0.17578125, "learning_rate": 8.675880295884716e-05, "loss": 0.0444, "step": 3695 }, { "epoch": 0.4752882678211668, "grad_norm": 0.1728515625, "learning_rate": 8.675195105964083e-05, "loss": 0.0512, "step": 3696 }, { "epoch": 0.4754168631317245, "grad_norm": 0.18359375, "learning_rate": 8.674509765879278e-05, "loss": 0.0548, "step": 3697 }, { "epoch": 0.47554545844228213, "grad_norm": 0.1787109375, "learning_rate": 8.6738242756583e-05, "loss": 0.041, "step": 3698 }, { "epoch": 0.47567405375283983, "grad_norm": 0.19921875, "learning_rate": 8.673138635329157e-05, "loss": 0.0489, "step": 3699 }, { "epoch": 0.4758026490633975, "grad_norm": 0.1748046875, "learning_rate": 8.672452844919864e-05, "loss": 0.0425, "step": 3700 }, { "epoch": 0.4759312443739552, "grad_norm": 0.1728515625, "learning_rate": 8.671766904458443e-05, "loss": 0.0486, "step": 3701 }, { "epoch": 0.4760598396845128, "grad_norm": 0.1669921875, "learning_rate": 8.671080813972918e-05, "loss": 0.0452, "step": 3702 }, { "epoch": 0.4761884349950705, "grad_norm": 0.15625, "learning_rate": 8.670394573491323e-05, "loss": 0.039, "step": 3703 }, { "epoch": 0.47631703030562816, "grad_norm": 0.18359375, "learning_rate": 8.669708183041698e-05, "loss": 0.043, "step": 3704 }, { "epoch": 0.47644562561618586, "grad_norm": 0.173828125, "learning_rate": 8.669021642652088e-05, "loss": 0.0459, "step": 3705 }, { "epoch": 0.47657422092674356, "grad_norm": 0.173828125, "learning_rate": 8.668334952350543e-05, "loss": 0.0507, "step": 3706 }, { "epoch": 0.4767028162373012, "grad_norm": 0.17578125, "learning_rate": 8.667648112165121e-05, "loss": 0.0474, "step": 3707 }, { "epoch": 0.4768314115478589, "grad_norm": 0.1982421875, "learning_rate": 8.666961122123884e-05, "loss": 0.0493, "step": 3708 }, { "epoch": 0.47696000685841655, "grad_norm": 0.1796875, "learning_rate": 8.666273982254904e-05, "loss": 0.0435, "step": 3709 }, { "epoch": 0.47708860216897425, "grad_norm": 0.185546875, "learning_rate": 8.665586692586258e-05, "loss": 0.0541, "step": 3710 }, { "epoch": 0.4772171974795319, "grad_norm": 0.2138671875, "learning_rate": 8.664899253146023e-05, "loss": 0.0625, "step": 3711 }, { "epoch": 0.4773457927900896, "grad_norm": 0.1640625, "learning_rate": 8.664211663962289e-05, "loss": 0.0407, "step": 3712 }, { "epoch": 0.47747438810064724, "grad_norm": 0.1865234375, "learning_rate": 8.663523925063153e-05, "loss": 0.0471, "step": 3713 }, { "epoch": 0.47760298341120494, "grad_norm": 0.1630859375, "learning_rate": 8.662836036476712e-05, "loss": 0.0444, "step": 3714 }, { "epoch": 0.47773157872176264, "grad_norm": 0.203125, "learning_rate": 8.662147998231073e-05, "loss": 0.0549, "step": 3715 }, { "epoch": 0.4778601740323203, "grad_norm": 0.193359375, "learning_rate": 8.661459810354349e-05, "loss": 0.0559, "step": 3716 }, { "epoch": 0.477988769342878, "grad_norm": 0.1748046875, "learning_rate": 8.660771472874657e-05, "loss": 0.0423, "step": 3717 }, { "epoch": 0.47811736465343563, "grad_norm": 0.1845703125, "learning_rate": 8.660082985820123e-05, "loss": 0.0444, "step": 3718 }, { "epoch": 0.47824595996399333, "grad_norm": 0.203125, "learning_rate": 8.659394349218878e-05, "loss": 0.0596, "step": 3719 }, { "epoch": 0.47837455527455097, "grad_norm": 0.173828125, "learning_rate": 8.65870556309906e-05, "loss": 0.0419, "step": 3720 }, { "epoch": 0.47850315058510867, "grad_norm": 0.1875, "learning_rate": 8.658016627488809e-05, "loss": 0.0463, "step": 3721 }, { "epoch": 0.4786317458956663, "grad_norm": 0.17578125, "learning_rate": 8.657327542416276e-05, "loss": 0.0423, "step": 3722 }, { "epoch": 0.478760341206224, "grad_norm": 0.1865234375, "learning_rate": 8.656638307909617e-05, "loss": 0.0395, "step": 3723 }, { "epoch": 0.47888893651678166, "grad_norm": 0.1796875, "learning_rate": 8.65594892399699e-05, "loss": 0.0391, "step": 3724 }, { "epoch": 0.47901753182733936, "grad_norm": 0.19140625, "learning_rate": 8.655259390706566e-05, "loss": 0.0425, "step": 3725 }, { "epoch": 0.47914612713789706, "grad_norm": 0.189453125, "learning_rate": 8.654569708066517e-05, "loss": 0.0487, "step": 3726 }, { "epoch": 0.4792747224484547, "grad_norm": 0.173828125, "learning_rate": 8.653879876105023e-05, "loss": 0.0437, "step": 3727 }, { "epoch": 0.4794033177590124, "grad_norm": 0.185546875, "learning_rate": 8.653189894850267e-05, "loss": 0.0472, "step": 3728 }, { "epoch": 0.47953191306957005, "grad_norm": 0.1982421875, "learning_rate": 8.652499764330446e-05, "loss": 0.0443, "step": 3729 }, { "epoch": 0.47966050838012775, "grad_norm": 0.2001953125, "learning_rate": 8.651809484573755e-05, "loss": 0.0507, "step": 3730 }, { "epoch": 0.4797891036906854, "grad_norm": 0.1943359375, "learning_rate": 8.651119055608396e-05, "loss": 0.0511, "step": 3731 }, { "epoch": 0.4799176990012431, "grad_norm": 0.1982421875, "learning_rate": 8.650428477462585e-05, "loss": 0.0508, "step": 3732 }, { "epoch": 0.48004629431180074, "grad_norm": 0.1953125, "learning_rate": 8.649737750164532e-05, "loss": 0.0602, "step": 3733 }, { "epoch": 0.48017488962235844, "grad_norm": 0.1650390625, "learning_rate": 8.649046873742461e-05, "loss": 0.0345, "step": 3734 }, { "epoch": 0.48030348493291614, "grad_norm": 0.1650390625, "learning_rate": 8.648355848224604e-05, "loss": 0.0423, "step": 3735 }, { "epoch": 0.4804320802434738, "grad_norm": 0.1572265625, "learning_rate": 8.647664673639192e-05, "loss": 0.0369, "step": 3736 }, { "epoch": 0.4805606755540315, "grad_norm": 0.158203125, "learning_rate": 8.646973350014464e-05, "loss": 0.041, "step": 3737 }, { "epoch": 0.4806892708645891, "grad_norm": 0.177734375, "learning_rate": 8.646281877378671e-05, "loss": 0.0462, "step": 3738 }, { "epoch": 0.4808178661751468, "grad_norm": 0.1953125, "learning_rate": 8.645590255760065e-05, "loss": 0.0573, "step": 3739 }, { "epoch": 0.48094646148570447, "grad_norm": 0.1630859375, "learning_rate": 8.644898485186901e-05, "loss": 0.046, "step": 3740 }, { "epoch": 0.48107505679626217, "grad_norm": 0.1806640625, "learning_rate": 8.644206565687448e-05, "loss": 0.0486, "step": 3741 }, { "epoch": 0.4812036521068198, "grad_norm": 0.1806640625, "learning_rate": 8.643514497289976e-05, "loss": 0.0447, "step": 3742 }, { "epoch": 0.4813322474173775, "grad_norm": 0.1640625, "learning_rate": 8.642822280022761e-05, "loss": 0.0438, "step": 3743 }, { "epoch": 0.4814608427279352, "grad_norm": 0.1767578125, "learning_rate": 8.642129913914087e-05, "loss": 0.0525, "step": 3744 }, { "epoch": 0.48158943803849286, "grad_norm": 0.18359375, "learning_rate": 8.641437398992244e-05, "loss": 0.0572, "step": 3745 }, { "epoch": 0.48171803334905056, "grad_norm": 0.158203125, "learning_rate": 8.640744735285525e-05, "loss": 0.0444, "step": 3746 }, { "epoch": 0.4818466286596082, "grad_norm": 0.1845703125, "learning_rate": 8.640051922822235e-05, "loss": 0.0502, "step": 3747 }, { "epoch": 0.4819752239701659, "grad_norm": 0.19140625, "learning_rate": 8.639358961630677e-05, "loss": 0.051, "step": 3748 }, { "epoch": 0.48210381928072354, "grad_norm": 0.2353515625, "learning_rate": 8.638665851739167e-05, "loss": 0.0465, "step": 3749 }, { "epoch": 0.48223241459128124, "grad_norm": 0.1865234375, "learning_rate": 8.637972593176027e-05, "loss": 0.0419, "step": 3750 }, { "epoch": 0.4823610099018389, "grad_norm": 0.173828125, "learning_rate": 8.63727918596958e-05, "loss": 0.0442, "step": 3751 }, { "epoch": 0.4824896052123966, "grad_norm": 0.177734375, "learning_rate": 8.636585630148158e-05, "loss": 0.0451, "step": 3752 }, { "epoch": 0.4826182005229543, "grad_norm": 0.1787109375, "learning_rate": 8.635891925740097e-05, "loss": 0.0452, "step": 3753 }, { "epoch": 0.48274679583351193, "grad_norm": 0.197265625, "learning_rate": 8.635198072773745e-05, "loss": 0.0542, "step": 3754 }, { "epoch": 0.48287539114406963, "grad_norm": 0.1796875, "learning_rate": 8.634504071277449e-05, "loss": 0.0414, "step": 3755 }, { "epoch": 0.4830039864546273, "grad_norm": 0.189453125, "learning_rate": 8.633809921279568e-05, "loss": 0.0458, "step": 3756 }, { "epoch": 0.483132581765185, "grad_norm": 0.197265625, "learning_rate": 8.633115622808462e-05, "loss": 0.0529, "step": 3757 }, { "epoch": 0.4832611770757426, "grad_norm": 0.1923828125, "learning_rate": 8.632421175892499e-05, "loss": 0.0522, "step": 3758 }, { "epoch": 0.4833897723863003, "grad_norm": 0.1796875, "learning_rate": 8.631726580560053e-05, "loss": 0.0492, "step": 3759 }, { "epoch": 0.48351836769685796, "grad_norm": 0.2060546875, "learning_rate": 8.631031836839507e-05, "loss": 0.0541, "step": 3760 }, { "epoch": 0.48364696300741566, "grad_norm": 0.2001953125, "learning_rate": 8.630336944759244e-05, "loss": 0.0532, "step": 3761 }, { "epoch": 0.48377555831797336, "grad_norm": 0.154296875, "learning_rate": 8.629641904347658e-05, "loss": 0.0378, "step": 3762 }, { "epoch": 0.483904153628531, "grad_norm": 0.18359375, "learning_rate": 8.62894671563315e-05, "loss": 0.0451, "step": 3763 }, { "epoch": 0.4840327489390887, "grad_norm": 0.208984375, "learning_rate": 8.62825137864412e-05, "loss": 0.0473, "step": 3764 }, { "epoch": 0.48416134424964635, "grad_norm": 0.150390625, "learning_rate": 8.627555893408982e-05, "loss": 0.0358, "step": 3765 }, { "epoch": 0.48428993956020405, "grad_norm": 0.177734375, "learning_rate": 8.626860259956151e-05, "loss": 0.0445, "step": 3766 }, { "epoch": 0.4844185348707617, "grad_norm": 0.203125, "learning_rate": 8.626164478314049e-05, "loss": 0.0602, "step": 3767 }, { "epoch": 0.4845471301813194, "grad_norm": 0.2177734375, "learning_rate": 8.625468548511109e-05, "loss": 0.0526, "step": 3768 }, { "epoch": 0.48467572549187704, "grad_norm": 0.16796875, "learning_rate": 8.624772470575762e-05, "loss": 0.0451, "step": 3769 }, { "epoch": 0.48480432080243474, "grad_norm": 0.193359375, "learning_rate": 8.62407624453645e-05, "loss": 0.0464, "step": 3770 }, { "epoch": 0.48493291611299244, "grad_norm": 0.177734375, "learning_rate": 8.62337987042162e-05, "loss": 0.0547, "step": 3771 }, { "epoch": 0.4850615114235501, "grad_norm": 0.1865234375, "learning_rate": 8.622683348259724e-05, "loss": 0.0518, "step": 3772 }, { "epoch": 0.4851901067341078, "grad_norm": 0.1865234375, "learning_rate": 8.621986678079222e-05, "loss": 0.0571, "step": 3773 }, { "epoch": 0.48531870204466543, "grad_norm": 0.1923828125, "learning_rate": 8.621289859908581e-05, "loss": 0.0585, "step": 3774 }, { "epoch": 0.48544729735522313, "grad_norm": 0.1748046875, "learning_rate": 8.62059289377627e-05, "loss": 0.043, "step": 3775 }, { "epoch": 0.4855758926657808, "grad_norm": 0.19140625, "learning_rate": 8.619895779710766e-05, "loss": 0.0506, "step": 3776 }, { "epoch": 0.4857044879763385, "grad_norm": 0.189453125, "learning_rate": 8.619198517740552e-05, "loss": 0.0605, "step": 3777 }, { "epoch": 0.4858330832868961, "grad_norm": 0.1767578125, "learning_rate": 8.618501107894119e-05, "loss": 0.0449, "step": 3778 }, { "epoch": 0.4859616785974538, "grad_norm": 0.162109375, "learning_rate": 8.617803550199963e-05, "loss": 0.0417, "step": 3779 }, { "epoch": 0.48609027390801146, "grad_norm": 0.1826171875, "learning_rate": 8.617105844686582e-05, "loss": 0.0501, "step": 3780 }, { "epoch": 0.48621886921856916, "grad_norm": 0.1650390625, "learning_rate": 8.616407991382486e-05, "loss": 0.0481, "step": 3781 }, { "epoch": 0.48634746452912686, "grad_norm": 0.1806640625, "learning_rate": 8.61570999031619e-05, "loss": 0.0484, "step": 3782 }, { "epoch": 0.4864760598396845, "grad_norm": 0.1650390625, "learning_rate": 8.615011841516208e-05, "loss": 0.0395, "step": 3783 }, { "epoch": 0.4866046551502422, "grad_norm": 0.171875, "learning_rate": 8.61431354501107e-05, "loss": 0.043, "step": 3784 }, { "epoch": 0.48673325046079985, "grad_norm": 0.181640625, "learning_rate": 8.613615100829306e-05, "loss": 0.0554, "step": 3785 }, { "epoch": 0.48686184577135755, "grad_norm": 0.181640625, "learning_rate": 8.612916508999456e-05, "loss": 0.0492, "step": 3786 }, { "epoch": 0.4869904410819152, "grad_norm": 0.1845703125, "learning_rate": 8.612217769550061e-05, "loss": 0.0426, "step": 3787 }, { "epoch": 0.4871190363924729, "grad_norm": 0.2001953125, "learning_rate": 8.611518882509671e-05, "loss": 0.0447, "step": 3788 }, { "epoch": 0.48724763170303054, "grad_norm": 0.1875, "learning_rate": 8.610819847906841e-05, "loss": 0.0534, "step": 3789 }, { "epoch": 0.48737622701358824, "grad_norm": 0.171875, "learning_rate": 8.610120665770136e-05, "loss": 0.0449, "step": 3790 }, { "epoch": 0.48750482232414594, "grad_norm": 0.1748046875, "learning_rate": 8.60942133612812e-05, "loss": 0.0454, "step": 3791 }, { "epoch": 0.4876334176347036, "grad_norm": 0.185546875, "learning_rate": 8.60872185900937e-05, "loss": 0.0426, "step": 3792 }, { "epoch": 0.4877620129452613, "grad_norm": 0.16015625, "learning_rate": 8.608022234442463e-05, "loss": 0.0427, "step": 3793 }, { "epoch": 0.4878906082558189, "grad_norm": 0.1884765625, "learning_rate": 8.607322462455987e-05, "loss": 0.0504, "step": 3794 }, { "epoch": 0.4880192035663766, "grad_norm": 0.189453125, "learning_rate": 8.606622543078531e-05, "loss": 0.0539, "step": 3795 }, { "epoch": 0.48814779887693427, "grad_norm": 0.2001953125, "learning_rate": 8.605922476338696e-05, "loss": 0.0449, "step": 3796 }, { "epoch": 0.48827639418749197, "grad_norm": 0.1748046875, "learning_rate": 8.605222262265085e-05, "loss": 0.0417, "step": 3797 }, { "epoch": 0.4884049894980496, "grad_norm": 0.2021484375, "learning_rate": 8.604521900886308e-05, "loss": 0.044, "step": 3798 }, { "epoch": 0.4885335848086073, "grad_norm": 0.1875, "learning_rate": 8.603821392230979e-05, "loss": 0.0446, "step": 3799 }, { "epoch": 0.488662180119165, "grad_norm": 0.1923828125, "learning_rate": 8.603120736327723e-05, "loss": 0.0484, "step": 3800 }, { "epoch": 0.48879077542972266, "grad_norm": 0.193359375, "learning_rate": 8.602419933205167e-05, "loss": 0.0408, "step": 3801 }, { "epoch": 0.48891937074028036, "grad_norm": 0.1904296875, "learning_rate": 8.601718982891944e-05, "loss": 0.0482, "step": 3802 }, { "epoch": 0.489047966050838, "grad_norm": 0.1728515625, "learning_rate": 8.601017885416693e-05, "loss": 0.0394, "step": 3803 }, { "epoch": 0.4891765613613957, "grad_norm": 0.158203125, "learning_rate": 8.600316640808064e-05, "loss": 0.0412, "step": 3804 }, { "epoch": 0.48930515667195335, "grad_norm": 0.193359375, "learning_rate": 8.599615249094706e-05, "loss": 0.0435, "step": 3805 }, { "epoch": 0.48943375198251104, "grad_norm": 0.162109375, "learning_rate": 8.598913710305277e-05, "loss": 0.0408, "step": 3806 }, { "epoch": 0.4895623472930687, "grad_norm": 0.19140625, "learning_rate": 8.598212024468441e-05, "loss": 0.0504, "step": 3807 }, { "epoch": 0.4896909426036264, "grad_norm": 0.166015625, "learning_rate": 8.59751019161287e-05, "loss": 0.0459, "step": 3808 }, { "epoch": 0.4898195379141841, "grad_norm": 0.1611328125, "learning_rate": 8.596808211767239e-05, "loss": 0.0426, "step": 3809 }, { "epoch": 0.48994813322474173, "grad_norm": 0.16796875, "learning_rate": 8.596106084960229e-05, "loss": 0.0456, "step": 3810 }, { "epoch": 0.49007672853529943, "grad_norm": 0.1767578125, "learning_rate": 8.595403811220531e-05, "loss": 0.0511, "step": 3811 }, { "epoch": 0.4902053238458571, "grad_norm": 0.2109375, "learning_rate": 8.594701390576834e-05, "loss": 0.0504, "step": 3812 }, { "epoch": 0.4903339191564148, "grad_norm": 0.19140625, "learning_rate": 8.593998823057843e-05, "loss": 0.0427, "step": 3813 }, { "epoch": 0.4904625144669724, "grad_norm": 0.1748046875, "learning_rate": 8.593296108692261e-05, "loss": 0.0424, "step": 3814 }, { "epoch": 0.4905911097775301, "grad_norm": 0.1572265625, "learning_rate": 8.592593247508803e-05, "loss": 0.0375, "step": 3815 }, { "epoch": 0.49071970508808777, "grad_norm": 0.1865234375, "learning_rate": 8.591890239536184e-05, "loss": 0.0499, "step": 3816 }, { "epoch": 0.49084830039864547, "grad_norm": 0.1669921875, "learning_rate": 8.591187084803131e-05, "loss": 0.0423, "step": 3817 }, { "epoch": 0.49097689570920316, "grad_norm": 0.169921875, "learning_rate": 8.590483783338372e-05, "loss": 0.041, "step": 3818 }, { "epoch": 0.4911054910197608, "grad_norm": 0.19140625, "learning_rate": 8.589780335170644e-05, "loss": 0.0467, "step": 3819 }, { "epoch": 0.4912340863303185, "grad_norm": 0.171875, "learning_rate": 8.589076740328689e-05, "loss": 0.0488, "step": 3820 }, { "epoch": 0.49136268164087615, "grad_norm": 0.20703125, "learning_rate": 8.588372998841254e-05, "loss": 0.0544, "step": 3821 }, { "epoch": 0.49149127695143385, "grad_norm": 0.173828125, "learning_rate": 8.587669110737093e-05, "loss": 0.0487, "step": 3822 }, { "epoch": 0.4916198722619915, "grad_norm": 0.1982421875, "learning_rate": 8.586965076044968e-05, "loss": 0.0585, "step": 3823 }, { "epoch": 0.4917484675725492, "grad_norm": 0.1572265625, "learning_rate": 8.586260894793645e-05, "loss": 0.0421, "step": 3824 }, { "epoch": 0.49187706288310684, "grad_norm": 0.171875, "learning_rate": 8.585556567011894e-05, "loss": 0.0451, "step": 3825 }, { "epoch": 0.49200565819366454, "grad_norm": 0.1806640625, "learning_rate": 8.584852092728492e-05, "loss": 0.0492, "step": 3826 }, { "epoch": 0.4921342535042222, "grad_norm": 0.166015625, "learning_rate": 8.584147471972228e-05, "loss": 0.0467, "step": 3827 }, { "epoch": 0.4922628488147799, "grad_norm": 0.162109375, "learning_rate": 8.583442704771887e-05, "loss": 0.0405, "step": 3828 }, { "epoch": 0.4923914441253376, "grad_norm": 0.154296875, "learning_rate": 8.582737791156268e-05, "loss": 0.0347, "step": 3829 }, { "epoch": 0.49252003943589523, "grad_norm": 0.158203125, "learning_rate": 8.582032731154172e-05, "loss": 0.0419, "step": 3830 }, { "epoch": 0.49264863474645293, "grad_norm": 0.173828125, "learning_rate": 8.581327524794407e-05, "loss": 0.0371, "step": 3831 }, { "epoch": 0.4927772300570106, "grad_norm": 0.181640625, "learning_rate": 8.580622172105785e-05, "loss": 0.0458, "step": 3832 }, { "epoch": 0.4929058253675683, "grad_norm": 0.1826171875, "learning_rate": 8.57991667311713e-05, "loss": 0.0464, "step": 3833 }, { "epoch": 0.4930344206781259, "grad_norm": 0.1875, "learning_rate": 8.579211027857263e-05, "loss": 0.051, "step": 3834 }, { "epoch": 0.4931630159886836, "grad_norm": 0.1611328125, "learning_rate": 8.57850523635502e-05, "loss": 0.0415, "step": 3835 }, { "epoch": 0.49329161129924126, "grad_norm": 0.1953125, "learning_rate": 8.577799298639236e-05, "loss": 0.0537, "step": 3836 }, { "epoch": 0.49342020660979896, "grad_norm": 0.1875, "learning_rate": 8.577093214738756e-05, "loss": 0.0425, "step": 3837 }, { "epoch": 0.49354880192035666, "grad_norm": 0.1865234375, "learning_rate": 8.57638698468243e-05, "loss": 0.0453, "step": 3838 }, { "epoch": 0.4936773972309143, "grad_norm": 0.1845703125, "learning_rate": 8.575680608499115e-05, "loss": 0.0511, "step": 3839 }, { "epoch": 0.493805992541472, "grad_norm": 0.1669921875, "learning_rate": 8.574974086217668e-05, "loss": 0.0477, "step": 3840 }, { "epoch": 0.49393458785202965, "grad_norm": 0.16015625, "learning_rate": 8.574267417866962e-05, "loss": 0.043, "step": 3841 }, { "epoch": 0.49406318316258735, "grad_norm": 0.1630859375, "learning_rate": 8.573560603475868e-05, "loss": 0.0399, "step": 3842 }, { "epoch": 0.494191778473145, "grad_norm": 0.21875, "learning_rate": 8.572853643073265e-05, "loss": 0.0453, "step": 3843 }, { "epoch": 0.4943203737837027, "grad_norm": 0.1748046875, "learning_rate": 8.572146536688041e-05, "loss": 0.0432, "step": 3844 }, { "epoch": 0.49444896909426034, "grad_norm": 0.189453125, "learning_rate": 8.571439284349085e-05, "loss": 0.047, "step": 3845 }, { "epoch": 0.49457756440481804, "grad_norm": 0.1767578125, "learning_rate": 8.570731886085296e-05, "loss": 0.0423, "step": 3846 }, { "epoch": 0.49470615971537574, "grad_norm": 0.201171875, "learning_rate": 8.570024341925578e-05, "loss": 0.0457, "step": 3847 }, { "epoch": 0.4948347550259334, "grad_norm": 0.1650390625, "learning_rate": 8.569316651898837e-05, "loss": 0.0403, "step": 3848 }, { "epoch": 0.4949633503364911, "grad_norm": 0.1826171875, "learning_rate": 8.568608816033994e-05, "loss": 0.0503, "step": 3849 }, { "epoch": 0.4950919456470487, "grad_norm": 0.15625, "learning_rate": 8.567900834359966e-05, "loss": 0.0302, "step": 3850 }, { "epoch": 0.4952205409576064, "grad_norm": 0.1806640625, "learning_rate": 8.567192706905682e-05, "loss": 0.0453, "step": 3851 }, { "epoch": 0.49534913626816407, "grad_norm": 0.1953125, "learning_rate": 8.566484433700073e-05, "loss": 0.0549, "step": 3852 }, { "epoch": 0.49547773157872177, "grad_norm": 0.181640625, "learning_rate": 8.565776014772082e-05, "loss": 0.0438, "step": 3853 }, { "epoch": 0.4956063268892794, "grad_norm": 0.19921875, "learning_rate": 8.565067450150651e-05, "loss": 0.0487, "step": 3854 }, { "epoch": 0.4957349221998371, "grad_norm": 0.208984375, "learning_rate": 8.564358739864733e-05, "loss": 0.0563, "step": 3855 }, { "epoch": 0.4958635175103948, "grad_norm": 0.1875, "learning_rate": 8.563649883943285e-05, "loss": 0.0485, "step": 3856 }, { "epoch": 0.49599211282095246, "grad_norm": 0.2177734375, "learning_rate": 8.562940882415269e-05, "loss": 0.052, "step": 3857 }, { "epoch": 0.49612070813151016, "grad_norm": 0.1826171875, "learning_rate": 8.562231735309653e-05, "loss": 0.0486, "step": 3858 }, { "epoch": 0.4962493034420678, "grad_norm": 0.1865234375, "learning_rate": 8.561522442655414e-05, "loss": 0.0547, "step": 3859 }, { "epoch": 0.4963778987526255, "grad_norm": 0.2041015625, "learning_rate": 8.560813004481534e-05, "loss": 0.0553, "step": 3860 }, { "epoch": 0.49650649406318315, "grad_norm": 0.181640625, "learning_rate": 8.560103420816996e-05, "loss": 0.0441, "step": 3861 }, { "epoch": 0.49663508937374085, "grad_norm": 0.173828125, "learning_rate": 8.559393691690795e-05, "loss": 0.0414, "step": 3862 }, { "epoch": 0.4967636846842985, "grad_norm": 0.1806640625, "learning_rate": 8.558683817131931e-05, "loss": 0.0519, "step": 3863 }, { "epoch": 0.4968922799948562, "grad_norm": 0.1630859375, "learning_rate": 8.557973797169406e-05, "loss": 0.0417, "step": 3864 }, { "epoch": 0.4970208753054139, "grad_norm": 0.1865234375, "learning_rate": 8.55726363183223e-05, "loss": 0.0534, "step": 3865 }, { "epoch": 0.49714947061597153, "grad_norm": 0.18359375, "learning_rate": 8.556553321149424e-05, "loss": 0.0424, "step": 3866 }, { "epoch": 0.49727806592652923, "grad_norm": 0.1767578125, "learning_rate": 8.555842865150007e-05, "loss": 0.0471, "step": 3867 }, { "epoch": 0.4974066612370869, "grad_norm": 0.1640625, "learning_rate": 8.555132263863008e-05, "loss": 0.0406, "step": 3868 }, { "epoch": 0.4975352565476446, "grad_norm": 0.162109375, "learning_rate": 8.554421517317461e-05, "loss": 0.0489, "step": 3869 }, { "epoch": 0.4976638518582022, "grad_norm": 0.169921875, "learning_rate": 8.553710625542409e-05, "loss": 0.0511, "step": 3870 }, { "epoch": 0.4977924471687599, "grad_norm": 0.154296875, "learning_rate": 8.552999588566892e-05, "loss": 0.04, "step": 3871 }, { "epoch": 0.49792104247931757, "grad_norm": 0.1787109375, "learning_rate": 8.552288406419968e-05, "loss": 0.0502, "step": 3872 }, { "epoch": 0.49804963778987527, "grad_norm": 0.1826171875, "learning_rate": 8.551577079130693e-05, "loss": 0.0447, "step": 3873 }, { "epoch": 0.4981782331004329, "grad_norm": 0.197265625, "learning_rate": 8.550865606728131e-05, "loss": 0.0513, "step": 3874 }, { "epoch": 0.4983068284109906, "grad_norm": 0.1943359375, "learning_rate": 8.550153989241352e-05, "loss": 0.0514, "step": 3875 }, { "epoch": 0.4984354237215483, "grad_norm": 0.1845703125, "learning_rate": 8.549442226699433e-05, "loss": 0.0553, "step": 3876 }, { "epoch": 0.49856401903210595, "grad_norm": 0.1787109375, "learning_rate": 8.548730319131453e-05, "loss": 0.0476, "step": 3877 }, { "epoch": 0.49869261434266365, "grad_norm": 0.1982421875, "learning_rate": 8.548018266566503e-05, "loss": 0.0473, "step": 3878 }, { "epoch": 0.4988212096532213, "grad_norm": 0.2041015625, "learning_rate": 8.547306069033673e-05, "loss": 0.0468, "step": 3879 }, { "epoch": 0.498949804963779, "grad_norm": 0.271484375, "learning_rate": 8.546593726562066e-05, "loss": 0.0402, "step": 3880 }, { "epoch": 0.49907840027433664, "grad_norm": 0.1708984375, "learning_rate": 8.545881239180786e-05, "loss": 0.0382, "step": 3881 }, { "epoch": 0.49920699558489434, "grad_norm": 0.171875, "learning_rate": 8.545168606918945e-05, "loss": 0.0364, "step": 3882 }, { "epoch": 0.499335590895452, "grad_norm": 0.17578125, "learning_rate": 8.544455829805659e-05, "loss": 0.0458, "step": 3883 }, { "epoch": 0.4994641862060097, "grad_norm": 0.2080078125, "learning_rate": 8.543742907870051e-05, "loss": 0.0546, "step": 3884 }, { "epoch": 0.4995927815165674, "grad_norm": 0.2314453125, "learning_rate": 8.543029841141254e-05, "loss": 0.0612, "step": 3885 }, { "epoch": 0.49972137682712503, "grad_norm": 0.203125, "learning_rate": 8.542316629648399e-05, "loss": 0.0577, "step": 3886 }, { "epoch": 0.49984997213768273, "grad_norm": 0.1875, "learning_rate": 8.541603273420628e-05, "loss": 0.0554, "step": 3887 }, { "epoch": 0.4999785674482404, "grad_norm": 0.17578125, "learning_rate": 8.540889772487087e-05, "loss": 0.0433, "step": 3888 }, { "epoch": 0.5001071627587981, "grad_norm": 0.193359375, "learning_rate": 8.540176126876931e-05, "loss": 0.0423, "step": 3889 }, { "epoch": 0.5002357580693557, "grad_norm": 0.169921875, "learning_rate": 8.539462336619318e-05, "loss": 0.0429, "step": 3890 }, { "epoch": 0.5003643533799134, "grad_norm": 0.1689453125, "learning_rate": 8.538748401743412e-05, "loss": 0.0386, "step": 3891 }, { "epoch": 0.5004929486904711, "grad_norm": 0.17578125, "learning_rate": 8.538034322278384e-05, "loss": 0.0498, "step": 3892 }, { "epoch": 0.5006215440010288, "grad_norm": 0.189453125, "learning_rate": 8.53732009825341e-05, "loss": 0.046, "step": 3893 }, { "epoch": 0.5007501393115864, "grad_norm": 0.212890625, "learning_rate": 8.536605729697674e-05, "loss": 0.0519, "step": 3894 }, { "epoch": 0.5008787346221442, "grad_norm": 0.1826171875, "learning_rate": 8.535891216640361e-05, "loss": 0.0461, "step": 3895 }, { "epoch": 0.5010073299327018, "grad_norm": 0.181640625, "learning_rate": 8.53517655911067e-05, "loss": 0.0488, "step": 3896 }, { "epoch": 0.5011359252432595, "grad_norm": 0.1748046875, "learning_rate": 8.534461757137795e-05, "loss": 0.0391, "step": 3897 }, { "epoch": 0.5012645205538171, "grad_norm": 0.1611328125, "learning_rate": 8.533746810750946e-05, "loss": 0.038, "step": 3898 }, { "epoch": 0.5013931158643748, "grad_norm": 0.169921875, "learning_rate": 8.533031719979335e-05, "loss": 0.0411, "step": 3899 }, { "epoch": 0.5015217111749325, "grad_norm": 0.1728515625, "learning_rate": 8.532316484852179e-05, "loss": 0.0348, "step": 3900 }, { "epoch": 0.5016503064854901, "grad_norm": 0.1748046875, "learning_rate": 8.531601105398701e-05, "loss": 0.0429, "step": 3901 }, { "epoch": 0.5017789017960478, "grad_norm": 0.2021484375, "learning_rate": 8.53088558164813e-05, "loss": 0.0515, "step": 3902 }, { "epoch": 0.5019074971066055, "grad_norm": 0.1728515625, "learning_rate": 8.530169913629705e-05, "loss": 0.0431, "step": 3903 }, { "epoch": 0.5020360924171632, "grad_norm": 0.171875, "learning_rate": 8.529454101372663e-05, "loss": 0.0443, "step": 3904 }, { "epoch": 0.5021646877277208, "grad_norm": 0.169921875, "learning_rate": 8.528738144906252e-05, "loss": 0.0427, "step": 3905 }, { "epoch": 0.5022932830382786, "grad_norm": 0.18359375, "learning_rate": 8.528022044259728e-05, "loss": 0.0431, "step": 3906 }, { "epoch": 0.5024218783488362, "grad_norm": 0.1865234375, "learning_rate": 8.527305799462349e-05, "loss": 0.0519, "step": 3907 }, { "epoch": 0.5025504736593939, "grad_norm": 0.166015625, "learning_rate": 8.526589410543378e-05, "loss": 0.039, "step": 3908 }, { "epoch": 0.5026790689699515, "grad_norm": 0.16796875, "learning_rate": 8.525872877532087e-05, "loss": 0.0388, "step": 3909 }, { "epoch": 0.5028076642805093, "grad_norm": 0.1962890625, "learning_rate": 8.525156200457753e-05, "loss": 0.054, "step": 3910 }, { "epoch": 0.5029362595910669, "grad_norm": 0.31640625, "learning_rate": 8.524439379349658e-05, "loss": 0.0584, "step": 3911 }, { "epoch": 0.5030648549016246, "grad_norm": 0.181640625, "learning_rate": 8.523722414237092e-05, "loss": 0.0451, "step": 3912 }, { "epoch": 0.5031934502121823, "grad_norm": 0.16796875, "learning_rate": 8.523005305149348e-05, "loss": 0.044, "step": 3913 }, { "epoch": 0.50332204552274, "grad_norm": 0.1982421875, "learning_rate": 8.522288052115724e-05, "loss": 0.0467, "step": 3914 }, { "epoch": 0.5034506408332976, "grad_norm": 0.19140625, "learning_rate": 8.52157065516553e-05, "loss": 0.0492, "step": 3915 }, { "epoch": 0.5035792361438552, "grad_norm": 0.1689453125, "learning_rate": 8.520853114328077e-05, "loss": 0.0492, "step": 3916 }, { "epoch": 0.503707831454413, "grad_norm": 0.1826171875, "learning_rate": 8.520135429632683e-05, "loss": 0.0454, "step": 3917 }, { "epoch": 0.5038364267649706, "grad_norm": 0.16796875, "learning_rate": 8.51941760110867e-05, "loss": 0.0478, "step": 3918 }, { "epoch": 0.5039650220755283, "grad_norm": 0.142578125, "learning_rate": 8.51869962878537e-05, "loss": 0.0352, "step": 3919 }, { "epoch": 0.5040936173860859, "grad_norm": 0.1962890625, "learning_rate": 8.517981512692115e-05, "loss": 0.0587, "step": 3920 }, { "epoch": 0.5042222126966437, "grad_norm": 0.185546875, "learning_rate": 8.517263252858251e-05, "loss": 0.049, "step": 3921 }, { "epoch": 0.5043508080072013, "grad_norm": 0.18359375, "learning_rate": 8.516544849313121e-05, "loss": 0.0503, "step": 3922 }, { "epoch": 0.504479403317759, "grad_norm": 0.1640625, "learning_rate": 8.515826302086082e-05, "loss": 0.0404, "step": 3923 }, { "epoch": 0.5046079986283167, "grad_norm": 0.1806640625, "learning_rate": 8.51510761120649e-05, "loss": 0.0442, "step": 3924 }, { "epoch": 0.5047365939388744, "grad_norm": 0.1787109375, "learning_rate": 8.514388776703711e-05, "loss": 0.0435, "step": 3925 }, { "epoch": 0.504865189249432, "grad_norm": 0.1708984375, "learning_rate": 8.513669798607116e-05, "loss": 0.0384, "step": 3926 }, { "epoch": 0.5049937845599897, "grad_norm": 0.181640625, "learning_rate": 8.51295067694608e-05, "loss": 0.0451, "step": 3927 }, { "epoch": 0.5051223798705474, "grad_norm": 0.181640625, "learning_rate": 8.512231411749989e-05, "loss": 0.0443, "step": 3928 }, { "epoch": 0.5052509751811051, "grad_norm": 0.1787109375, "learning_rate": 8.511512003048227e-05, "loss": 0.0423, "step": 3929 }, { "epoch": 0.5053795704916627, "grad_norm": 0.177734375, "learning_rate": 8.51079245087019e-05, "loss": 0.048, "step": 3930 }, { "epoch": 0.5055081658022205, "grad_norm": 0.189453125, "learning_rate": 8.510072755245281e-05, "loss": 0.0491, "step": 3931 }, { "epoch": 0.5056367611127781, "grad_norm": 0.1962890625, "learning_rate": 8.509352916202902e-05, "loss": 0.0513, "step": 3932 }, { "epoch": 0.5057653564233358, "grad_norm": 0.1845703125, "learning_rate": 8.508632933772467e-05, "loss": 0.0529, "step": 3933 }, { "epoch": 0.5058939517338934, "grad_norm": 0.2021484375, "learning_rate": 8.507912807983392e-05, "loss": 0.0469, "step": 3934 }, { "epoch": 0.5060225470444512, "grad_norm": 0.2138671875, "learning_rate": 8.5071925388651e-05, "loss": 0.06, "step": 3935 }, { "epoch": 0.5061511423550088, "grad_norm": 0.1767578125, "learning_rate": 8.506472126447021e-05, "loss": 0.051, "step": 3936 }, { "epoch": 0.5062797376655664, "grad_norm": 0.1767578125, "learning_rate": 8.505751570758594e-05, "loss": 0.0444, "step": 3937 }, { "epoch": 0.5064083329761241, "grad_norm": 0.1923828125, "learning_rate": 8.505030871829254e-05, "loss": 0.0558, "step": 3938 }, { "epoch": 0.5065369282866818, "grad_norm": 0.1982421875, "learning_rate": 8.504310029688452e-05, "loss": 0.0553, "step": 3939 }, { "epoch": 0.5066655235972395, "grad_norm": 0.1962890625, "learning_rate": 8.503589044365638e-05, "loss": 0.0552, "step": 3940 }, { "epoch": 0.5067941189077971, "grad_norm": 0.1572265625, "learning_rate": 8.502867915890274e-05, "loss": 0.0408, "step": 3941 }, { "epoch": 0.5069227142183549, "grad_norm": 0.1923828125, "learning_rate": 8.502146644291822e-05, "loss": 0.0467, "step": 3942 }, { "epoch": 0.5070513095289125, "grad_norm": 0.173828125, "learning_rate": 8.501425229599753e-05, "loss": 0.0435, "step": 3943 }, { "epoch": 0.5071799048394702, "grad_norm": 0.171875, "learning_rate": 8.500703671843542e-05, "loss": 0.0438, "step": 3944 }, { "epoch": 0.5073085001500278, "grad_norm": 0.1728515625, "learning_rate": 8.499981971052673e-05, "loss": 0.0443, "step": 3945 }, { "epoch": 0.5074370954605856, "grad_norm": 0.16796875, "learning_rate": 8.499260127256632e-05, "loss": 0.0452, "step": 3946 }, { "epoch": 0.5075656907711432, "grad_norm": 0.1728515625, "learning_rate": 8.498538140484915e-05, "loss": 0.0443, "step": 3947 }, { "epoch": 0.5076942860817009, "grad_norm": 0.171875, "learning_rate": 8.497816010767019e-05, "loss": 0.042, "step": 3948 }, { "epoch": 0.5078228813922585, "grad_norm": 0.1708984375, "learning_rate": 8.49709373813245e-05, "loss": 0.0412, "step": 3949 }, { "epoch": 0.5079514767028163, "grad_norm": 0.2041015625, "learning_rate": 8.496371322610721e-05, "loss": 0.058, "step": 3950 }, { "epoch": 0.5080800720133739, "grad_norm": 0.1982421875, "learning_rate": 8.495648764231347e-05, "loss": 0.05, "step": 3951 }, { "epoch": 0.5082086673239316, "grad_norm": 0.20703125, "learning_rate": 8.494926063023853e-05, "loss": 0.062, "step": 3952 }, { "epoch": 0.5083372626344893, "grad_norm": 0.171875, "learning_rate": 8.494203219017766e-05, "loss": 0.0388, "step": 3953 }, { "epoch": 0.508465857945047, "grad_norm": 0.1904296875, "learning_rate": 8.493480232242621e-05, "loss": 0.0492, "step": 3954 }, { "epoch": 0.5085944532556046, "grad_norm": 0.1845703125, "learning_rate": 8.492757102727957e-05, "loss": 0.0452, "step": 3955 }, { "epoch": 0.5087230485661622, "grad_norm": 0.197265625, "learning_rate": 8.492033830503323e-05, "loss": 0.0538, "step": 3956 }, { "epoch": 0.50885164387672, "grad_norm": 0.185546875, "learning_rate": 8.49131041559827e-05, "loss": 0.0431, "step": 3957 }, { "epoch": 0.5089802391872776, "grad_norm": 0.1865234375, "learning_rate": 8.490586858042353e-05, "loss": 0.0517, "step": 3958 }, { "epoch": 0.5091088344978353, "grad_norm": 0.16796875, "learning_rate": 8.489863157865141e-05, "loss": 0.04, "step": 3959 }, { "epoch": 0.509237429808393, "grad_norm": 0.1865234375, "learning_rate": 8.489139315096199e-05, "loss": 0.0409, "step": 3960 }, { "epoch": 0.5093660251189507, "grad_norm": 0.185546875, "learning_rate": 8.488415329765104e-05, "loss": 0.0471, "step": 3961 }, { "epoch": 0.5094946204295083, "grad_norm": 0.1533203125, "learning_rate": 8.487691201901439e-05, "loss": 0.0382, "step": 3962 }, { "epoch": 0.509623215740066, "grad_norm": 0.189453125, "learning_rate": 8.48696693153479e-05, "loss": 0.0423, "step": 3963 }, { "epoch": 0.5097518110506237, "grad_norm": 0.16796875, "learning_rate": 8.486242518694747e-05, "loss": 0.0383, "step": 3964 }, { "epoch": 0.5098804063611814, "grad_norm": 0.1787109375, "learning_rate": 8.485517963410911e-05, "loss": 0.0435, "step": 3965 }, { "epoch": 0.510009001671739, "grad_norm": 0.197265625, "learning_rate": 8.484793265712886e-05, "loss": 0.0597, "step": 3966 }, { "epoch": 0.5101375969822967, "grad_norm": 0.189453125, "learning_rate": 8.484068425630284e-05, "loss": 0.0499, "step": 3967 }, { "epoch": 0.5102661922928544, "grad_norm": 0.16796875, "learning_rate": 8.483343443192722e-05, "loss": 0.0434, "step": 3968 }, { "epoch": 0.5103947876034121, "grad_norm": 0.1708984375, "learning_rate": 8.482618318429815e-05, "loss": 0.0467, "step": 3969 }, { "epoch": 0.5105233829139697, "grad_norm": 0.1787109375, "learning_rate": 8.4818930513712e-05, "loss": 0.0497, "step": 3970 }, { "epoch": 0.5106519782245275, "grad_norm": 0.21875, "learning_rate": 8.481167642046501e-05, "loss": 0.0501, "step": 3971 }, { "epoch": 0.5107805735350851, "grad_norm": 0.1611328125, "learning_rate": 8.480442090485365e-05, "loss": 0.0421, "step": 3972 }, { "epoch": 0.5109091688456427, "grad_norm": 0.2158203125, "learning_rate": 8.479716396717436e-05, "loss": 0.0553, "step": 3973 }, { "epoch": 0.5110377641562004, "grad_norm": 0.1806640625, "learning_rate": 8.478990560772362e-05, "loss": 0.0442, "step": 3974 }, { "epoch": 0.5111663594667581, "grad_norm": 0.1962890625, "learning_rate": 8.478264582679801e-05, "loss": 0.0486, "step": 3975 }, { "epoch": 0.5112949547773158, "grad_norm": 0.1708984375, "learning_rate": 8.477538462469417e-05, "loss": 0.0426, "step": 3976 }, { "epoch": 0.5114235500878734, "grad_norm": 0.212890625, "learning_rate": 8.476812200170877e-05, "loss": 0.052, "step": 3977 }, { "epoch": 0.5115521453984312, "grad_norm": 0.1552734375, "learning_rate": 8.476085795813856e-05, "loss": 0.0354, "step": 3978 }, { "epoch": 0.5116807407089888, "grad_norm": 0.1728515625, "learning_rate": 8.475359249428034e-05, "loss": 0.0416, "step": 3979 }, { "epoch": 0.5118093360195465, "grad_norm": 0.1767578125, "learning_rate": 8.474632561043096e-05, "loss": 0.0409, "step": 3980 }, { "epoch": 0.5119379313301041, "grad_norm": 0.1865234375, "learning_rate": 8.473905730688733e-05, "loss": 0.045, "step": 3981 }, { "epoch": 0.5120665266406619, "grad_norm": 0.1953125, "learning_rate": 8.473178758394644e-05, "loss": 0.0507, "step": 3982 }, { "epoch": 0.5121951219512195, "grad_norm": 0.1767578125, "learning_rate": 8.472451644190534e-05, "loss": 0.0426, "step": 3983 }, { "epoch": 0.5123237172617772, "grad_norm": 0.1748046875, "learning_rate": 8.471724388106108e-05, "loss": 0.0424, "step": 3984 }, { "epoch": 0.5124523125723348, "grad_norm": 0.1865234375, "learning_rate": 8.470996990171083e-05, "loss": 0.0498, "step": 3985 }, { "epoch": 0.5125809078828926, "grad_norm": 0.189453125, "learning_rate": 8.47026945041518e-05, "loss": 0.0483, "step": 3986 }, { "epoch": 0.5127095031934502, "grad_norm": 0.1845703125, "learning_rate": 8.469541768868124e-05, "loss": 0.0453, "step": 3987 }, { "epoch": 0.5128380985040079, "grad_norm": 0.193359375, "learning_rate": 8.468813945559648e-05, "loss": 0.0469, "step": 3988 }, { "epoch": 0.5129666938145656, "grad_norm": 0.1708984375, "learning_rate": 8.46808598051949e-05, "loss": 0.0423, "step": 3989 }, { "epoch": 0.5130952891251233, "grad_norm": 0.2060546875, "learning_rate": 8.467357873777393e-05, "loss": 0.059, "step": 3990 }, { "epoch": 0.5132238844356809, "grad_norm": 0.1728515625, "learning_rate": 8.466629625363109e-05, "loss": 0.0409, "step": 3991 }, { "epoch": 0.5133524797462385, "grad_norm": 0.17578125, "learning_rate": 8.465901235306392e-05, "loss": 0.0447, "step": 3992 }, { "epoch": 0.5134810750567963, "grad_norm": 0.181640625, "learning_rate": 8.465172703637002e-05, "loss": 0.0452, "step": 3993 }, { "epoch": 0.5136096703673539, "grad_norm": 0.197265625, "learning_rate": 8.464444030384706e-05, "loss": 0.045, "step": 3994 }, { "epoch": 0.5137382656779116, "grad_norm": 0.203125, "learning_rate": 8.463715215579279e-05, "loss": 0.0475, "step": 3995 }, { "epoch": 0.5138668609884692, "grad_norm": 0.1640625, "learning_rate": 8.462986259250498e-05, "loss": 0.0376, "step": 3996 }, { "epoch": 0.513995456299027, "grad_norm": 0.1923828125, "learning_rate": 8.462257161428147e-05, "loss": 0.0439, "step": 3997 }, { "epoch": 0.5141240516095846, "grad_norm": 0.181640625, "learning_rate": 8.461527922142016e-05, "loss": 0.0474, "step": 3998 }, { "epoch": 0.5142526469201423, "grad_norm": 0.1962890625, "learning_rate": 8.460798541421903e-05, "loss": 0.0493, "step": 3999 }, { "epoch": 0.5143812422307, "grad_norm": 0.1884765625, "learning_rate": 8.460069019297608e-05, "loss": 0.046, "step": 4000 }, { "epoch": 0.5143812422307, "eval_loss": 0.04566698893904686, "eval_runtime": 1042.718, "eval_samples_per_second": 94.202, "eval_steps_per_second": 1.178, "step": 4000 }, { "epoch": 0.5145098375412577, "grad_norm": 0.1904296875, "learning_rate": 8.459339355798936e-05, "loss": 0.0394, "step": 4001 }, { "epoch": 0.5146384328518153, "grad_norm": 0.189453125, "learning_rate": 8.458609550955706e-05, "loss": 0.0532, "step": 4002 }, { "epoch": 0.514767028162373, "grad_norm": 0.1845703125, "learning_rate": 8.457879604797731e-05, "loss": 0.0411, "step": 4003 }, { "epoch": 0.5148956234729307, "grad_norm": 0.2060546875, "learning_rate": 8.457149517354839e-05, "loss": 0.0523, "step": 4004 }, { "epoch": 0.5150242187834884, "grad_norm": 0.185546875, "learning_rate": 8.45641928865686e-05, "loss": 0.0475, "step": 4005 }, { "epoch": 0.515152814094046, "grad_norm": 0.1904296875, "learning_rate": 8.45568891873363e-05, "loss": 0.0454, "step": 4006 }, { "epoch": 0.5152814094046038, "grad_norm": 0.16796875, "learning_rate": 8.45495840761499e-05, "loss": 0.0461, "step": 4007 }, { "epoch": 0.5154100047151614, "grad_norm": 0.1748046875, "learning_rate": 8.454227755330792e-05, "loss": 0.0403, "step": 4008 }, { "epoch": 0.515538600025719, "grad_norm": 0.1923828125, "learning_rate": 8.453496961910885e-05, "loss": 0.0453, "step": 4009 }, { "epoch": 0.5156671953362767, "grad_norm": 0.189453125, "learning_rate": 8.45276602738513e-05, "loss": 0.0488, "step": 4010 }, { "epoch": 0.5157957906468345, "grad_norm": 0.193359375, "learning_rate": 8.452034951783392e-05, "loss": 0.0476, "step": 4011 }, { "epoch": 0.5159243859573921, "grad_norm": 0.17578125, "learning_rate": 8.451303735135543e-05, "loss": 0.0391, "step": 4012 }, { "epoch": 0.5160529812679497, "grad_norm": 0.15625, "learning_rate": 8.450572377471459e-05, "loss": 0.0377, "step": 4013 }, { "epoch": 0.5161815765785074, "grad_norm": 0.1806640625, "learning_rate": 8.449840878821021e-05, "loss": 0.0446, "step": 4014 }, { "epoch": 0.5163101718890651, "grad_norm": 0.1845703125, "learning_rate": 8.44910923921412e-05, "loss": 0.0408, "step": 4015 }, { "epoch": 0.5164387671996228, "grad_norm": 0.1611328125, "learning_rate": 8.448377458680645e-05, "loss": 0.0374, "step": 4016 }, { "epoch": 0.5165673625101804, "grad_norm": 0.185546875, "learning_rate": 8.447645537250501e-05, "loss": 0.0448, "step": 4017 }, { "epoch": 0.5166959578207382, "grad_norm": 0.15625, "learning_rate": 8.446913474953592e-05, "loss": 0.0375, "step": 4018 }, { "epoch": 0.5168245531312958, "grad_norm": 0.1845703125, "learning_rate": 8.446181271819827e-05, "loss": 0.0482, "step": 4019 }, { "epoch": 0.5169531484418535, "grad_norm": 0.1875, "learning_rate": 8.445448927879126e-05, "loss": 0.0429, "step": 4020 }, { "epoch": 0.5170817437524111, "grad_norm": 0.1630859375, "learning_rate": 8.444716443161409e-05, "loss": 0.043, "step": 4021 }, { "epoch": 0.5172103390629689, "grad_norm": 0.189453125, "learning_rate": 8.443983817696606e-05, "loss": 0.0516, "step": 4022 }, { "epoch": 0.5173389343735265, "grad_norm": 0.181640625, "learning_rate": 8.443251051514652e-05, "loss": 0.0439, "step": 4023 }, { "epoch": 0.5174675296840842, "grad_norm": 0.181640625, "learning_rate": 8.442518144645486e-05, "loss": 0.0439, "step": 4024 }, { "epoch": 0.5175961249946419, "grad_norm": 0.1591796875, "learning_rate": 8.441785097119053e-05, "loss": 0.0313, "step": 4025 }, { "epoch": 0.5177247203051996, "grad_norm": 0.1708984375, "learning_rate": 8.441051908965306e-05, "loss": 0.0441, "step": 4026 }, { "epoch": 0.5178533156157572, "grad_norm": 0.193359375, "learning_rate": 8.440318580214201e-05, "loss": 0.0433, "step": 4027 }, { "epoch": 0.5179819109263148, "grad_norm": 0.171875, "learning_rate": 8.4395851108957e-05, "loss": 0.048, "step": 4028 }, { "epoch": 0.5181105062368726, "grad_norm": 0.1689453125, "learning_rate": 8.438851501039775e-05, "loss": 0.041, "step": 4029 }, { "epoch": 0.5182391015474302, "grad_norm": 0.16015625, "learning_rate": 8.438117750676397e-05, "loss": 0.0431, "step": 4030 }, { "epoch": 0.5183676968579879, "grad_norm": 0.177734375, "learning_rate": 8.437383859835548e-05, "loss": 0.0456, "step": 4031 }, { "epoch": 0.5184962921685455, "grad_norm": 0.173828125, "learning_rate": 8.436649828547214e-05, "loss": 0.0496, "step": 4032 }, { "epoch": 0.5186248874791033, "grad_norm": 0.181640625, "learning_rate": 8.435915656841384e-05, "loss": 0.0471, "step": 4033 }, { "epoch": 0.5187534827896609, "grad_norm": 0.17578125, "learning_rate": 8.43518134474806e-05, "loss": 0.0454, "step": 4034 }, { "epoch": 0.5188820781002186, "grad_norm": 0.2060546875, "learning_rate": 8.434446892297242e-05, "loss": 0.0496, "step": 4035 }, { "epoch": 0.5190106734107763, "grad_norm": 0.193359375, "learning_rate": 8.43371229951894e-05, "loss": 0.0556, "step": 4036 }, { "epoch": 0.519139268721334, "grad_norm": 0.1767578125, "learning_rate": 8.432977566443168e-05, "loss": 0.0415, "step": 4037 }, { "epoch": 0.5192678640318916, "grad_norm": 0.197265625, "learning_rate": 8.432242693099946e-05, "loss": 0.0537, "step": 4038 }, { "epoch": 0.5193964593424493, "grad_norm": 0.2001953125, "learning_rate": 8.431507679519302e-05, "loss": 0.0548, "step": 4039 }, { "epoch": 0.519525054653007, "grad_norm": 0.1796875, "learning_rate": 8.430772525731265e-05, "loss": 0.0389, "step": 4040 }, { "epoch": 0.5196536499635647, "grad_norm": 0.1435546875, "learning_rate": 8.430037231765875e-05, "loss": 0.0312, "step": 4041 }, { "epoch": 0.5197822452741223, "grad_norm": 0.185546875, "learning_rate": 8.429301797653174e-05, "loss": 0.0464, "step": 4042 }, { "epoch": 0.5199108405846801, "grad_norm": 0.1943359375, "learning_rate": 8.428566223423209e-05, "loss": 0.0436, "step": 4043 }, { "epoch": 0.5200394358952377, "grad_norm": 0.1708984375, "learning_rate": 8.427830509106039e-05, "loss": 0.0444, "step": 4044 }, { "epoch": 0.5201680312057954, "grad_norm": 0.1904296875, "learning_rate": 8.427094654731722e-05, "loss": 0.0499, "step": 4045 }, { "epoch": 0.520296626516353, "grad_norm": 0.1669921875, "learning_rate": 8.426358660330325e-05, "loss": 0.0448, "step": 4046 }, { "epoch": 0.5204252218269108, "grad_norm": 0.1767578125, "learning_rate": 8.425622525931917e-05, "loss": 0.0404, "step": 4047 }, { "epoch": 0.5205538171374684, "grad_norm": 0.1572265625, "learning_rate": 8.424886251566579e-05, "loss": 0.0404, "step": 4048 }, { "epoch": 0.520682412448026, "grad_norm": 0.1630859375, "learning_rate": 8.424149837264394e-05, "loss": 0.038, "step": 4049 }, { "epoch": 0.5208110077585837, "grad_norm": 0.240234375, "learning_rate": 8.423413283055452e-05, "loss": 0.0481, "step": 4050 }, { "epoch": 0.5209396030691414, "grad_norm": 0.173828125, "learning_rate": 8.422676588969843e-05, "loss": 0.0419, "step": 4051 }, { "epoch": 0.5210681983796991, "grad_norm": 0.1591796875, "learning_rate": 8.421939755037671e-05, "loss": 0.0377, "step": 4052 }, { "epoch": 0.5211967936902567, "grad_norm": 0.1875, "learning_rate": 8.421202781289043e-05, "loss": 0.046, "step": 4053 }, { "epoch": 0.5213253890008145, "grad_norm": 0.1943359375, "learning_rate": 8.420465667754067e-05, "loss": 0.0486, "step": 4054 }, { "epoch": 0.5214539843113721, "grad_norm": 0.171875, "learning_rate": 8.419728414462865e-05, "loss": 0.0394, "step": 4055 }, { "epoch": 0.5215825796219298, "grad_norm": 0.1875, "learning_rate": 8.41899102144556e-05, "loss": 0.0439, "step": 4056 }, { "epoch": 0.5217111749324874, "grad_norm": 0.169921875, "learning_rate": 8.418253488732275e-05, "loss": 0.0379, "step": 4057 }, { "epoch": 0.5218397702430452, "grad_norm": 0.17578125, "learning_rate": 8.417515816353152e-05, "loss": 0.0435, "step": 4058 }, { "epoch": 0.5219683655536028, "grad_norm": 0.171875, "learning_rate": 8.416778004338328e-05, "loss": 0.0434, "step": 4059 }, { "epoch": 0.5220969608641605, "grad_norm": 0.20703125, "learning_rate": 8.416040052717949e-05, "loss": 0.0619, "step": 4060 }, { "epoch": 0.5222255561747181, "grad_norm": 0.1728515625, "learning_rate": 8.415301961522168e-05, "loss": 0.0427, "step": 4061 }, { "epoch": 0.5223541514852759, "grad_norm": 0.17578125, "learning_rate": 8.414563730781144e-05, "loss": 0.0377, "step": 4062 }, { "epoch": 0.5224827467958335, "grad_norm": 0.19140625, "learning_rate": 8.413825360525037e-05, "loss": 0.0525, "step": 4063 }, { "epoch": 0.5226113421063912, "grad_norm": 0.173828125, "learning_rate": 8.413086850784017e-05, "loss": 0.0447, "step": 4064 }, { "epoch": 0.5227399374169489, "grad_norm": 0.181640625, "learning_rate": 8.412348201588258e-05, "loss": 0.0496, "step": 4065 }, { "epoch": 0.5228685327275066, "grad_norm": 0.193359375, "learning_rate": 8.411609412967943e-05, "loss": 0.046, "step": 4066 }, { "epoch": 0.5229971280380642, "grad_norm": 0.189453125, "learning_rate": 8.410870484953256e-05, "loss": 0.0478, "step": 4067 }, { "epoch": 0.5231257233486218, "grad_norm": 0.177734375, "learning_rate": 8.410131417574388e-05, "loss": 0.0517, "step": 4068 }, { "epoch": 0.5232543186591796, "grad_norm": 0.1533203125, "learning_rate": 8.409392210861537e-05, "loss": 0.0396, "step": 4069 }, { "epoch": 0.5233829139697372, "grad_norm": 0.1943359375, "learning_rate": 8.408652864844908e-05, "loss": 0.0467, "step": 4070 }, { "epoch": 0.5235115092802949, "grad_norm": 0.162109375, "learning_rate": 8.407913379554708e-05, "loss": 0.0396, "step": 4071 }, { "epoch": 0.5236401045908526, "grad_norm": 0.1865234375, "learning_rate": 8.407173755021153e-05, "loss": 0.0505, "step": 4072 }, { "epoch": 0.5237686999014103, "grad_norm": 0.16015625, "learning_rate": 8.40643399127446e-05, "loss": 0.0423, "step": 4073 }, { "epoch": 0.5238972952119679, "grad_norm": 0.1748046875, "learning_rate": 8.40569408834486e-05, "loss": 0.0473, "step": 4074 }, { "epoch": 0.5240258905225256, "grad_norm": 0.16796875, "learning_rate": 8.404954046262579e-05, "loss": 0.0458, "step": 4075 }, { "epoch": 0.5241544858330833, "grad_norm": 0.1787109375, "learning_rate": 8.404213865057857e-05, "loss": 0.0454, "step": 4076 }, { "epoch": 0.524283081143641, "grad_norm": 0.162109375, "learning_rate": 8.403473544760938e-05, "loss": 0.0431, "step": 4077 }, { "epoch": 0.5244116764541986, "grad_norm": 0.2099609375, "learning_rate": 8.402733085402068e-05, "loss": 0.0613, "step": 4078 }, { "epoch": 0.5245402717647563, "grad_norm": 0.1611328125, "learning_rate": 8.401992487011503e-05, "loss": 0.0436, "step": 4079 }, { "epoch": 0.524668867075314, "grad_norm": 0.2177734375, "learning_rate": 8.401251749619503e-05, "loss": 0.0558, "step": 4080 }, { "epoch": 0.5247974623858717, "grad_norm": 0.169921875, "learning_rate": 8.400510873256334e-05, "loss": 0.042, "step": 4081 }, { "epoch": 0.5249260576964293, "grad_norm": 0.181640625, "learning_rate": 8.399769857952265e-05, "loss": 0.0436, "step": 4082 }, { "epoch": 0.5250546530069871, "grad_norm": 0.193359375, "learning_rate": 8.399028703737576e-05, "loss": 0.0496, "step": 4083 }, { "epoch": 0.5251832483175447, "grad_norm": 0.19140625, "learning_rate": 8.398287410642548e-05, "loss": 0.0532, "step": 4084 }, { "epoch": 0.5253118436281023, "grad_norm": 0.1689453125, "learning_rate": 8.39754597869747e-05, "loss": 0.0407, "step": 4085 }, { "epoch": 0.52544043893866, "grad_norm": 0.17578125, "learning_rate": 8.396804407932636e-05, "loss": 0.0497, "step": 4086 }, { "epoch": 0.5255690342492177, "grad_norm": 0.2216796875, "learning_rate": 8.396062698378344e-05, "loss": 0.0506, "step": 4087 }, { "epoch": 0.5256976295597754, "grad_norm": 0.177734375, "learning_rate": 8.395320850064902e-05, "loss": 0.0456, "step": 4088 }, { "epoch": 0.525826224870333, "grad_norm": 0.19140625, "learning_rate": 8.39457886302262e-05, "loss": 0.0519, "step": 4089 }, { "epoch": 0.5259548201808908, "grad_norm": 0.1796875, "learning_rate": 8.393836737281816e-05, "loss": 0.0485, "step": 4090 }, { "epoch": 0.5260834154914484, "grad_norm": 0.1640625, "learning_rate": 8.393094472872809e-05, "loss": 0.0398, "step": 4091 }, { "epoch": 0.5262120108020061, "grad_norm": 0.1982421875, "learning_rate": 8.392352069825928e-05, "loss": 0.0548, "step": 4092 }, { "epoch": 0.5263406061125637, "grad_norm": 0.173828125, "learning_rate": 8.39160952817151e-05, "loss": 0.0481, "step": 4093 }, { "epoch": 0.5264692014231215, "grad_norm": 0.2001953125, "learning_rate": 8.390866847939892e-05, "loss": 0.0497, "step": 4094 }, { "epoch": 0.5265977967336791, "grad_norm": 0.201171875, "learning_rate": 8.39012402916142e-05, "loss": 0.0528, "step": 4095 }, { "epoch": 0.5267263920442368, "grad_norm": 0.16796875, "learning_rate": 8.389381071866442e-05, "loss": 0.0434, "step": 4096 }, { "epoch": 0.5268549873547944, "grad_norm": 0.1845703125, "learning_rate": 8.388637976085317e-05, "loss": 0.0551, "step": 4097 }, { "epoch": 0.5269835826653522, "grad_norm": 0.1748046875, "learning_rate": 8.387894741848408e-05, "loss": 0.0506, "step": 4098 }, { "epoch": 0.5271121779759098, "grad_norm": 0.197265625, "learning_rate": 8.387151369186079e-05, "loss": 0.0564, "step": 4099 }, { "epoch": 0.5272407732864675, "grad_norm": 0.19921875, "learning_rate": 8.386407858128706e-05, "loss": 0.0531, "step": 4100 }, { "epoch": 0.5273693685970252, "grad_norm": 0.1787109375, "learning_rate": 8.385664208706667e-05, "loss": 0.037, "step": 4101 }, { "epoch": 0.5274979639075829, "grad_norm": 0.169921875, "learning_rate": 8.384920420950349e-05, "loss": 0.0438, "step": 4102 }, { "epoch": 0.5276265592181405, "grad_norm": 0.1982421875, "learning_rate": 8.384176494890137e-05, "loss": 0.0499, "step": 4103 }, { "epoch": 0.5277551545286981, "grad_norm": 0.181640625, "learning_rate": 8.383432430556432e-05, "loss": 0.0455, "step": 4104 }, { "epoch": 0.5278837498392559, "grad_norm": 0.1953125, "learning_rate": 8.382688227979635e-05, "loss": 0.0493, "step": 4105 }, { "epoch": 0.5280123451498135, "grad_norm": 0.1787109375, "learning_rate": 8.381943887190151e-05, "loss": 0.0492, "step": 4106 }, { "epoch": 0.5281409404603712, "grad_norm": 0.1962890625, "learning_rate": 8.381199408218395e-05, "loss": 0.056, "step": 4107 }, { "epoch": 0.5282695357709288, "grad_norm": 0.1826171875, "learning_rate": 8.380454791094781e-05, "loss": 0.0498, "step": 4108 }, { "epoch": 0.5283981310814866, "grad_norm": 0.201171875, "learning_rate": 8.37971003584974e-05, "loss": 0.0523, "step": 4109 }, { "epoch": 0.5285267263920442, "grad_norm": 0.1962890625, "learning_rate": 8.3789651425137e-05, "loss": 0.0529, "step": 4110 }, { "epoch": 0.5286553217026019, "grad_norm": 0.1640625, "learning_rate": 8.378220111117092e-05, "loss": 0.0409, "step": 4111 }, { "epoch": 0.5287839170131596, "grad_norm": 0.181640625, "learning_rate": 8.37747494169036e-05, "loss": 0.0433, "step": 4112 }, { "epoch": 0.5289125123237173, "grad_norm": 0.1865234375, "learning_rate": 8.376729634263953e-05, "loss": 0.0523, "step": 4113 }, { "epoch": 0.5290411076342749, "grad_norm": 0.1806640625, "learning_rate": 8.375984188868319e-05, "loss": 0.0457, "step": 4114 }, { "epoch": 0.5291697029448326, "grad_norm": 0.16015625, "learning_rate": 8.37523860553392e-05, "loss": 0.0314, "step": 4115 }, { "epoch": 0.5292982982553903, "grad_norm": 0.197265625, "learning_rate": 8.374492884291219e-05, "loss": 0.0525, "step": 4116 }, { "epoch": 0.529426893565948, "grad_norm": 0.220703125, "learning_rate": 8.373747025170681e-05, "loss": 0.0558, "step": 4117 }, { "epoch": 0.5295554888765056, "grad_norm": 0.1884765625, "learning_rate": 8.373001028202787e-05, "loss": 0.046, "step": 4118 }, { "epoch": 0.5296840841870634, "grad_norm": 0.19140625, "learning_rate": 8.372254893418013e-05, "loss": 0.0528, "step": 4119 }, { "epoch": 0.529812679497621, "grad_norm": 0.181640625, "learning_rate": 8.371508620846849e-05, "loss": 0.0461, "step": 4120 }, { "epoch": 0.5299412748081787, "grad_norm": 0.1875, "learning_rate": 8.370762210519783e-05, "loss": 0.0489, "step": 4121 }, { "epoch": 0.5300698701187363, "grad_norm": 0.1767578125, "learning_rate": 8.370015662467314e-05, "loss": 0.0405, "step": 4122 }, { "epoch": 0.530198465429294, "grad_norm": 0.18359375, "learning_rate": 8.369268976719948e-05, "loss": 0.0415, "step": 4123 }, { "epoch": 0.5303270607398517, "grad_norm": 0.162109375, "learning_rate": 8.368522153308189e-05, "loss": 0.0403, "step": 4124 }, { "epoch": 0.5304556560504093, "grad_norm": 0.1865234375, "learning_rate": 8.367775192262553e-05, "loss": 0.0508, "step": 4125 }, { "epoch": 0.530584251360967, "grad_norm": 0.173828125, "learning_rate": 8.36702809361356e-05, "loss": 0.0448, "step": 4126 }, { "epoch": 0.5307128466715247, "grad_norm": 0.1826171875, "learning_rate": 8.366280857391737e-05, "loss": 0.0524, "step": 4127 }, { "epoch": 0.5308414419820824, "grad_norm": 0.1962890625, "learning_rate": 8.365533483627614e-05, "loss": 0.0519, "step": 4128 }, { "epoch": 0.53097003729264, "grad_norm": 0.162109375, "learning_rate": 8.364785972351728e-05, "loss": 0.041, "step": 4129 }, { "epoch": 0.5310986326031978, "grad_norm": 0.16015625, "learning_rate": 8.364038323594621e-05, "loss": 0.0404, "step": 4130 }, { "epoch": 0.5312272279137554, "grad_norm": 0.17578125, "learning_rate": 8.363290537386841e-05, "loss": 0.0395, "step": 4131 }, { "epoch": 0.5313558232243131, "grad_norm": 0.1669921875, "learning_rate": 8.362542613758942e-05, "loss": 0.0429, "step": 4132 }, { "epoch": 0.5314844185348707, "grad_norm": 0.158203125, "learning_rate": 8.361794552741484e-05, "loss": 0.042, "step": 4133 }, { "epoch": 0.5316130138454285, "grad_norm": 0.18359375, "learning_rate": 8.361046354365032e-05, "loss": 0.044, "step": 4134 }, { "epoch": 0.5317416091559861, "grad_norm": 0.1904296875, "learning_rate": 8.360298018660154e-05, "loss": 0.0484, "step": 4135 }, { "epoch": 0.5318702044665438, "grad_norm": 0.2080078125, "learning_rate": 8.359549545657429e-05, "loss": 0.0509, "step": 4136 }, { "epoch": 0.5319987997771015, "grad_norm": 0.166015625, "learning_rate": 8.358800935387438e-05, "loss": 0.0393, "step": 4137 }, { "epoch": 0.5321273950876592, "grad_norm": 0.169921875, "learning_rate": 8.358052187880768e-05, "loss": 0.0385, "step": 4138 }, { "epoch": 0.5322559903982168, "grad_norm": 0.1640625, "learning_rate": 8.357303303168009e-05, "loss": 0.04, "step": 4139 }, { "epoch": 0.5323845857087744, "grad_norm": 0.1767578125, "learning_rate": 8.356554281279764e-05, "loss": 0.0461, "step": 4140 }, { "epoch": 0.5325131810193322, "grad_norm": 0.1640625, "learning_rate": 8.355805122246639e-05, "loss": 0.0347, "step": 4141 }, { "epoch": 0.5326417763298898, "grad_norm": 0.189453125, "learning_rate": 8.355055826099236e-05, "loss": 0.0537, "step": 4142 }, { "epoch": 0.5327703716404475, "grad_norm": 0.1826171875, "learning_rate": 8.354306392868176e-05, "loss": 0.0475, "step": 4143 }, { "epoch": 0.5328989669510051, "grad_norm": 0.1796875, "learning_rate": 8.35355682258408e-05, "loss": 0.0456, "step": 4144 }, { "epoch": 0.5330275622615629, "grad_norm": 0.173828125, "learning_rate": 8.352807115277572e-05, "loss": 0.0379, "step": 4145 }, { "epoch": 0.5331561575721205, "grad_norm": 0.2041015625, "learning_rate": 8.352057270979287e-05, "loss": 0.0531, "step": 4146 }, { "epoch": 0.5332847528826782, "grad_norm": 0.1630859375, "learning_rate": 8.35130728971986e-05, "loss": 0.0405, "step": 4147 }, { "epoch": 0.5334133481932359, "grad_norm": 0.189453125, "learning_rate": 8.350557171529936e-05, "loss": 0.04, "step": 4148 }, { "epoch": 0.5335419435037936, "grad_norm": 0.177734375, "learning_rate": 8.349806916440163e-05, "loss": 0.0513, "step": 4149 }, { "epoch": 0.5336705388143512, "grad_norm": 0.1875, "learning_rate": 8.349056524481196e-05, "loss": 0.0482, "step": 4150 }, { "epoch": 0.5337991341249089, "grad_norm": 0.2119140625, "learning_rate": 8.348305995683695e-05, "loss": 0.0543, "step": 4151 }, { "epoch": 0.5339277294354666, "grad_norm": 0.189453125, "learning_rate": 8.347555330078327e-05, "loss": 0.0477, "step": 4152 }, { "epoch": 0.5340563247460243, "grad_norm": 0.19140625, "learning_rate": 8.34680452769576e-05, "loss": 0.0504, "step": 4153 }, { "epoch": 0.5341849200565819, "grad_norm": 0.1669921875, "learning_rate": 8.346053588566675e-05, "loss": 0.0403, "step": 4154 }, { "epoch": 0.5343135153671396, "grad_norm": 0.189453125, "learning_rate": 8.345302512721753e-05, "loss": 0.0472, "step": 4155 }, { "epoch": 0.5344421106776973, "grad_norm": 0.201171875, "learning_rate": 8.344551300191681e-05, "loss": 0.0561, "step": 4156 }, { "epoch": 0.534570705988255, "grad_norm": 0.2021484375, "learning_rate": 8.343799951007155e-05, "loss": 0.0518, "step": 4157 }, { "epoch": 0.5346993012988126, "grad_norm": 0.203125, "learning_rate": 8.34304846519887e-05, "loss": 0.0482, "step": 4158 }, { "epoch": 0.5348278966093704, "grad_norm": 0.1669921875, "learning_rate": 8.342296842797535e-05, "loss": 0.0453, "step": 4159 }, { "epoch": 0.534956491919928, "grad_norm": 0.1650390625, "learning_rate": 8.341545083833859e-05, "loss": 0.0414, "step": 4160 }, { "epoch": 0.5350850872304856, "grad_norm": 0.1826171875, "learning_rate": 8.340793188338556e-05, "loss": 0.0427, "step": 4161 }, { "epoch": 0.5352136825410433, "grad_norm": 0.1728515625, "learning_rate": 8.340041156342351e-05, "loss": 0.0503, "step": 4162 }, { "epoch": 0.535342277851601, "grad_norm": 0.173828125, "learning_rate": 8.339288987875969e-05, "loss": 0.0478, "step": 4163 }, { "epoch": 0.5354708731621587, "grad_norm": 0.1806640625, "learning_rate": 8.338536682970144e-05, "loss": 0.0457, "step": 4164 }, { "epoch": 0.5355994684727163, "grad_norm": 0.181640625, "learning_rate": 8.337784241655613e-05, "loss": 0.0437, "step": 4165 }, { "epoch": 0.5357280637832741, "grad_norm": 0.177734375, "learning_rate": 8.33703166396312e-05, "loss": 0.043, "step": 4166 }, { "epoch": 0.5358566590938317, "grad_norm": 0.1787109375, "learning_rate": 8.336278949923417e-05, "loss": 0.0494, "step": 4167 }, { "epoch": 0.5359852544043894, "grad_norm": 0.1923828125, "learning_rate": 8.335526099567256e-05, "loss": 0.0474, "step": 4168 }, { "epoch": 0.536113849714947, "grad_norm": 0.1787109375, "learning_rate": 8.334773112925398e-05, "loss": 0.0469, "step": 4169 }, { "epoch": 0.5362424450255048, "grad_norm": 0.2080078125, "learning_rate": 8.33401999002861e-05, "loss": 0.0496, "step": 4170 }, { "epoch": 0.5363710403360624, "grad_norm": 0.1884765625, "learning_rate": 8.333266730907663e-05, "loss": 0.0545, "step": 4171 }, { "epoch": 0.5364996356466201, "grad_norm": 0.173828125, "learning_rate": 8.332513335593334e-05, "loss": 0.044, "step": 4172 }, { "epoch": 0.5366282309571777, "grad_norm": 0.1708984375, "learning_rate": 8.331759804116409e-05, "loss": 0.0456, "step": 4173 }, { "epoch": 0.5367568262677355, "grad_norm": 0.1796875, "learning_rate": 8.331006136507673e-05, "loss": 0.0444, "step": 4174 }, { "epoch": 0.5368854215782931, "grad_norm": 0.181640625, "learning_rate": 8.330252332797921e-05, "loss": 0.0446, "step": 4175 }, { "epoch": 0.5370140168888508, "grad_norm": 0.1875, "learning_rate": 8.32949839301795e-05, "loss": 0.0572, "step": 4176 }, { "epoch": 0.5371426121994085, "grad_norm": 0.1845703125, "learning_rate": 8.32874431719857e-05, "loss": 0.0461, "step": 4177 }, { "epoch": 0.5372712075099662, "grad_norm": 0.1630859375, "learning_rate": 8.327990105370589e-05, "loss": 0.0407, "step": 4178 }, { "epoch": 0.5373998028205238, "grad_norm": 0.1943359375, "learning_rate": 8.327235757564825e-05, "loss": 0.0473, "step": 4179 }, { "epoch": 0.5375283981310814, "grad_norm": 0.193359375, "learning_rate": 8.326481273812095e-05, "loss": 0.0476, "step": 4180 }, { "epoch": 0.5376569934416392, "grad_norm": 0.1796875, "learning_rate": 8.325726654143231e-05, "loss": 0.0454, "step": 4181 }, { "epoch": 0.5377855887521968, "grad_norm": 0.1640625, "learning_rate": 8.324971898589064e-05, "loss": 0.0426, "step": 4182 }, { "epoch": 0.5379141840627545, "grad_norm": 0.169921875, "learning_rate": 8.324217007180434e-05, "loss": 0.0439, "step": 4183 }, { "epoch": 0.5380427793733122, "grad_norm": 0.1728515625, "learning_rate": 8.323461979948182e-05, "loss": 0.044, "step": 4184 }, { "epoch": 0.5381713746838699, "grad_norm": 0.1650390625, "learning_rate": 8.322706816923162e-05, "loss": 0.041, "step": 4185 }, { "epoch": 0.5382999699944275, "grad_norm": 0.177734375, "learning_rate": 8.321951518136224e-05, "loss": 0.0445, "step": 4186 }, { "epoch": 0.5384285653049852, "grad_norm": 0.1845703125, "learning_rate": 8.321196083618232e-05, "loss": 0.0506, "step": 4187 }, { "epoch": 0.5385571606155429, "grad_norm": 0.1884765625, "learning_rate": 8.320440513400052e-05, "loss": 0.0482, "step": 4188 }, { "epoch": 0.5386857559261006, "grad_norm": 0.181640625, "learning_rate": 8.319684807512553e-05, "loss": 0.0391, "step": 4189 }, { "epoch": 0.5388143512366582, "grad_norm": 0.1953125, "learning_rate": 8.318928965986615e-05, "loss": 0.0486, "step": 4190 }, { "epoch": 0.5389429465472159, "grad_norm": 0.1650390625, "learning_rate": 8.318172988853121e-05, "loss": 0.0402, "step": 4191 }, { "epoch": 0.5390715418577736, "grad_norm": 0.2041015625, "learning_rate": 8.317416876142958e-05, "loss": 0.0508, "step": 4192 }, { "epoch": 0.5392001371683313, "grad_norm": 0.1484375, "learning_rate": 8.31666062788702e-05, "loss": 0.0407, "step": 4193 }, { "epoch": 0.5393287324788889, "grad_norm": 0.185546875, "learning_rate": 8.315904244116206e-05, "loss": 0.0421, "step": 4194 }, { "epoch": 0.5394573277894467, "grad_norm": 0.1494140625, "learning_rate": 8.315147724861423e-05, "loss": 0.039, "step": 4195 }, { "epoch": 0.5395859231000043, "grad_norm": 0.169921875, "learning_rate": 8.314391070153579e-05, "loss": 0.0447, "step": 4196 }, { "epoch": 0.539714518410562, "grad_norm": 0.1796875, "learning_rate": 8.313634280023591e-05, "loss": 0.0508, "step": 4197 }, { "epoch": 0.5398431137211196, "grad_norm": 0.169921875, "learning_rate": 8.312877354502382e-05, "loss": 0.0394, "step": 4198 }, { "epoch": 0.5399717090316773, "grad_norm": 0.1962890625, "learning_rate": 8.312120293620876e-05, "loss": 0.0567, "step": 4199 }, { "epoch": 0.540100304342235, "grad_norm": 0.1796875, "learning_rate": 8.311363097410009e-05, "loss": 0.0454, "step": 4200 }, { "epoch": 0.5402288996527926, "grad_norm": 0.16796875, "learning_rate": 8.310605765900717e-05, "loss": 0.0418, "step": 4201 }, { "epoch": 0.5403574949633503, "grad_norm": 0.2080078125, "learning_rate": 8.309848299123943e-05, "loss": 0.0512, "step": 4202 }, { "epoch": 0.540486090273908, "grad_norm": 0.1748046875, "learning_rate": 8.309090697110639e-05, "loss": 0.0413, "step": 4203 }, { "epoch": 0.5406146855844657, "grad_norm": 0.16796875, "learning_rate": 8.308332959891758e-05, "loss": 0.0427, "step": 4204 }, { "epoch": 0.5407432808950233, "grad_norm": 0.19921875, "learning_rate": 8.307575087498261e-05, "loss": 0.0584, "step": 4205 }, { "epoch": 0.5408718762055811, "grad_norm": 0.1796875, "learning_rate": 8.30681707996111e-05, "loss": 0.049, "step": 4206 }, { "epoch": 0.5410004715161387, "grad_norm": 0.1708984375, "learning_rate": 8.306058937311283e-05, "loss": 0.0431, "step": 4207 }, { "epoch": 0.5411290668266964, "grad_norm": 0.2060546875, "learning_rate": 8.305300659579752e-05, "loss": 0.0535, "step": 4208 }, { "epoch": 0.541257662137254, "grad_norm": 0.1884765625, "learning_rate": 8.3045422467975e-05, "loss": 0.0529, "step": 4209 }, { "epoch": 0.5413862574478118, "grad_norm": 0.1748046875, "learning_rate": 8.303783698995518e-05, "loss": 0.0432, "step": 4210 }, { "epoch": 0.5415148527583694, "grad_norm": 0.1689453125, "learning_rate": 8.303025016204795e-05, "loss": 0.0445, "step": 4211 }, { "epoch": 0.5416434480689271, "grad_norm": 0.1865234375, "learning_rate": 8.302266198456332e-05, "loss": 0.051, "step": 4212 }, { "epoch": 0.5417720433794848, "grad_norm": 0.1611328125, "learning_rate": 8.301507245781132e-05, "loss": 0.0413, "step": 4213 }, { "epoch": 0.5419006386900425, "grad_norm": 0.1923828125, "learning_rate": 8.300748158210208e-05, "loss": 0.0463, "step": 4214 }, { "epoch": 0.5420292340006001, "grad_norm": 0.17578125, "learning_rate": 8.299988935774573e-05, "loss": 0.0523, "step": 4215 }, { "epoch": 0.5421578293111577, "grad_norm": 0.181640625, "learning_rate": 8.29922957850525e-05, "loss": 0.044, "step": 4216 }, { "epoch": 0.5422864246217155, "grad_norm": 0.1669921875, "learning_rate": 8.298470086433261e-05, "loss": 0.0392, "step": 4217 }, { "epoch": 0.5424150199322731, "grad_norm": 0.1728515625, "learning_rate": 8.297710459589643e-05, "loss": 0.0445, "step": 4218 }, { "epoch": 0.5425436152428308, "grad_norm": 0.1875, "learning_rate": 8.29695069800543e-05, "loss": 0.0452, "step": 4219 }, { "epoch": 0.5426722105533884, "grad_norm": 0.171875, "learning_rate": 8.296190801711668e-05, "loss": 0.0429, "step": 4220 }, { "epoch": 0.5428008058639462, "grad_norm": 0.19140625, "learning_rate": 8.295430770739404e-05, "loss": 0.047, "step": 4221 }, { "epoch": 0.5429294011745038, "grad_norm": 0.1748046875, "learning_rate": 8.29467060511969e-05, "loss": 0.0455, "step": 4222 }, { "epoch": 0.5430579964850615, "grad_norm": 0.271484375, "learning_rate": 8.293910304883588e-05, "loss": 0.054, "step": 4223 }, { "epoch": 0.5431865917956192, "grad_norm": 0.173828125, "learning_rate": 8.29314987006216e-05, "loss": 0.0414, "step": 4224 }, { "epoch": 0.5433151871061769, "grad_norm": 0.1962890625, "learning_rate": 8.292389300686482e-05, "loss": 0.0538, "step": 4225 }, { "epoch": 0.5434437824167345, "grad_norm": 0.169921875, "learning_rate": 8.291628596787625e-05, "loss": 0.0454, "step": 4226 }, { "epoch": 0.5435723777272922, "grad_norm": 0.1787109375, "learning_rate": 8.290867758396672e-05, "loss": 0.0465, "step": 4227 }, { "epoch": 0.5437009730378499, "grad_norm": 0.2109375, "learning_rate": 8.290106785544713e-05, "loss": 0.049, "step": 4228 }, { "epoch": 0.5438295683484076, "grad_norm": 0.1923828125, "learning_rate": 8.289345678262836e-05, "loss": 0.0544, "step": 4229 }, { "epoch": 0.5439581636589652, "grad_norm": 0.1708984375, "learning_rate": 8.288584436582138e-05, "loss": 0.0443, "step": 4230 }, { "epoch": 0.544086758969523, "grad_norm": 0.1943359375, "learning_rate": 8.287823060533728e-05, "loss": 0.0525, "step": 4231 }, { "epoch": 0.5442153542800806, "grad_norm": 0.2314453125, "learning_rate": 8.287061550148711e-05, "loss": 0.053, "step": 4232 }, { "epoch": 0.5443439495906383, "grad_norm": 0.193359375, "learning_rate": 8.286299905458203e-05, "loss": 0.05, "step": 4233 }, { "epoch": 0.5444725449011959, "grad_norm": 0.171875, "learning_rate": 8.285538126493322e-05, "loss": 0.0497, "step": 4234 }, { "epoch": 0.5446011402117537, "grad_norm": 0.1669921875, "learning_rate": 8.284776213285195e-05, "loss": 0.0453, "step": 4235 }, { "epoch": 0.5447297355223113, "grad_norm": 0.169921875, "learning_rate": 8.284014165864953e-05, "loss": 0.0404, "step": 4236 }, { "epoch": 0.5448583308328689, "grad_norm": 0.1689453125, "learning_rate": 8.283251984263732e-05, "loss": 0.0433, "step": 4237 }, { "epoch": 0.5449869261434266, "grad_norm": 0.18359375, "learning_rate": 8.282489668512674e-05, "loss": 0.0573, "step": 4238 }, { "epoch": 0.5451155214539843, "grad_norm": 0.1943359375, "learning_rate": 8.281727218642926e-05, "loss": 0.0485, "step": 4239 }, { "epoch": 0.545244116764542, "grad_norm": 0.1845703125, "learning_rate": 8.28096463468564e-05, "loss": 0.0468, "step": 4240 }, { "epoch": 0.5453727120750996, "grad_norm": 0.1982421875, "learning_rate": 8.280201916671976e-05, "loss": 0.0424, "step": 4241 }, { "epoch": 0.5455013073856574, "grad_norm": 0.1708984375, "learning_rate": 8.279439064633096e-05, "loss": 0.0372, "step": 4242 }, { "epoch": 0.545629902696215, "grad_norm": 0.18359375, "learning_rate": 8.278676078600172e-05, "loss": 0.0495, "step": 4243 }, { "epoch": 0.5457584980067727, "grad_norm": 0.201171875, "learning_rate": 8.277912958604375e-05, "loss": 0.057, "step": 4244 }, { "epoch": 0.5458870933173303, "grad_norm": 0.1962890625, "learning_rate": 8.277149704676888e-05, "loss": 0.0433, "step": 4245 }, { "epoch": 0.5460156886278881, "grad_norm": 0.1962890625, "learning_rate": 8.276386316848895e-05, "loss": 0.056, "step": 4246 }, { "epoch": 0.5461442839384457, "grad_norm": 0.1826171875, "learning_rate": 8.275622795151589e-05, "loss": 0.0492, "step": 4247 }, { "epoch": 0.5462728792490034, "grad_norm": 0.19140625, "learning_rate": 8.274859139616164e-05, "loss": 0.0464, "step": 4248 }, { "epoch": 0.546401474559561, "grad_norm": 0.1748046875, "learning_rate": 8.274095350273826e-05, "loss": 0.0427, "step": 4249 }, { "epoch": 0.5465300698701188, "grad_norm": 0.201171875, "learning_rate": 8.273331427155779e-05, "loss": 0.0506, "step": 4250 }, { "epoch": 0.5466586651806764, "grad_norm": 0.201171875, "learning_rate": 8.272567370293235e-05, "loss": 0.042, "step": 4251 }, { "epoch": 0.546787260491234, "grad_norm": 0.1748046875, "learning_rate": 8.271803179717419e-05, "loss": 0.0425, "step": 4252 }, { "epoch": 0.5469158558017918, "grad_norm": 0.1630859375, "learning_rate": 8.271038855459548e-05, "loss": 0.0401, "step": 4253 }, { "epoch": 0.5470444511123495, "grad_norm": 0.212890625, "learning_rate": 8.270274397550854e-05, "loss": 0.0451, "step": 4254 }, { "epoch": 0.5471730464229071, "grad_norm": 0.197265625, "learning_rate": 8.269509806022572e-05, "loss": 0.0455, "step": 4255 }, { "epoch": 0.5473016417334647, "grad_norm": 0.1689453125, "learning_rate": 8.268745080905942e-05, "loss": 0.0407, "step": 4256 }, { "epoch": 0.5474302370440225, "grad_norm": 0.1767578125, "learning_rate": 8.26798022223221e-05, "loss": 0.044, "step": 4257 }, { "epoch": 0.5475588323545801, "grad_norm": 0.1669921875, "learning_rate": 8.267215230032625e-05, "loss": 0.0412, "step": 4258 }, { "epoch": 0.5476874276651378, "grad_norm": 0.166015625, "learning_rate": 8.26645010433845e-05, "loss": 0.0451, "step": 4259 }, { "epoch": 0.5478160229756955, "grad_norm": 0.1923828125, "learning_rate": 8.26568484518094e-05, "loss": 0.044, "step": 4260 }, { "epoch": 0.5479446182862532, "grad_norm": 0.1611328125, "learning_rate": 8.264919452591366e-05, "loss": 0.037, "step": 4261 }, { "epoch": 0.5480732135968108, "grad_norm": 0.1865234375, "learning_rate": 8.264153926601e-05, "loss": 0.0497, "step": 4262 }, { "epoch": 0.5482018089073685, "grad_norm": 0.1826171875, "learning_rate": 8.263388267241122e-05, "loss": 0.0426, "step": 4263 }, { "epoch": 0.5483304042179262, "grad_norm": 0.1552734375, "learning_rate": 8.262622474543016e-05, "loss": 0.0393, "step": 4264 }, { "epoch": 0.5484589995284839, "grad_norm": 0.166015625, "learning_rate": 8.26185654853797e-05, "loss": 0.0408, "step": 4265 }, { "epoch": 0.5485875948390415, "grad_norm": 0.2275390625, "learning_rate": 8.261090489257278e-05, "loss": 0.0572, "step": 4266 }, { "epoch": 0.5487161901495992, "grad_norm": 0.177734375, "learning_rate": 8.260324296732243e-05, "loss": 0.0489, "step": 4267 }, { "epoch": 0.5488447854601569, "grad_norm": 0.1845703125, "learning_rate": 8.259557970994168e-05, "loss": 0.0512, "step": 4268 }, { "epoch": 0.5489733807707146, "grad_norm": 0.1689453125, "learning_rate": 8.258791512074364e-05, "loss": 0.0401, "step": 4269 }, { "epoch": 0.5491019760812722, "grad_norm": 0.16015625, "learning_rate": 8.258024920004151e-05, "loss": 0.0346, "step": 4270 }, { "epoch": 0.54923057139183, "grad_norm": 0.1669921875, "learning_rate": 8.257258194814849e-05, "loss": 0.0428, "step": 4271 }, { "epoch": 0.5493591667023876, "grad_norm": 0.193359375, "learning_rate": 8.256491336537786e-05, "loss": 0.0507, "step": 4272 }, { "epoch": 0.5494877620129452, "grad_norm": 0.2099609375, "learning_rate": 8.255724345204294e-05, "loss": 0.0524, "step": 4273 }, { "epoch": 0.5496163573235029, "grad_norm": 0.201171875, "learning_rate": 8.254957220845712e-05, "loss": 0.0494, "step": 4274 }, { "epoch": 0.5497449526340606, "grad_norm": 0.17578125, "learning_rate": 8.254189963493384e-05, "loss": 0.0526, "step": 4275 }, { "epoch": 0.5498735479446183, "grad_norm": 0.1708984375, "learning_rate": 8.25342257317866e-05, "loss": 0.0423, "step": 4276 }, { "epoch": 0.5500021432551759, "grad_norm": 0.1875, "learning_rate": 8.252655049932892e-05, "loss": 0.0455, "step": 4277 }, { "epoch": 0.5501307385657337, "grad_norm": 0.171875, "learning_rate": 8.25188739378744e-05, "loss": 0.0425, "step": 4278 }, { "epoch": 0.5502593338762913, "grad_norm": 0.1708984375, "learning_rate": 8.251119604773674e-05, "loss": 0.0419, "step": 4279 }, { "epoch": 0.550387929186849, "grad_norm": 0.1728515625, "learning_rate": 8.250351682922963e-05, "loss": 0.0463, "step": 4280 }, { "epoch": 0.5505165244974066, "grad_norm": 0.1865234375, "learning_rate": 8.249583628266679e-05, "loss": 0.0468, "step": 4281 }, { "epoch": 0.5506451198079644, "grad_norm": 0.1943359375, "learning_rate": 8.24881544083621e-05, "loss": 0.0499, "step": 4282 }, { "epoch": 0.550773715118522, "grad_norm": 0.166015625, "learning_rate": 8.24804712066294e-05, "loss": 0.042, "step": 4283 }, { "epoch": 0.5509023104290797, "grad_norm": 0.1669921875, "learning_rate": 8.247278667778264e-05, "loss": 0.0409, "step": 4284 }, { "epoch": 0.5510309057396373, "grad_norm": 0.16796875, "learning_rate": 8.246510082213576e-05, "loss": 0.0465, "step": 4285 }, { "epoch": 0.5511595010501951, "grad_norm": 0.2021484375, "learning_rate": 8.245741364000284e-05, "loss": 0.0647, "step": 4286 }, { "epoch": 0.5512880963607527, "grad_norm": 0.1953125, "learning_rate": 8.244972513169793e-05, "loss": 0.0537, "step": 4287 }, { "epoch": 0.5514166916713104, "grad_norm": 0.158203125, "learning_rate": 8.244203529753521e-05, "loss": 0.0362, "step": 4288 }, { "epoch": 0.5515452869818681, "grad_norm": 0.169921875, "learning_rate": 8.243434413782885e-05, "loss": 0.0421, "step": 4289 }, { "epoch": 0.5516738822924258, "grad_norm": 0.173828125, "learning_rate": 8.242665165289312e-05, "loss": 0.0454, "step": 4290 }, { "epoch": 0.5518024776029834, "grad_norm": 0.19921875, "learning_rate": 8.24189578430423e-05, "loss": 0.047, "step": 4291 }, { "epoch": 0.551931072913541, "grad_norm": 0.1904296875, "learning_rate": 8.241126270859077e-05, "loss": 0.0528, "step": 4292 }, { "epoch": 0.5520596682240988, "grad_norm": 0.177734375, "learning_rate": 8.240356624985295e-05, "loss": 0.0502, "step": 4293 }, { "epoch": 0.5521882635346564, "grad_norm": 0.169921875, "learning_rate": 8.239586846714329e-05, "loss": 0.0388, "step": 4294 }, { "epoch": 0.5523168588452141, "grad_norm": 0.1845703125, "learning_rate": 8.238816936077634e-05, "loss": 0.0479, "step": 4295 }, { "epoch": 0.5524454541557717, "grad_norm": 0.203125, "learning_rate": 8.238046893106665e-05, "loss": 0.0612, "step": 4296 }, { "epoch": 0.5525740494663295, "grad_norm": 0.1845703125, "learning_rate": 8.237276717832884e-05, "loss": 0.0451, "step": 4297 }, { "epoch": 0.5527026447768871, "grad_norm": 0.1796875, "learning_rate": 8.236506410287766e-05, "loss": 0.0495, "step": 4298 }, { "epoch": 0.5528312400874448, "grad_norm": 0.1611328125, "learning_rate": 8.235735970502777e-05, "loss": 0.0443, "step": 4299 }, { "epoch": 0.5529598353980025, "grad_norm": 0.1591796875, "learning_rate": 8.2349653985094e-05, "loss": 0.0401, "step": 4300 }, { "epoch": 0.5530884307085602, "grad_norm": 0.220703125, "learning_rate": 8.234194694339119e-05, "loss": 0.0482, "step": 4301 }, { "epoch": 0.5532170260191178, "grad_norm": 0.15234375, "learning_rate": 8.233423858023424e-05, "loss": 0.0379, "step": 4302 }, { "epoch": 0.5533456213296755, "grad_norm": 0.19921875, "learning_rate": 8.232652889593811e-05, "loss": 0.0537, "step": 4303 }, { "epoch": 0.5534742166402332, "grad_norm": 0.197265625, "learning_rate": 8.231881789081781e-05, "loss": 0.0494, "step": 4304 }, { "epoch": 0.5536028119507909, "grad_norm": 0.1796875, "learning_rate": 8.231110556518839e-05, "loss": 0.0453, "step": 4305 }, { "epoch": 0.5537314072613485, "grad_norm": 0.2333984375, "learning_rate": 8.230339191936498e-05, "loss": 0.0497, "step": 4306 }, { "epoch": 0.5538600025719063, "grad_norm": 0.17578125, "learning_rate": 8.229567695366276e-05, "loss": 0.0423, "step": 4307 }, { "epoch": 0.5539885978824639, "grad_norm": 0.1875, "learning_rate": 8.228796066839693e-05, "loss": 0.039, "step": 4308 }, { "epoch": 0.5541171931930216, "grad_norm": 0.2021484375, "learning_rate": 8.228024306388277e-05, "loss": 0.0537, "step": 4309 }, { "epoch": 0.5542457885035792, "grad_norm": 0.1494140625, "learning_rate": 8.227252414043563e-05, "loss": 0.0282, "step": 4310 }, { "epoch": 0.554374383814137, "grad_norm": 0.2216796875, "learning_rate": 8.22648038983709e-05, "loss": 0.0544, "step": 4311 }, { "epoch": 0.5545029791246946, "grad_norm": 0.1884765625, "learning_rate": 8.225708233800399e-05, "loss": 0.0464, "step": 4312 }, { "epoch": 0.5546315744352522, "grad_norm": 0.189453125, "learning_rate": 8.22493594596504e-05, "loss": 0.0488, "step": 4313 }, { "epoch": 0.5547601697458099, "grad_norm": 0.1865234375, "learning_rate": 8.224163526362572e-05, "loss": 0.0523, "step": 4314 }, { "epoch": 0.5548887650563676, "grad_norm": 0.177734375, "learning_rate": 8.22339097502455e-05, "loss": 0.0431, "step": 4315 }, { "epoch": 0.5550173603669253, "grad_norm": 0.1689453125, "learning_rate": 8.222618291982543e-05, "loss": 0.0455, "step": 4316 }, { "epoch": 0.5551459556774829, "grad_norm": 0.2109375, "learning_rate": 8.221845477268119e-05, "loss": 0.0543, "step": 4317 }, { "epoch": 0.5552745509880407, "grad_norm": 0.2060546875, "learning_rate": 8.221072530912857e-05, "loss": 0.0482, "step": 4318 }, { "epoch": 0.5554031462985983, "grad_norm": 0.169921875, "learning_rate": 8.220299452948335e-05, "loss": 0.0462, "step": 4319 }, { "epoch": 0.555531741609156, "grad_norm": 0.17578125, "learning_rate": 8.219526243406146e-05, "loss": 0.0483, "step": 4320 }, { "epoch": 0.5556603369197136, "grad_norm": 0.1796875, "learning_rate": 8.218752902317877e-05, "loss": 0.0487, "step": 4321 }, { "epoch": 0.5557889322302714, "grad_norm": 0.19921875, "learning_rate": 8.217979429715129e-05, "loss": 0.0482, "step": 4322 }, { "epoch": 0.555917527540829, "grad_norm": 0.1943359375, "learning_rate": 8.217205825629504e-05, "loss": 0.0439, "step": 4323 }, { "epoch": 0.5560461228513867, "grad_norm": 0.17578125, "learning_rate": 8.216432090092609e-05, "loss": 0.0407, "step": 4324 }, { "epoch": 0.5561747181619444, "grad_norm": 0.193359375, "learning_rate": 8.215658223136061e-05, "loss": 0.054, "step": 4325 }, { "epoch": 0.5563033134725021, "grad_norm": 0.162109375, "learning_rate": 8.214884224791478e-05, "loss": 0.0344, "step": 4326 }, { "epoch": 0.5564319087830597, "grad_norm": 0.18359375, "learning_rate": 8.214110095090483e-05, "loss": 0.0466, "step": 4327 }, { "epoch": 0.5565605040936173, "grad_norm": 0.1943359375, "learning_rate": 8.213335834064708e-05, "loss": 0.0488, "step": 4328 }, { "epoch": 0.5566890994041751, "grad_norm": 0.1728515625, "learning_rate": 8.21256144174579e-05, "loss": 0.0389, "step": 4329 }, { "epoch": 0.5568176947147327, "grad_norm": 0.173828125, "learning_rate": 8.211786918165365e-05, "loss": 0.0443, "step": 4330 }, { "epoch": 0.5569462900252904, "grad_norm": 0.1796875, "learning_rate": 8.211012263355083e-05, "loss": 0.0344, "step": 4331 }, { "epoch": 0.557074885335848, "grad_norm": 0.1923828125, "learning_rate": 8.210237477346594e-05, "loss": 0.0386, "step": 4332 }, { "epoch": 0.5572034806464058, "grad_norm": 0.1689453125, "learning_rate": 8.209462560171555e-05, "loss": 0.0441, "step": 4333 }, { "epoch": 0.5573320759569634, "grad_norm": 0.1865234375, "learning_rate": 8.208687511861628e-05, "loss": 0.0433, "step": 4334 }, { "epoch": 0.5574606712675211, "grad_norm": 0.23828125, "learning_rate": 8.207912332448483e-05, "loss": 0.0644, "step": 4335 }, { "epoch": 0.5575892665780788, "grad_norm": 0.18359375, "learning_rate": 8.207137021963789e-05, "loss": 0.044, "step": 4336 }, { "epoch": 0.5577178618886365, "grad_norm": 0.1708984375, "learning_rate": 8.206361580439226e-05, "loss": 0.0387, "step": 4337 }, { "epoch": 0.5578464571991941, "grad_norm": 0.181640625, "learning_rate": 8.205586007906478e-05, "loss": 0.0448, "step": 4338 }, { "epoch": 0.5579750525097518, "grad_norm": 0.18359375, "learning_rate": 8.204810304397235e-05, "loss": 0.0437, "step": 4339 }, { "epoch": 0.5581036478203095, "grad_norm": 0.166015625, "learning_rate": 8.204034469943189e-05, "loss": 0.04, "step": 4340 }, { "epoch": 0.5582322431308672, "grad_norm": 0.1884765625, "learning_rate": 8.203258504576042e-05, "loss": 0.0448, "step": 4341 }, { "epoch": 0.5583608384414248, "grad_norm": 0.1923828125, "learning_rate": 8.202482408327497e-05, "loss": 0.0463, "step": 4342 }, { "epoch": 0.5584894337519825, "grad_norm": 0.189453125, "learning_rate": 8.201706181229264e-05, "loss": 0.052, "step": 4343 }, { "epoch": 0.5586180290625402, "grad_norm": 0.1875, "learning_rate": 8.200929823313061e-05, "loss": 0.0465, "step": 4344 }, { "epoch": 0.5587466243730979, "grad_norm": 0.1806640625, "learning_rate": 8.200153334610608e-05, "loss": 0.0531, "step": 4345 }, { "epoch": 0.5588752196836555, "grad_norm": 0.185546875, "learning_rate": 8.19937671515363e-05, "loss": 0.0477, "step": 4346 }, { "epoch": 0.5590038149942133, "grad_norm": 0.1591796875, "learning_rate": 8.198599964973863e-05, "loss": 0.0478, "step": 4347 }, { "epoch": 0.5591324103047709, "grad_norm": 0.1669921875, "learning_rate": 8.197823084103041e-05, "loss": 0.0397, "step": 4348 }, { "epoch": 0.5592610056153285, "grad_norm": 0.171875, "learning_rate": 8.197046072572904e-05, "loss": 0.0503, "step": 4349 }, { "epoch": 0.5593896009258862, "grad_norm": 0.193359375, "learning_rate": 8.196268930415205e-05, "loss": 0.0534, "step": 4350 }, { "epoch": 0.5595181962364439, "grad_norm": 0.171875, "learning_rate": 8.195491657661694e-05, "loss": 0.0436, "step": 4351 }, { "epoch": 0.5596467915470016, "grad_norm": 0.1787109375, "learning_rate": 8.194714254344129e-05, "loss": 0.0476, "step": 4352 }, { "epoch": 0.5597753868575592, "grad_norm": 0.1865234375, "learning_rate": 8.193936720494275e-05, "loss": 0.0509, "step": 4353 }, { "epoch": 0.559903982168117, "grad_norm": 0.1845703125, "learning_rate": 8.193159056143902e-05, "loss": 0.0462, "step": 4354 }, { "epoch": 0.5600325774786746, "grad_norm": 0.171875, "learning_rate": 8.192381261324785e-05, "loss": 0.0406, "step": 4355 }, { "epoch": 0.5601611727892323, "grad_norm": 0.1708984375, "learning_rate": 8.191603336068698e-05, "loss": 0.0439, "step": 4356 }, { "epoch": 0.5602897680997899, "grad_norm": 0.162109375, "learning_rate": 8.190825280407433e-05, "loss": 0.0407, "step": 4357 }, { "epoch": 0.5604183634103477, "grad_norm": 0.1806640625, "learning_rate": 8.190047094372778e-05, "loss": 0.0422, "step": 4358 }, { "epoch": 0.5605469587209053, "grad_norm": 0.177734375, "learning_rate": 8.18926877799653e-05, "loss": 0.0443, "step": 4359 }, { "epoch": 0.560675554031463, "grad_norm": 0.1806640625, "learning_rate": 8.188490331310486e-05, "loss": 0.0439, "step": 4360 }, { "epoch": 0.5608041493420206, "grad_norm": 0.173828125, "learning_rate": 8.187711754346456e-05, "loss": 0.0425, "step": 4361 }, { "epoch": 0.5609327446525784, "grad_norm": 0.1611328125, "learning_rate": 8.186933047136251e-05, "loss": 0.0398, "step": 4362 }, { "epoch": 0.561061339963136, "grad_norm": 0.1845703125, "learning_rate": 8.186154209711688e-05, "loss": 0.0506, "step": 4363 }, { "epoch": 0.5611899352736937, "grad_norm": 0.201171875, "learning_rate": 8.18537524210459e-05, "loss": 0.0549, "step": 4364 }, { "epoch": 0.5613185305842514, "grad_norm": 0.1767578125, "learning_rate": 8.184596144346782e-05, "loss": 0.0473, "step": 4365 }, { "epoch": 0.561447125894809, "grad_norm": 0.1806640625, "learning_rate": 8.183816916470099e-05, "loss": 0.0434, "step": 4366 }, { "epoch": 0.5615757212053667, "grad_norm": 0.1796875, "learning_rate": 8.183037558506382e-05, "loss": 0.0465, "step": 4367 }, { "epoch": 0.5617043165159243, "grad_norm": 0.1953125, "learning_rate": 8.18225807048747e-05, "loss": 0.0493, "step": 4368 }, { "epoch": 0.5618329118264821, "grad_norm": 0.18359375, "learning_rate": 8.181478452445214e-05, "loss": 0.0409, "step": 4369 }, { "epoch": 0.5619615071370397, "grad_norm": 0.166015625, "learning_rate": 8.180698704411469e-05, "loss": 0.0373, "step": 4370 }, { "epoch": 0.5620901024475974, "grad_norm": 0.1708984375, "learning_rate": 8.179918826418096e-05, "loss": 0.0399, "step": 4371 }, { "epoch": 0.5622186977581551, "grad_norm": 0.197265625, "learning_rate": 8.179138818496954e-05, "loss": 0.0467, "step": 4372 }, { "epoch": 0.5623472930687128, "grad_norm": 0.1845703125, "learning_rate": 8.17835868067992e-05, "loss": 0.0418, "step": 4373 }, { "epoch": 0.5624758883792704, "grad_norm": 0.1806640625, "learning_rate": 8.177578412998864e-05, "loss": 0.0467, "step": 4374 }, { "epoch": 0.5626044836898281, "grad_norm": 0.1826171875, "learning_rate": 8.176798015485671e-05, "loss": 0.0427, "step": 4375 }, { "epoch": 0.5627330790003858, "grad_norm": 0.1787109375, "learning_rate": 8.176017488172224e-05, "loss": 0.042, "step": 4376 }, { "epoch": 0.5628616743109435, "grad_norm": 0.17578125, "learning_rate": 8.175236831090417e-05, "loss": 0.0476, "step": 4377 }, { "epoch": 0.5629902696215011, "grad_norm": 0.1875, "learning_rate": 8.174456044272146e-05, "loss": 0.0578, "step": 4378 }, { "epoch": 0.5631188649320588, "grad_norm": 0.1904296875, "learning_rate": 8.173675127749312e-05, "loss": 0.0562, "step": 4379 }, { "epoch": 0.5632474602426165, "grad_norm": 0.1572265625, "learning_rate": 8.172894081553823e-05, "loss": 0.0324, "step": 4380 }, { "epoch": 0.5633760555531742, "grad_norm": 0.1826171875, "learning_rate": 8.17211290571759e-05, "loss": 0.0422, "step": 4381 }, { "epoch": 0.5635046508637318, "grad_norm": 0.162109375, "learning_rate": 8.171331600272537e-05, "loss": 0.0412, "step": 4382 }, { "epoch": 0.5636332461742896, "grad_norm": 0.1806640625, "learning_rate": 8.17055016525058e-05, "loss": 0.0446, "step": 4383 }, { "epoch": 0.5637618414848472, "grad_norm": 0.1865234375, "learning_rate": 8.169768600683649e-05, "loss": 0.0409, "step": 4384 }, { "epoch": 0.5638904367954048, "grad_norm": 0.1787109375, "learning_rate": 8.168986906603681e-05, "loss": 0.0527, "step": 4385 }, { "epoch": 0.5640190321059625, "grad_norm": 0.1767578125, "learning_rate": 8.168205083042613e-05, "loss": 0.0458, "step": 4386 }, { "epoch": 0.5641476274165202, "grad_norm": 0.171875, "learning_rate": 8.167423130032388e-05, "loss": 0.0406, "step": 4387 }, { "epoch": 0.5642762227270779, "grad_norm": 0.1640625, "learning_rate": 8.16664104760496e-05, "loss": 0.0388, "step": 4388 }, { "epoch": 0.5644048180376355, "grad_norm": 0.1767578125, "learning_rate": 8.165858835792279e-05, "loss": 0.041, "step": 4389 }, { "epoch": 0.5645334133481932, "grad_norm": 0.1943359375, "learning_rate": 8.165076494626309e-05, "loss": 0.0414, "step": 4390 }, { "epoch": 0.5646620086587509, "grad_norm": 0.1923828125, "learning_rate": 8.164294024139015e-05, "loss": 0.0467, "step": 4391 }, { "epoch": 0.5647906039693086, "grad_norm": 0.171875, "learning_rate": 8.163511424362365e-05, "loss": 0.0378, "step": 4392 }, { "epoch": 0.5649191992798662, "grad_norm": 0.185546875, "learning_rate": 8.162728695328337e-05, "loss": 0.0457, "step": 4393 }, { "epoch": 0.565047794590424, "grad_norm": 0.1865234375, "learning_rate": 8.161945837068913e-05, "loss": 0.0496, "step": 4394 }, { "epoch": 0.5651763899009816, "grad_norm": 0.1767578125, "learning_rate": 8.16116284961608e-05, "loss": 0.0459, "step": 4395 }, { "epoch": 0.5653049852115393, "grad_norm": 0.171875, "learning_rate": 8.160379733001828e-05, "loss": 0.0392, "step": 4396 }, { "epoch": 0.5654335805220969, "grad_norm": 0.169921875, "learning_rate": 8.159596487258155e-05, "loss": 0.0417, "step": 4397 }, { "epoch": 0.5655621758326547, "grad_norm": 0.14453125, "learning_rate": 8.158813112417065e-05, "loss": 0.0325, "step": 4398 }, { "epoch": 0.5656907711432123, "grad_norm": 0.17578125, "learning_rate": 8.158029608510563e-05, "loss": 0.0405, "step": 4399 }, { "epoch": 0.56581936645377, "grad_norm": 0.16015625, "learning_rate": 8.157245975570665e-05, "loss": 0.0393, "step": 4400 }, { "epoch": 0.5659479617643277, "grad_norm": 0.16796875, "learning_rate": 8.156462213629388e-05, "loss": 0.039, "step": 4401 }, { "epoch": 0.5660765570748854, "grad_norm": 0.16015625, "learning_rate": 8.155678322718754e-05, "loss": 0.0357, "step": 4402 }, { "epoch": 0.566205152385443, "grad_norm": 0.16015625, "learning_rate": 8.154894302870795e-05, "loss": 0.0365, "step": 4403 }, { "epoch": 0.5663337476960006, "grad_norm": 0.2041015625, "learning_rate": 8.154110154117542e-05, "loss": 0.0495, "step": 4404 }, { "epoch": 0.5664623430065584, "grad_norm": 0.1865234375, "learning_rate": 8.153325876491038e-05, "loss": 0.0518, "step": 4405 }, { "epoch": 0.566590938317116, "grad_norm": 0.1806640625, "learning_rate": 8.152541470023325e-05, "loss": 0.044, "step": 4406 }, { "epoch": 0.5667195336276737, "grad_norm": 0.19921875, "learning_rate": 8.151756934746453e-05, "loss": 0.0529, "step": 4407 }, { "epoch": 0.5668481289382313, "grad_norm": 0.15625, "learning_rate": 8.150972270692478e-05, "loss": 0.0345, "step": 4408 }, { "epoch": 0.5669767242487891, "grad_norm": 0.1767578125, "learning_rate": 8.15018747789346e-05, "loss": 0.0419, "step": 4409 }, { "epoch": 0.5671053195593467, "grad_norm": 0.189453125, "learning_rate": 8.149402556381464e-05, "loss": 0.0494, "step": 4410 }, { "epoch": 0.5672339148699044, "grad_norm": 0.2041015625, "learning_rate": 8.148617506188563e-05, "loss": 0.0502, "step": 4411 }, { "epoch": 0.5673625101804621, "grad_norm": 0.1728515625, "learning_rate": 8.147832327346832e-05, "loss": 0.0434, "step": 4412 }, { "epoch": 0.5674911054910198, "grad_norm": 0.19921875, "learning_rate": 8.147047019888352e-05, "loss": 0.0574, "step": 4413 }, { "epoch": 0.5676197008015774, "grad_norm": 0.1826171875, "learning_rate": 8.146261583845212e-05, "loss": 0.0435, "step": 4414 }, { "epoch": 0.5677482961121351, "grad_norm": 0.169921875, "learning_rate": 8.145476019249501e-05, "loss": 0.0453, "step": 4415 }, { "epoch": 0.5678768914226928, "grad_norm": 0.1875, "learning_rate": 8.144690326133318e-05, "loss": 0.0534, "step": 4416 }, { "epoch": 0.5680054867332505, "grad_norm": 0.1494140625, "learning_rate": 8.143904504528765e-05, "loss": 0.0373, "step": 4417 }, { "epoch": 0.5681340820438081, "grad_norm": 0.162109375, "learning_rate": 8.14311855446795e-05, "loss": 0.0425, "step": 4418 }, { "epoch": 0.5682626773543659, "grad_norm": 0.1708984375, "learning_rate": 8.142332475982985e-05, "loss": 0.0446, "step": 4419 }, { "epoch": 0.5683912726649235, "grad_norm": 0.1708984375, "learning_rate": 8.141546269105991e-05, "loss": 0.0414, "step": 4420 }, { "epoch": 0.5685198679754812, "grad_norm": 0.1591796875, "learning_rate": 8.140759933869089e-05, "loss": 0.038, "step": 4421 }, { "epoch": 0.5686484632860388, "grad_norm": 0.171875, "learning_rate": 8.139973470304407e-05, "loss": 0.0392, "step": 4422 }, { "epoch": 0.5687770585965966, "grad_norm": 0.17578125, "learning_rate": 8.139186878444081e-05, "loss": 0.0476, "step": 4423 }, { "epoch": 0.5689056539071542, "grad_norm": 0.185546875, "learning_rate": 8.13840015832025e-05, "loss": 0.0557, "step": 4424 }, { "epoch": 0.5690342492177118, "grad_norm": 0.19921875, "learning_rate": 8.137613309965058e-05, "loss": 0.0456, "step": 4425 }, { "epoch": 0.5691628445282695, "grad_norm": 0.1689453125, "learning_rate": 8.136826333410654e-05, "loss": 0.0414, "step": 4426 }, { "epoch": 0.5692914398388272, "grad_norm": 0.181640625, "learning_rate": 8.136039228689194e-05, "loss": 0.0447, "step": 4427 }, { "epoch": 0.5694200351493849, "grad_norm": 0.169921875, "learning_rate": 8.135251995832838e-05, "loss": 0.0376, "step": 4428 }, { "epoch": 0.5695486304599425, "grad_norm": 0.169921875, "learning_rate": 8.134464634873751e-05, "loss": 0.044, "step": 4429 }, { "epoch": 0.5696772257705003, "grad_norm": 0.1904296875, "learning_rate": 8.133677145844106e-05, "loss": 0.0473, "step": 4430 }, { "epoch": 0.5698058210810579, "grad_norm": 0.1904296875, "learning_rate": 8.132889528776073e-05, "loss": 0.0546, "step": 4431 }, { "epoch": 0.5699344163916156, "grad_norm": 0.1875, "learning_rate": 8.13210178370184e-05, "loss": 0.0401, "step": 4432 }, { "epoch": 0.5700630117021732, "grad_norm": 0.181640625, "learning_rate": 8.131313910653589e-05, "loss": 0.0474, "step": 4433 }, { "epoch": 0.570191607012731, "grad_norm": 0.1953125, "learning_rate": 8.130525909663512e-05, "loss": 0.053, "step": 4434 }, { "epoch": 0.5703202023232886, "grad_norm": 0.1708984375, "learning_rate": 8.129737780763807e-05, "loss": 0.0517, "step": 4435 }, { "epoch": 0.5704487976338463, "grad_norm": 0.1689453125, "learning_rate": 8.128949523986676e-05, "loss": 0.0465, "step": 4436 }, { "epoch": 0.570577392944404, "grad_norm": 0.166015625, "learning_rate": 8.128161139364326e-05, "loss": 0.0439, "step": 4437 }, { "epoch": 0.5707059882549617, "grad_norm": 0.16796875, "learning_rate": 8.127372626928968e-05, "loss": 0.0424, "step": 4438 }, { "epoch": 0.5708345835655193, "grad_norm": 0.1865234375, "learning_rate": 8.126583986712824e-05, "loss": 0.0496, "step": 4439 }, { "epoch": 0.570963178876077, "grad_norm": 0.1748046875, "learning_rate": 8.125795218748111e-05, "loss": 0.048, "step": 4440 }, { "epoch": 0.5710917741866347, "grad_norm": 0.169921875, "learning_rate": 8.125006323067061e-05, "loss": 0.043, "step": 4441 }, { "epoch": 0.5712203694971923, "grad_norm": 0.1904296875, "learning_rate": 8.124217299701909e-05, "loss": 0.0499, "step": 4442 }, { "epoch": 0.57134896480775, "grad_norm": 0.1845703125, "learning_rate": 8.123428148684889e-05, "loss": 0.0537, "step": 4443 }, { "epoch": 0.5714775601183076, "grad_norm": 0.169921875, "learning_rate": 8.122638870048246e-05, "loss": 0.0449, "step": 4444 }, { "epoch": 0.5716061554288654, "grad_norm": 0.1748046875, "learning_rate": 8.12184946382423e-05, "loss": 0.0439, "step": 4445 }, { "epoch": 0.571734750739423, "grad_norm": 0.1787109375, "learning_rate": 8.121059930045096e-05, "loss": 0.0437, "step": 4446 }, { "epoch": 0.5718633460499807, "grad_norm": 0.169921875, "learning_rate": 8.120270268743101e-05, "loss": 0.04, "step": 4447 }, { "epoch": 0.5719919413605384, "grad_norm": 0.1875, "learning_rate": 8.119480479950514e-05, "loss": 0.0601, "step": 4448 }, { "epoch": 0.5721205366710961, "grad_norm": 0.2041015625, "learning_rate": 8.1186905636996e-05, "loss": 0.0469, "step": 4449 }, { "epoch": 0.5722491319816537, "grad_norm": 0.173828125, "learning_rate": 8.117900520022635e-05, "loss": 0.0458, "step": 4450 }, { "epoch": 0.5723777272922114, "grad_norm": 0.177734375, "learning_rate": 8.117110348951902e-05, "loss": 0.0455, "step": 4451 }, { "epoch": 0.5725063226027691, "grad_norm": 0.17578125, "learning_rate": 8.116320050519683e-05, "loss": 0.0424, "step": 4452 }, { "epoch": 0.5726349179133268, "grad_norm": 0.15625, "learning_rate": 8.115529624758272e-05, "loss": 0.0342, "step": 4453 }, { "epoch": 0.5727635132238844, "grad_norm": 0.1796875, "learning_rate": 8.114739071699964e-05, "loss": 0.0465, "step": 4454 }, { "epoch": 0.5728921085344421, "grad_norm": 0.1953125, "learning_rate": 8.113948391377057e-05, "loss": 0.0542, "step": 4455 }, { "epoch": 0.5730207038449998, "grad_norm": 0.201171875, "learning_rate": 8.113157583821861e-05, "loss": 0.0468, "step": 4456 }, { "epoch": 0.5731492991555575, "grad_norm": 0.1943359375, "learning_rate": 8.112366649066684e-05, "loss": 0.0454, "step": 4457 }, { "epoch": 0.5732778944661151, "grad_norm": 0.1904296875, "learning_rate": 8.111575587143848e-05, "loss": 0.0424, "step": 4458 }, { "epoch": 0.5734064897766729, "grad_norm": 0.169921875, "learning_rate": 8.11078439808567e-05, "loss": 0.0472, "step": 4459 }, { "epoch": 0.5735350850872305, "grad_norm": 0.193359375, "learning_rate": 8.109993081924478e-05, "loss": 0.0528, "step": 4460 }, { "epoch": 0.5736636803977881, "grad_norm": 0.162109375, "learning_rate": 8.109201638692608e-05, "loss": 0.0404, "step": 4461 }, { "epoch": 0.5737922757083458, "grad_norm": 0.1962890625, "learning_rate": 8.108410068422391e-05, "loss": 0.0484, "step": 4462 }, { "epoch": 0.5739208710189035, "grad_norm": 0.1943359375, "learning_rate": 8.107618371146174e-05, "loss": 0.0529, "step": 4463 }, { "epoch": 0.5740494663294612, "grad_norm": 0.18359375, "learning_rate": 8.106826546896305e-05, "loss": 0.0485, "step": 4464 }, { "epoch": 0.5741780616400188, "grad_norm": 0.18359375, "learning_rate": 8.106034595705134e-05, "loss": 0.0403, "step": 4465 }, { "epoch": 0.5743066569505766, "grad_norm": 0.1923828125, "learning_rate": 8.105242517605023e-05, "loss": 0.0557, "step": 4466 }, { "epoch": 0.5744352522611342, "grad_norm": 0.19921875, "learning_rate": 8.104450312628332e-05, "loss": 0.0567, "step": 4467 }, { "epoch": 0.5745638475716919, "grad_norm": 0.18359375, "learning_rate": 8.10365798080743e-05, "loss": 0.0472, "step": 4468 }, { "epoch": 0.5746924428822495, "grad_norm": 0.1611328125, "learning_rate": 8.102865522174695e-05, "loss": 0.039, "step": 4469 }, { "epoch": 0.5748210381928073, "grad_norm": 0.15234375, "learning_rate": 8.1020729367625e-05, "loss": 0.0296, "step": 4470 }, { "epoch": 0.5749496335033649, "grad_norm": 0.1943359375, "learning_rate": 8.101280224603234e-05, "loss": 0.0545, "step": 4471 }, { "epoch": 0.5750782288139226, "grad_norm": 0.181640625, "learning_rate": 8.100487385729281e-05, "loss": 0.0437, "step": 4472 }, { "epoch": 0.5752068241244802, "grad_norm": 0.1689453125, "learning_rate": 8.09969442017304e-05, "loss": 0.0411, "step": 4473 }, { "epoch": 0.575335419435038, "grad_norm": 0.1669921875, "learning_rate": 8.09890132796691e-05, "loss": 0.0422, "step": 4474 }, { "epoch": 0.5754640147455956, "grad_norm": 0.1611328125, "learning_rate": 8.098108109143296e-05, "loss": 0.0364, "step": 4475 }, { "epoch": 0.5755926100561533, "grad_norm": 0.1611328125, "learning_rate": 8.097314763734605e-05, "loss": 0.046, "step": 4476 }, { "epoch": 0.575721205366711, "grad_norm": 0.1884765625, "learning_rate": 8.096521291773255e-05, "loss": 0.0469, "step": 4477 }, { "epoch": 0.5758498006772687, "grad_norm": 0.1650390625, "learning_rate": 8.095727693291665e-05, "loss": 0.0432, "step": 4478 }, { "epoch": 0.5759783959878263, "grad_norm": 0.1669921875, "learning_rate": 8.094933968322263e-05, "loss": 0.0416, "step": 4479 }, { "epoch": 0.5761069912983839, "grad_norm": 0.162109375, "learning_rate": 8.094140116897474e-05, "loss": 0.0411, "step": 4480 }, { "epoch": 0.5762355866089417, "grad_norm": 0.2392578125, "learning_rate": 8.093346139049742e-05, "loss": 0.0413, "step": 4481 }, { "epoch": 0.5763641819194993, "grad_norm": 0.1826171875, "learning_rate": 8.092552034811501e-05, "loss": 0.049, "step": 4482 }, { "epoch": 0.576492777230057, "grad_norm": 0.1748046875, "learning_rate": 8.091757804215198e-05, "loss": 0.0445, "step": 4483 }, { "epoch": 0.5766213725406147, "grad_norm": 0.17578125, "learning_rate": 8.09096344729329e-05, "loss": 0.04, "step": 4484 }, { "epoch": 0.5767499678511724, "grad_norm": 0.1689453125, "learning_rate": 8.090168964078227e-05, "loss": 0.0372, "step": 4485 }, { "epoch": 0.57687856316173, "grad_norm": 0.1708984375, "learning_rate": 8.089374354602473e-05, "loss": 0.0416, "step": 4486 }, { "epoch": 0.5770071584722877, "grad_norm": 0.1591796875, "learning_rate": 8.088579618898496e-05, "loss": 0.0374, "step": 4487 }, { "epoch": 0.5771357537828454, "grad_norm": 0.171875, "learning_rate": 8.087784756998767e-05, "loss": 0.0492, "step": 4488 }, { "epoch": 0.5772643490934031, "grad_norm": 0.17578125, "learning_rate": 8.086989768935764e-05, "loss": 0.0478, "step": 4489 }, { "epoch": 0.5773929444039607, "grad_norm": 0.16015625, "learning_rate": 8.086194654741968e-05, "loss": 0.0412, "step": 4490 }, { "epoch": 0.5775215397145184, "grad_norm": 0.181640625, "learning_rate": 8.085399414449866e-05, "loss": 0.0423, "step": 4491 }, { "epoch": 0.5776501350250761, "grad_norm": 0.1953125, "learning_rate": 8.084604048091951e-05, "loss": 0.0495, "step": 4492 }, { "epoch": 0.5777787303356338, "grad_norm": 0.1630859375, "learning_rate": 8.083808555700721e-05, "loss": 0.0363, "step": 4493 }, { "epoch": 0.5779073256461914, "grad_norm": 0.1787109375, "learning_rate": 8.08301293730868e-05, "loss": 0.0479, "step": 4494 }, { "epoch": 0.5780359209567492, "grad_norm": 0.197265625, "learning_rate": 8.082217192948337e-05, "loss": 0.0391, "step": 4495 }, { "epoch": 0.5781645162673068, "grad_norm": 0.1630859375, "learning_rate": 8.081421322652201e-05, "loss": 0.0364, "step": 4496 }, { "epoch": 0.5782931115778644, "grad_norm": 0.177734375, "learning_rate": 8.080625326452792e-05, "loss": 0.0421, "step": 4497 }, { "epoch": 0.5784217068884221, "grad_norm": 0.1728515625, "learning_rate": 8.079829204382635e-05, "loss": 0.043, "step": 4498 }, { "epoch": 0.5785503021989798, "grad_norm": 0.1650390625, "learning_rate": 8.079032956474258e-05, "loss": 0.0442, "step": 4499 }, { "epoch": 0.5786788975095375, "grad_norm": 0.1767578125, "learning_rate": 8.078236582760194e-05, "loss": 0.0508, "step": 4500 }, { "epoch": 0.5786788975095375, "eval_loss": 0.044236619025468826, "eval_runtime": 1046.2495, "eval_samples_per_second": 93.884, "eval_steps_per_second": 1.174, "step": 4500 }, { "epoch": 0.5788074928200951, "grad_norm": 0.181640625, "learning_rate": 8.077440083272982e-05, "loss": 0.0458, "step": 4501 }, { "epoch": 0.5789360881306528, "grad_norm": 0.1611328125, "learning_rate": 8.076643458045168e-05, "loss": 0.0351, "step": 4502 }, { "epoch": 0.5790646834412105, "grad_norm": 0.1943359375, "learning_rate": 8.075846707109299e-05, "loss": 0.0505, "step": 4503 }, { "epoch": 0.5791932787517682, "grad_norm": 0.1904296875, "learning_rate": 8.07504983049793e-05, "loss": 0.042, "step": 4504 }, { "epoch": 0.5793218740623258, "grad_norm": 0.166015625, "learning_rate": 8.074252828243618e-05, "loss": 0.0417, "step": 4505 }, { "epoch": 0.5794504693728836, "grad_norm": 0.197265625, "learning_rate": 8.073455700378933e-05, "loss": 0.0532, "step": 4506 }, { "epoch": 0.5795790646834412, "grad_norm": 0.1650390625, "learning_rate": 8.07265844693644e-05, "loss": 0.0464, "step": 4507 }, { "epoch": 0.5797076599939989, "grad_norm": 0.1982421875, "learning_rate": 8.071861067948716e-05, "loss": 0.0496, "step": 4508 }, { "epoch": 0.5798362553045565, "grad_norm": 0.166015625, "learning_rate": 8.07106356344834e-05, "loss": 0.0428, "step": 4509 }, { "epoch": 0.5799648506151143, "grad_norm": 0.1513671875, "learning_rate": 8.070265933467898e-05, "loss": 0.0448, "step": 4510 }, { "epoch": 0.5800934459256719, "grad_norm": 0.197265625, "learning_rate": 8.069468178039978e-05, "loss": 0.0399, "step": 4511 }, { "epoch": 0.5802220412362296, "grad_norm": 0.1865234375, "learning_rate": 8.068670297197178e-05, "loss": 0.0392, "step": 4512 }, { "epoch": 0.5803506365467873, "grad_norm": 0.1826171875, "learning_rate": 8.0678722909721e-05, "loss": 0.045, "step": 4513 }, { "epoch": 0.580479231857345, "grad_norm": 0.16796875, "learning_rate": 8.067074159397346e-05, "loss": 0.0383, "step": 4514 }, { "epoch": 0.5806078271679026, "grad_norm": 0.1923828125, "learning_rate": 8.066275902505527e-05, "loss": 0.0528, "step": 4515 }, { "epoch": 0.5807364224784602, "grad_norm": 0.154296875, "learning_rate": 8.065477520329261e-05, "loss": 0.0311, "step": 4516 }, { "epoch": 0.580865017789018, "grad_norm": 0.1943359375, "learning_rate": 8.064679012901166e-05, "loss": 0.0467, "step": 4517 }, { "epoch": 0.5809936130995756, "grad_norm": 0.193359375, "learning_rate": 8.063880380253871e-05, "loss": 0.0511, "step": 4518 }, { "epoch": 0.5811222084101333, "grad_norm": 0.1796875, "learning_rate": 8.063081622420005e-05, "loss": 0.0444, "step": 4519 }, { "epoch": 0.5812508037206909, "grad_norm": 0.1796875, "learning_rate": 8.062282739432207e-05, "loss": 0.0414, "step": 4520 }, { "epoch": 0.5813793990312487, "grad_norm": 0.19921875, "learning_rate": 8.061483731323116e-05, "loss": 0.0596, "step": 4521 }, { "epoch": 0.5815079943418063, "grad_norm": 0.1796875, "learning_rate": 8.06068459812538e-05, "loss": 0.0463, "step": 4522 }, { "epoch": 0.581636589652364, "grad_norm": 0.177734375, "learning_rate": 8.059885339871648e-05, "loss": 0.0425, "step": 4523 }, { "epoch": 0.5817651849629217, "grad_norm": 0.1865234375, "learning_rate": 8.059085956594579e-05, "loss": 0.0489, "step": 4524 }, { "epoch": 0.5818937802734794, "grad_norm": 0.1640625, "learning_rate": 8.058286448326837e-05, "loss": 0.0425, "step": 4525 }, { "epoch": 0.582022375584037, "grad_norm": 0.201171875, "learning_rate": 8.057486815101086e-05, "loss": 0.0567, "step": 4526 }, { "epoch": 0.5821509708945947, "grad_norm": 0.1650390625, "learning_rate": 8.056687056949997e-05, "loss": 0.0471, "step": 4527 }, { "epoch": 0.5822795662051524, "grad_norm": 0.1748046875, "learning_rate": 8.055887173906248e-05, "loss": 0.0455, "step": 4528 }, { "epoch": 0.5824081615157101, "grad_norm": 0.1669921875, "learning_rate": 8.055087166002524e-05, "loss": 0.0353, "step": 4529 }, { "epoch": 0.5825367568262677, "grad_norm": 0.14453125, "learning_rate": 8.054287033271509e-05, "loss": 0.0338, "step": 4530 }, { "epoch": 0.5826653521368255, "grad_norm": 0.162109375, "learning_rate": 8.053486775745897e-05, "loss": 0.0356, "step": 4531 }, { "epoch": 0.5827939474473831, "grad_norm": 0.1708984375, "learning_rate": 8.052686393458387e-05, "loss": 0.0441, "step": 4532 }, { "epoch": 0.5829225427579408, "grad_norm": 0.1591796875, "learning_rate": 8.051885886441679e-05, "loss": 0.0392, "step": 4533 }, { "epoch": 0.5830511380684984, "grad_norm": 0.1953125, "learning_rate": 8.051085254728483e-05, "loss": 0.0519, "step": 4534 }, { "epoch": 0.5831797333790562, "grad_norm": 0.158203125, "learning_rate": 8.05028449835151e-05, "loss": 0.0359, "step": 4535 }, { "epoch": 0.5833083286896138, "grad_norm": 0.181640625, "learning_rate": 8.04948361734348e-05, "loss": 0.0481, "step": 4536 }, { "epoch": 0.5834369240001714, "grad_norm": 0.1787109375, "learning_rate": 8.048682611737115e-05, "loss": 0.0417, "step": 4537 }, { "epoch": 0.5835655193107291, "grad_norm": 0.1748046875, "learning_rate": 8.047881481565143e-05, "loss": 0.0451, "step": 4538 }, { "epoch": 0.5836941146212868, "grad_norm": 0.1669921875, "learning_rate": 8.047080226860296e-05, "loss": 0.0385, "step": 4539 }, { "epoch": 0.5838227099318445, "grad_norm": 0.1943359375, "learning_rate": 8.046278847655314e-05, "loss": 0.0439, "step": 4540 }, { "epoch": 0.5839513052424021, "grad_norm": 0.1611328125, "learning_rate": 8.045477343982941e-05, "loss": 0.0374, "step": 4541 }, { "epoch": 0.5840799005529599, "grad_norm": 0.169921875, "learning_rate": 8.044675715875923e-05, "loss": 0.0403, "step": 4542 }, { "epoch": 0.5842084958635175, "grad_norm": 0.1708984375, "learning_rate": 8.043873963367015e-05, "loss": 0.0419, "step": 4543 }, { "epoch": 0.5843370911740752, "grad_norm": 0.208984375, "learning_rate": 8.043072086488977e-05, "loss": 0.0553, "step": 4544 }, { "epoch": 0.5844656864846328, "grad_norm": 0.2109375, "learning_rate": 8.042270085274571e-05, "loss": 0.0428, "step": 4545 }, { "epoch": 0.5845942817951906, "grad_norm": 0.171875, "learning_rate": 8.041467959756567e-05, "loss": 0.0414, "step": 4546 }, { "epoch": 0.5847228771057482, "grad_norm": 0.1884765625, "learning_rate": 8.040665709967738e-05, "loss": 0.049, "step": 4547 }, { "epoch": 0.5848514724163059, "grad_norm": 0.1865234375, "learning_rate": 8.039863335940863e-05, "loss": 0.0471, "step": 4548 }, { "epoch": 0.5849800677268635, "grad_norm": 0.1845703125, "learning_rate": 8.039060837708727e-05, "loss": 0.0518, "step": 4549 }, { "epoch": 0.5851086630374213, "grad_norm": 0.18359375, "learning_rate": 8.038258215304119e-05, "loss": 0.045, "step": 4550 }, { "epoch": 0.5852372583479789, "grad_norm": 0.1826171875, "learning_rate": 8.037455468759832e-05, "loss": 0.048, "step": 4551 }, { "epoch": 0.5853658536585366, "grad_norm": 0.177734375, "learning_rate": 8.036652598108666e-05, "loss": 0.0533, "step": 4552 }, { "epoch": 0.5854944489690943, "grad_norm": 0.1474609375, "learning_rate": 8.035849603383425e-05, "loss": 0.0282, "step": 4553 }, { "epoch": 0.585623044279652, "grad_norm": 0.1826171875, "learning_rate": 8.03504648461692e-05, "loss": 0.0424, "step": 4554 }, { "epoch": 0.5857516395902096, "grad_norm": 0.1640625, "learning_rate": 8.034243241841962e-05, "loss": 0.0445, "step": 4555 }, { "epoch": 0.5858802349007672, "grad_norm": 0.1923828125, "learning_rate": 8.033439875091375e-05, "loss": 0.0566, "step": 4556 }, { "epoch": 0.586008830211325, "grad_norm": 0.1875, "learning_rate": 8.03263638439798e-05, "loss": 0.0449, "step": 4557 }, { "epoch": 0.5861374255218826, "grad_norm": 0.173828125, "learning_rate": 8.031832769794609e-05, "loss": 0.0455, "step": 4558 }, { "epoch": 0.5862660208324403, "grad_norm": 0.18359375, "learning_rate": 8.031029031314095e-05, "loss": 0.0478, "step": 4559 }, { "epoch": 0.586394616142998, "grad_norm": 0.177734375, "learning_rate": 8.030225168989281e-05, "loss": 0.0438, "step": 4560 }, { "epoch": 0.5865232114535557, "grad_norm": 0.1845703125, "learning_rate": 8.029421182853007e-05, "loss": 0.0527, "step": 4561 }, { "epoch": 0.5866518067641133, "grad_norm": 0.1923828125, "learning_rate": 8.028617072938127e-05, "loss": 0.0553, "step": 4562 }, { "epoch": 0.586780402074671, "grad_norm": 0.1640625, "learning_rate": 8.027812839277495e-05, "loss": 0.0432, "step": 4563 }, { "epoch": 0.5869089973852287, "grad_norm": 0.1982421875, "learning_rate": 8.027008481903969e-05, "loss": 0.0459, "step": 4564 }, { "epoch": 0.5870375926957864, "grad_norm": 0.1669921875, "learning_rate": 8.026204000850417e-05, "loss": 0.0369, "step": 4565 }, { "epoch": 0.587166188006344, "grad_norm": 0.1611328125, "learning_rate": 8.025399396149706e-05, "loss": 0.0409, "step": 4566 }, { "epoch": 0.5872947833169017, "grad_norm": 0.1767578125, "learning_rate": 8.024594667834715e-05, "loss": 0.046, "step": 4567 }, { "epoch": 0.5874233786274594, "grad_norm": 0.17578125, "learning_rate": 8.023789815938321e-05, "loss": 0.042, "step": 4568 }, { "epoch": 0.5875519739380171, "grad_norm": 0.1669921875, "learning_rate": 8.022984840493411e-05, "loss": 0.0397, "step": 4569 }, { "epoch": 0.5876805692485747, "grad_norm": 0.2109375, "learning_rate": 8.022179741532874e-05, "loss": 0.0559, "step": 4570 }, { "epoch": 0.5878091645591325, "grad_norm": 0.16796875, "learning_rate": 8.021374519089608e-05, "loss": 0.0384, "step": 4571 }, { "epoch": 0.5879377598696901, "grad_norm": 0.173828125, "learning_rate": 8.020569173196513e-05, "loss": 0.0406, "step": 4572 }, { "epoch": 0.5880663551802477, "grad_norm": 0.185546875, "learning_rate": 8.01976370388649e-05, "loss": 0.0451, "step": 4573 }, { "epoch": 0.5881949504908054, "grad_norm": 0.162109375, "learning_rate": 8.018958111192455e-05, "loss": 0.0333, "step": 4574 }, { "epoch": 0.5883235458013631, "grad_norm": 0.185546875, "learning_rate": 8.01815239514732e-05, "loss": 0.0445, "step": 4575 }, { "epoch": 0.5884521411119208, "grad_norm": 0.1904296875, "learning_rate": 8.017346555784009e-05, "loss": 0.0494, "step": 4576 }, { "epoch": 0.5885807364224784, "grad_norm": 0.1923828125, "learning_rate": 8.016540593135444e-05, "loss": 0.0494, "step": 4577 }, { "epoch": 0.5887093317330362, "grad_norm": 0.1728515625, "learning_rate": 8.015734507234557e-05, "loss": 0.0467, "step": 4578 }, { "epoch": 0.5888379270435938, "grad_norm": 0.1884765625, "learning_rate": 8.014928298114285e-05, "loss": 0.0426, "step": 4579 }, { "epoch": 0.5889665223541515, "grad_norm": 0.17578125, "learning_rate": 8.014121965807568e-05, "loss": 0.0365, "step": 4580 }, { "epoch": 0.5890951176647091, "grad_norm": 0.1962890625, "learning_rate": 8.013315510347351e-05, "loss": 0.0483, "step": 4581 }, { "epoch": 0.5892237129752669, "grad_norm": 0.1748046875, "learning_rate": 8.012508931766586e-05, "loss": 0.0421, "step": 4582 }, { "epoch": 0.5893523082858245, "grad_norm": 0.193359375, "learning_rate": 8.011702230098227e-05, "loss": 0.049, "step": 4583 }, { "epoch": 0.5894809035963822, "grad_norm": 0.1767578125, "learning_rate": 8.010895405375238e-05, "loss": 0.042, "step": 4584 }, { "epoch": 0.5896094989069398, "grad_norm": 0.1904296875, "learning_rate": 8.010088457630584e-05, "loss": 0.0433, "step": 4585 }, { "epoch": 0.5897380942174976, "grad_norm": 0.19921875, "learning_rate": 8.009281386897233e-05, "loss": 0.0419, "step": 4586 }, { "epoch": 0.5898666895280552, "grad_norm": 0.181640625, "learning_rate": 8.008474193208164e-05, "loss": 0.036, "step": 4587 }, { "epoch": 0.5899952848386129, "grad_norm": 0.19140625, "learning_rate": 8.007666876596356e-05, "loss": 0.0494, "step": 4588 }, { "epoch": 0.5901238801491706, "grad_norm": 0.181640625, "learning_rate": 8.006859437094797e-05, "loss": 0.0425, "step": 4589 }, { "epoch": 0.5902524754597283, "grad_norm": 0.181640625, "learning_rate": 8.006051874736478e-05, "loss": 0.0399, "step": 4590 }, { "epoch": 0.5903810707702859, "grad_norm": 0.1982421875, "learning_rate": 8.005244189554392e-05, "loss": 0.0484, "step": 4591 }, { "epoch": 0.5905096660808435, "grad_norm": 0.189453125, "learning_rate": 8.004436381581543e-05, "loss": 0.0502, "step": 4592 }, { "epoch": 0.5906382613914013, "grad_norm": 0.1806640625, "learning_rate": 8.003628450850937e-05, "loss": 0.0483, "step": 4593 }, { "epoch": 0.5907668567019589, "grad_norm": 0.1806640625, "learning_rate": 8.002820397395582e-05, "loss": 0.0501, "step": 4594 }, { "epoch": 0.5908954520125166, "grad_norm": 0.1640625, "learning_rate": 8.002012221248499e-05, "loss": 0.04, "step": 4595 }, { "epoch": 0.5910240473230742, "grad_norm": 0.1611328125, "learning_rate": 8.001203922442707e-05, "loss": 0.0349, "step": 4596 }, { "epoch": 0.591152642633632, "grad_norm": 0.2060546875, "learning_rate": 8.00039550101123e-05, "loss": 0.0558, "step": 4597 }, { "epoch": 0.5912812379441896, "grad_norm": 0.169921875, "learning_rate": 7.999586956987102e-05, "loss": 0.0403, "step": 4598 }, { "epoch": 0.5914098332547473, "grad_norm": 0.1806640625, "learning_rate": 7.998778290403359e-05, "loss": 0.0426, "step": 4599 }, { "epoch": 0.591538428565305, "grad_norm": 0.1826171875, "learning_rate": 7.997969501293039e-05, "loss": 0.0469, "step": 4600 }, { "epoch": 0.5916670238758627, "grad_norm": 0.1796875, "learning_rate": 7.997160589689193e-05, "loss": 0.0436, "step": 4601 }, { "epoch": 0.5917956191864203, "grad_norm": 0.1728515625, "learning_rate": 7.996351555624869e-05, "loss": 0.0376, "step": 4602 }, { "epoch": 0.591924214496978, "grad_norm": 0.185546875, "learning_rate": 7.995542399133121e-05, "loss": 0.0461, "step": 4603 }, { "epoch": 0.5920528098075357, "grad_norm": 0.2041015625, "learning_rate": 7.994733120247015e-05, "loss": 0.0548, "step": 4604 }, { "epoch": 0.5921814051180934, "grad_norm": 0.1728515625, "learning_rate": 7.993923718999617e-05, "loss": 0.0328, "step": 4605 }, { "epoch": 0.592310000428651, "grad_norm": 0.1767578125, "learning_rate": 7.993114195423994e-05, "loss": 0.042, "step": 4606 }, { "epoch": 0.5924385957392088, "grad_norm": 0.18359375, "learning_rate": 7.992304549553224e-05, "loss": 0.0419, "step": 4607 }, { "epoch": 0.5925671910497664, "grad_norm": 0.169921875, "learning_rate": 7.991494781420392e-05, "loss": 0.0438, "step": 4608 }, { "epoch": 0.592695786360324, "grad_norm": 0.1962890625, "learning_rate": 7.990684891058579e-05, "loss": 0.0522, "step": 4609 }, { "epoch": 0.5928243816708817, "grad_norm": 0.1572265625, "learning_rate": 7.989874878500878e-05, "loss": 0.0344, "step": 4610 }, { "epoch": 0.5929529769814395, "grad_norm": 0.169921875, "learning_rate": 7.989064743780386e-05, "loss": 0.0355, "step": 4611 }, { "epoch": 0.5930815722919971, "grad_norm": 0.1728515625, "learning_rate": 7.988254486930203e-05, "loss": 0.0497, "step": 4612 }, { "epoch": 0.5932101676025547, "grad_norm": 0.1982421875, "learning_rate": 7.987444107983435e-05, "loss": 0.0525, "step": 4613 }, { "epoch": 0.5933387629131124, "grad_norm": 0.1572265625, "learning_rate": 7.986633606973192e-05, "loss": 0.04, "step": 4614 }, { "epoch": 0.5934673582236701, "grad_norm": 0.18359375, "learning_rate": 7.985822983932596e-05, "loss": 0.0333, "step": 4615 }, { "epoch": 0.5935959535342278, "grad_norm": 0.16796875, "learning_rate": 7.985012238894763e-05, "loss": 0.038, "step": 4616 }, { "epoch": 0.5937245488447854, "grad_norm": 0.1767578125, "learning_rate": 7.984201371892816e-05, "loss": 0.0442, "step": 4617 }, { "epoch": 0.5938531441553432, "grad_norm": 0.169921875, "learning_rate": 7.983390382959894e-05, "loss": 0.0405, "step": 4618 }, { "epoch": 0.5939817394659008, "grad_norm": 0.1669921875, "learning_rate": 7.98257927212913e-05, "loss": 0.0395, "step": 4619 }, { "epoch": 0.5941103347764585, "grad_norm": 0.15625, "learning_rate": 7.981768039433663e-05, "loss": 0.0374, "step": 4620 }, { "epoch": 0.5942389300870161, "grad_norm": 0.158203125, "learning_rate": 7.980956684906639e-05, "loss": 0.0357, "step": 4621 }, { "epoch": 0.5943675253975739, "grad_norm": 0.1767578125, "learning_rate": 7.980145208581211e-05, "loss": 0.055, "step": 4622 }, { "epoch": 0.5944961207081315, "grad_norm": 0.1591796875, "learning_rate": 7.979333610490534e-05, "loss": 0.0373, "step": 4623 }, { "epoch": 0.5946247160186892, "grad_norm": 0.171875, "learning_rate": 7.978521890667769e-05, "loss": 0.0392, "step": 4624 }, { "epoch": 0.5947533113292469, "grad_norm": 0.166015625, "learning_rate": 7.977710049146083e-05, "loss": 0.0389, "step": 4625 }, { "epoch": 0.5948819066398046, "grad_norm": 0.314453125, "learning_rate": 7.976898085958646e-05, "loss": 0.0436, "step": 4626 }, { "epoch": 0.5950105019503622, "grad_norm": 0.1875, "learning_rate": 7.976086001138635e-05, "loss": 0.0539, "step": 4627 }, { "epoch": 0.5951390972609198, "grad_norm": 0.16796875, "learning_rate": 7.975273794719226e-05, "loss": 0.0454, "step": 4628 }, { "epoch": 0.5952676925714776, "grad_norm": 0.1865234375, "learning_rate": 7.974461466733611e-05, "loss": 0.0512, "step": 4629 }, { "epoch": 0.5953962878820352, "grad_norm": 0.1845703125, "learning_rate": 7.973649017214979e-05, "loss": 0.0502, "step": 4630 }, { "epoch": 0.5955248831925929, "grad_norm": 0.193359375, "learning_rate": 7.972836446196524e-05, "loss": 0.0456, "step": 4631 }, { "epoch": 0.5956534785031505, "grad_norm": 0.1796875, "learning_rate": 7.972023753711448e-05, "loss": 0.0403, "step": 4632 }, { "epoch": 0.5957820738137083, "grad_norm": 0.1630859375, "learning_rate": 7.971210939792956e-05, "loss": 0.0409, "step": 4633 }, { "epoch": 0.5959106691242659, "grad_norm": 0.1806640625, "learning_rate": 7.970398004474259e-05, "loss": 0.0486, "step": 4634 }, { "epoch": 0.5960392644348236, "grad_norm": 0.1728515625, "learning_rate": 7.969584947788571e-05, "loss": 0.0433, "step": 4635 }, { "epoch": 0.5961678597453813, "grad_norm": 0.1748046875, "learning_rate": 7.968771769769116e-05, "loss": 0.0448, "step": 4636 }, { "epoch": 0.596296455055939, "grad_norm": 0.1640625, "learning_rate": 7.967958470449118e-05, "loss": 0.0413, "step": 4637 }, { "epoch": 0.5964250503664966, "grad_norm": 0.1826171875, "learning_rate": 7.967145049861805e-05, "loss": 0.0428, "step": 4638 }, { "epoch": 0.5965536456770543, "grad_norm": 0.17578125, "learning_rate": 7.966331508040418e-05, "loss": 0.045, "step": 4639 }, { "epoch": 0.596682240987612, "grad_norm": 0.205078125, "learning_rate": 7.965517845018191e-05, "loss": 0.0523, "step": 4640 }, { "epoch": 0.5968108362981697, "grad_norm": 0.162109375, "learning_rate": 7.964704060828374e-05, "loss": 0.0468, "step": 4641 }, { "epoch": 0.5969394316087273, "grad_norm": 0.154296875, "learning_rate": 7.963890155504214e-05, "loss": 0.0342, "step": 4642 }, { "epoch": 0.597068026919285, "grad_norm": 0.1748046875, "learning_rate": 7.96307612907897e-05, "loss": 0.0421, "step": 4643 }, { "epoch": 0.5971966222298427, "grad_norm": 0.1640625, "learning_rate": 7.962261981585896e-05, "loss": 0.0398, "step": 4644 }, { "epoch": 0.5973252175404004, "grad_norm": 0.177734375, "learning_rate": 7.961447713058264e-05, "loss": 0.0438, "step": 4645 }, { "epoch": 0.597453812850958, "grad_norm": 0.181640625, "learning_rate": 7.96063332352934e-05, "loss": 0.0478, "step": 4646 }, { "epoch": 0.5975824081615158, "grad_norm": 0.1767578125, "learning_rate": 7.9598188130324e-05, "loss": 0.0345, "step": 4647 }, { "epoch": 0.5977110034720734, "grad_norm": 0.1708984375, "learning_rate": 7.959004181600724e-05, "loss": 0.0425, "step": 4648 }, { "epoch": 0.597839598782631, "grad_norm": 0.1884765625, "learning_rate": 7.958189429267597e-05, "loss": 0.0503, "step": 4649 }, { "epoch": 0.5979681940931887, "grad_norm": 0.1787109375, "learning_rate": 7.957374556066308e-05, "loss": 0.0409, "step": 4650 }, { "epoch": 0.5980967894037464, "grad_norm": 0.185546875, "learning_rate": 7.956559562030154e-05, "loss": 0.047, "step": 4651 }, { "epoch": 0.5982253847143041, "grad_norm": 0.1748046875, "learning_rate": 7.955744447192432e-05, "loss": 0.0441, "step": 4652 }, { "epoch": 0.5983539800248617, "grad_norm": 0.177734375, "learning_rate": 7.954929211586448e-05, "loss": 0.0495, "step": 4653 }, { "epoch": 0.5984825753354195, "grad_norm": 0.1728515625, "learning_rate": 7.954113855245512e-05, "loss": 0.0407, "step": 4654 }, { "epoch": 0.5986111706459771, "grad_norm": 0.16796875, "learning_rate": 7.953298378202937e-05, "loss": 0.0439, "step": 4655 }, { "epoch": 0.5987397659565348, "grad_norm": 0.189453125, "learning_rate": 7.952482780492044e-05, "loss": 0.0484, "step": 4656 }, { "epoch": 0.5988683612670924, "grad_norm": 0.1904296875, "learning_rate": 7.951667062146156e-05, "loss": 0.0512, "step": 4657 }, { "epoch": 0.5989969565776502, "grad_norm": 0.2021484375, "learning_rate": 7.950851223198603e-05, "loss": 0.0478, "step": 4658 }, { "epoch": 0.5991255518882078, "grad_norm": 0.1884765625, "learning_rate": 7.95003526368272e-05, "loss": 0.0472, "step": 4659 }, { "epoch": 0.5992541471987655, "grad_norm": 0.177734375, "learning_rate": 7.949219183631844e-05, "loss": 0.0459, "step": 4660 }, { "epoch": 0.5993827425093231, "grad_norm": 0.1826171875, "learning_rate": 7.948402983079322e-05, "loss": 0.0408, "step": 4661 }, { "epoch": 0.5995113378198809, "grad_norm": 0.1630859375, "learning_rate": 7.947586662058502e-05, "loss": 0.045, "step": 4662 }, { "epoch": 0.5996399331304385, "grad_norm": 0.18359375, "learning_rate": 7.946770220602735e-05, "loss": 0.0515, "step": 4663 }, { "epoch": 0.5997685284409962, "grad_norm": 0.166015625, "learning_rate": 7.945953658745384e-05, "loss": 0.0397, "step": 4664 }, { "epoch": 0.5998971237515539, "grad_norm": 0.1669921875, "learning_rate": 7.94513697651981e-05, "loss": 0.043, "step": 4665 }, { "epoch": 0.6000257190621116, "grad_norm": 0.1689453125, "learning_rate": 7.944320173959383e-05, "loss": 0.0428, "step": 4666 }, { "epoch": 0.6001543143726692, "grad_norm": 0.1611328125, "learning_rate": 7.943503251097478e-05, "loss": 0.041, "step": 4667 }, { "epoch": 0.6002829096832268, "grad_norm": 0.166015625, "learning_rate": 7.942686207967469e-05, "loss": 0.0433, "step": 4668 }, { "epoch": 0.6004115049937846, "grad_norm": 0.173828125, "learning_rate": 7.941869044602744e-05, "loss": 0.0429, "step": 4669 }, { "epoch": 0.6005401003043422, "grad_norm": 0.1875, "learning_rate": 7.941051761036688e-05, "loss": 0.0486, "step": 4670 }, { "epoch": 0.6006686956148999, "grad_norm": 0.189453125, "learning_rate": 7.940234357302696e-05, "loss": 0.0542, "step": 4671 }, { "epoch": 0.6007972909254576, "grad_norm": 0.1630859375, "learning_rate": 7.939416833434167e-05, "loss": 0.037, "step": 4672 }, { "epoch": 0.6009258862360153, "grad_norm": 0.158203125, "learning_rate": 7.938599189464502e-05, "loss": 0.0325, "step": 4673 }, { "epoch": 0.6010544815465729, "grad_norm": 0.1748046875, "learning_rate": 7.937781425427111e-05, "loss": 0.0417, "step": 4674 }, { "epoch": 0.6011830768571306, "grad_norm": 0.193359375, "learning_rate": 7.936963541355404e-05, "loss": 0.0586, "step": 4675 }, { "epoch": 0.6013116721676883, "grad_norm": 0.1865234375, "learning_rate": 7.936145537282803e-05, "loss": 0.0449, "step": 4676 }, { "epoch": 0.601440267478246, "grad_norm": 0.19140625, "learning_rate": 7.935327413242729e-05, "loss": 0.0466, "step": 4677 }, { "epoch": 0.6015688627888036, "grad_norm": 0.18359375, "learning_rate": 7.934509169268606e-05, "loss": 0.0535, "step": 4678 }, { "epoch": 0.6016974580993613, "grad_norm": 0.18359375, "learning_rate": 7.933690805393871e-05, "loss": 0.0479, "step": 4679 }, { "epoch": 0.601826053409919, "grad_norm": 0.1611328125, "learning_rate": 7.932872321651959e-05, "loss": 0.04, "step": 4680 }, { "epoch": 0.6019546487204767, "grad_norm": 0.1982421875, "learning_rate": 7.932053718076313e-05, "loss": 0.0446, "step": 4681 }, { "epoch": 0.6020832440310343, "grad_norm": 0.166015625, "learning_rate": 7.931234994700382e-05, "loss": 0.0424, "step": 4682 }, { "epoch": 0.6022118393415921, "grad_norm": 0.1875, "learning_rate": 7.930416151557615e-05, "loss": 0.0491, "step": 4683 }, { "epoch": 0.6023404346521497, "grad_norm": 0.1591796875, "learning_rate": 7.92959718868147e-05, "loss": 0.0406, "step": 4684 }, { "epoch": 0.6024690299627073, "grad_norm": 0.1923828125, "learning_rate": 7.92877810610541e-05, "loss": 0.0512, "step": 4685 }, { "epoch": 0.602597625273265, "grad_norm": 0.1904296875, "learning_rate": 7.927958903862901e-05, "loss": 0.0529, "step": 4686 }, { "epoch": 0.6027262205838227, "grad_norm": 0.1630859375, "learning_rate": 7.927139581987414e-05, "loss": 0.0398, "step": 4687 }, { "epoch": 0.6028548158943804, "grad_norm": 0.140625, "learning_rate": 7.926320140512427e-05, "loss": 0.0322, "step": 4688 }, { "epoch": 0.602983411204938, "grad_norm": 0.16015625, "learning_rate": 7.925500579471421e-05, "loss": 0.0367, "step": 4689 }, { "epoch": 0.6031120065154957, "grad_norm": 0.1533203125, "learning_rate": 7.92468089889788e-05, "loss": 0.0332, "step": 4690 }, { "epoch": 0.6032406018260534, "grad_norm": 0.1708984375, "learning_rate": 7.9238610988253e-05, "loss": 0.0361, "step": 4691 }, { "epoch": 0.6033691971366111, "grad_norm": 0.16015625, "learning_rate": 7.923041179287172e-05, "loss": 0.0374, "step": 4692 }, { "epoch": 0.6034977924471687, "grad_norm": 0.1650390625, "learning_rate": 7.922221140316999e-05, "loss": 0.0394, "step": 4693 }, { "epoch": 0.6036263877577265, "grad_norm": 0.1767578125, "learning_rate": 7.921400981948286e-05, "loss": 0.0511, "step": 4694 }, { "epoch": 0.6037549830682841, "grad_norm": 0.1884765625, "learning_rate": 7.920580704214547e-05, "loss": 0.0546, "step": 4695 }, { "epoch": 0.6038835783788418, "grad_norm": 0.169921875, "learning_rate": 7.919760307149294e-05, "loss": 0.0407, "step": 4696 }, { "epoch": 0.6040121736893994, "grad_norm": 0.1904296875, "learning_rate": 7.918939790786049e-05, "loss": 0.0445, "step": 4697 }, { "epoch": 0.6041407689999572, "grad_norm": 0.1767578125, "learning_rate": 7.918119155158336e-05, "loss": 0.0433, "step": 4698 }, { "epoch": 0.6042693643105148, "grad_norm": 0.1787109375, "learning_rate": 7.917298400299689e-05, "loss": 0.043, "step": 4699 }, { "epoch": 0.6043979596210725, "grad_norm": 0.1806640625, "learning_rate": 7.916477526243638e-05, "loss": 0.0504, "step": 4700 }, { "epoch": 0.6045265549316302, "grad_norm": 0.1845703125, "learning_rate": 7.915656533023725e-05, "loss": 0.043, "step": 4701 }, { "epoch": 0.6046551502421879, "grad_norm": 0.1826171875, "learning_rate": 7.914835420673497e-05, "loss": 0.0457, "step": 4702 }, { "epoch": 0.6047837455527455, "grad_norm": 0.171875, "learning_rate": 7.914014189226499e-05, "loss": 0.042, "step": 4703 }, { "epoch": 0.6049123408633031, "grad_norm": 0.181640625, "learning_rate": 7.913192838716291e-05, "loss": 0.0463, "step": 4704 }, { "epoch": 0.6050409361738609, "grad_norm": 0.1728515625, "learning_rate": 7.912371369176429e-05, "loss": 0.0494, "step": 4705 }, { "epoch": 0.6051695314844185, "grad_norm": 0.18359375, "learning_rate": 7.911549780640479e-05, "loss": 0.0474, "step": 4706 }, { "epoch": 0.6052981267949762, "grad_norm": 0.1650390625, "learning_rate": 7.910728073142006e-05, "loss": 0.0409, "step": 4707 }, { "epoch": 0.6054267221055338, "grad_norm": 0.1630859375, "learning_rate": 7.90990624671459e-05, "loss": 0.0452, "step": 4708 }, { "epoch": 0.6055553174160916, "grad_norm": 0.185546875, "learning_rate": 7.909084301391808e-05, "loss": 0.0448, "step": 4709 }, { "epoch": 0.6056839127266492, "grad_norm": 0.1708984375, "learning_rate": 7.908262237207239e-05, "loss": 0.0413, "step": 4710 }, { "epoch": 0.6058125080372069, "grad_norm": 0.1455078125, "learning_rate": 7.907440054194477e-05, "loss": 0.0388, "step": 4711 }, { "epoch": 0.6059411033477646, "grad_norm": 0.1904296875, "learning_rate": 7.906617752387115e-05, "loss": 0.0517, "step": 4712 }, { "epoch": 0.6060696986583223, "grad_norm": 0.1875, "learning_rate": 7.905795331818749e-05, "loss": 0.0432, "step": 4713 }, { "epoch": 0.6061982939688799, "grad_norm": 0.1826171875, "learning_rate": 7.904972792522984e-05, "loss": 0.0472, "step": 4714 }, { "epoch": 0.6063268892794376, "grad_norm": 0.1650390625, "learning_rate": 7.904150134533428e-05, "loss": 0.0375, "step": 4715 }, { "epoch": 0.6064554845899953, "grad_norm": 0.169921875, "learning_rate": 7.903327357883693e-05, "loss": 0.0439, "step": 4716 }, { "epoch": 0.606584079900553, "grad_norm": 0.158203125, "learning_rate": 7.902504462607394e-05, "loss": 0.0367, "step": 4717 }, { "epoch": 0.6067126752111106, "grad_norm": 0.17578125, "learning_rate": 7.901681448738158e-05, "loss": 0.048, "step": 4718 }, { "epoch": 0.6068412705216684, "grad_norm": 0.1748046875, "learning_rate": 7.900858316309611e-05, "loss": 0.0428, "step": 4719 }, { "epoch": 0.606969865832226, "grad_norm": 0.166015625, "learning_rate": 7.900035065355385e-05, "loss": 0.0432, "step": 4720 }, { "epoch": 0.6070984611427837, "grad_norm": 0.140625, "learning_rate": 7.899211695909118e-05, "loss": 0.0357, "step": 4721 }, { "epoch": 0.6072270564533413, "grad_norm": 0.1982421875, "learning_rate": 7.898388208004449e-05, "loss": 0.0501, "step": 4722 }, { "epoch": 0.607355651763899, "grad_norm": 0.1728515625, "learning_rate": 7.897564601675028e-05, "loss": 0.0476, "step": 4723 }, { "epoch": 0.6074842470744567, "grad_norm": 0.25390625, "learning_rate": 7.896740876954505e-05, "loss": 0.0369, "step": 4724 }, { "epoch": 0.6076128423850143, "grad_norm": 0.1826171875, "learning_rate": 7.895917033876535e-05, "loss": 0.0436, "step": 4725 }, { "epoch": 0.607741437695572, "grad_norm": 0.17578125, "learning_rate": 7.895093072474784e-05, "loss": 0.0465, "step": 4726 }, { "epoch": 0.6078700330061297, "grad_norm": 0.1767578125, "learning_rate": 7.894268992782913e-05, "loss": 0.0421, "step": 4727 }, { "epoch": 0.6079986283166874, "grad_norm": 0.16015625, "learning_rate": 7.893444794834595e-05, "loss": 0.0396, "step": 4728 }, { "epoch": 0.608127223627245, "grad_norm": 0.173828125, "learning_rate": 7.892620478663506e-05, "loss": 0.0419, "step": 4729 }, { "epoch": 0.6082558189378028, "grad_norm": 0.17578125, "learning_rate": 7.891796044303327e-05, "loss": 0.0396, "step": 4730 }, { "epoch": 0.6083844142483604, "grad_norm": 0.20703125, "learning_rate": 7.890971491787742e-05, "loss": 0.0515, "step": 4731 }, { "epoch": 0.6085130095589181, "grad_norm": 0.2138671875, "learning_rate": 7.890146821150442e-05, "loss": 0.0401, "step": 4732 }, { "epoch": 0.6086416048694757, "grad_norm": 0.1669921875, "learning_rate": 7.889322032425124e-05, "loss": 0.0366, "step": 4733 }, { "epoch": 0.6087702001800335, "grad_norm": 0.1865234375, "learning_rate": 7.888497125645483e-05, "loss": 0.0373, "step": 4734 }, { "epoch": 0.6088987954905911, "grad_norm": 0.162109375, "learning_rate": 7.887672100845228e-05, "loss": 0.0363, "step": 4735 }, { "epoch": 0.6090273908011488, "grad_norm": 0.1796875, "learning_rate": 7.886846958058067e-05, "loss": 0.0396, "step": 4736 }, { "epoch": 0.6091559861117064, "grad_norm": 0.173828125, "learning_rate": 7.886021697317716e-05, "loss": 0.0397, "step": 4737 }, { "epoch": 0.6092845814222642, "grad_norm": 0.1875, "learning_rate": 7.885196318657892e-05, "loss": 0.0458, "step": 4738 }, { "epoch": 0.6094131767328218, "grad_norm": 0.1767578125, "learning_rate": 7.88437082211232e-05, "loss": 0.0467, "step": 4739 }, { "epoch": 0.6095417720433794, "grad_norm": 0.1728515625, "learning_rate": 7.883545207714728e-05, "loss": 0.0391, "step": 4740 }, { "epoch": 0.6096703673539372, "grad_norm": 0.18359375, "learning_rate": 7.882719475498853e-05, "loss": 0.0423, "step": 4741 }, { "epoch": 0.6097989626644948, "grad_norm": 0.1611328125, "learning_rate": 7.881893625498427e-05, "loss": 0.032, "step": 4742 }, { "epoch": 0.6099275579750525, "grad_norm": 0.130859375, "learning_rate": 7.881067657747198e-05, "loss": 0.0272, "step": 4743 }, { "epoch": 0.6100561532856101, "grad_norm": 0.1884765625, "learning_rate": 7.880241572278914e-05, "loss": 0.0422, "step": 4744 }, { "epoch": 0.6101847485961679, "grad_norm": 0.19921875, "learning_rate": 7.879415369127328e-05, "loss": 0.0437, "step": 4745 }, { "epoch": 0.6103133439067255, "grad_norm": 0.1845703125, "learning_rate": 7.878589048326194e-05, "loss": 0.0379, "step": 4746 }, { "epoch": 0.6104419392172832, "grad_norm": 0.1669921875, "learning_rate": 7.877762609909279e-05, "loss": 0.0393, "step": 4747 }, { "epoch": 0.6105705345278409, "grad_norm": 0.169921875, "learning_rate": 7.876936053910347e-05, "loss": 0.0398, "step": 4748 }, { "epoch": 0.6106991298383986, "grad_norm": 0.1845703125, "learning_rate": 7.876109380363172e-05, "loss": 0.0478, "step": 4749 }, { "epoch": 0.6108277251489562, "grad_norm": 0.1884765625, "learning_rate": 7.875282589301533e-05, "loss": 0.0449, "step": 4750 }, { "epoch": 0.6109563204595139, "grad_norm": 0.1875, "learning_rate": 7.874455680759208e-05, "loss": 0.0415, "step": 4751 }, { "epoch": 0.6110849157700716, "grad_norm": 0.1572265625, "learning_rate": 7.873628654769983e-05, "loss": 0.0389, "step": 4752 }, { "epoch": 0.6112135110806293, "grad_norm": 0.2080078125, "learning_rate": 7.872801511367653e-05, "loss": 0.0642, "step": 4753 }, { "epoch": 0.6113421063911869, "grad_norm": 0.177734375, "learning_rate": 7.871974250586012e-05, "loss": 0.0515, "step": 4754 }, { "epoch": 0.6114707017017446, "grad_norm": 0.1728515625, "learning_rate": 7.87114687245886e-05, "loss": 0.0436, "step": 4755 }, { "epoch": 0.6115992970123023, "grad_norm": 0.189453125, "learning_rate": 7.870319377020004e-05, "loss": 0.0507, "step": 4756 }, { "epoch": 0.61172789232286, "grad_norm": 0.173828125, "learning_rate": 7.869491764303254e-05, "loss": 0.0492, "step": 4757 }, { "epoch": 0.6118564876334176, "grad_norm": 0.1708984375, "learning_rate": 7.868664034342427e-05, "loss": 0.0411, "step": 4758 }, { "epoch": 0.6119850829439754, "grad_norm": 0.154296875, "learning_rate": 7.86783618717134e-05, "loss": 0.0374, "step": 4759 }, { "epoch": 0.612113678254533, "grad_norm": 0.1748046875, "learning_rate": 7.86700822282382e-05, "loss": 0.0491, "step": 4760 }, { "epoch": 0.6122422735650906, "grad_norm": 0.181640625, "learning_rate": 7.866180141333696e-05, "loss": 0.0544, "step": 4761 }, { "epoch": 0.6123708688756483, "grad_norm": 0.1982421875, "learning_rate": 7.865351942734805e-05, "loss": 0.0447, "step": 4762 }, { "epoch": 0.612499464186206, "grad_norm": 0.173828125, "learning_rate": 7.864523627060982e-05, "loss": 0.0463, "step": 4763 }, { "epoch": 0.6126280594967637, "grad_norm": 0.1591796875, "learning_rate": 7.863695194346073e-05, "loss": 0.0389, "step": 4764 }, { "epoch": 0.6127566548073213, "grad_norm": 0.1796875, "learning_rate": 7.862866644623927e-05, "loss": 0.0462, "step": 4765 }, { "epoch": 0.6128852501178791, "grad_norm": 0.224609375, "learning_rate": 7.862037977928397e-05, "loss": 0.0477, "step": 4766 }, { "epoch": 0.6130138454284367, "grad_norm": 0.1953125, "learning_rate": 7.861209194293341e-05, "loss": 0.0531, "step": 4767 }, { "epoch": 0.6131424407389944, "grad_norm": 0.1708984375, "learning_rate": 7.860380293752623e-05, "loss": 0.0443, "step": 4768 }, { "epoch": 0.613271036049552, "grad_norm": 0.1904296875, "learning_rate": 7.85955127634011e-05, "loss": 0.0484, "step": 4769 }, { "epoch": 0.6133996313601098, "grad_norm": 0.1650390625, "learning_rate": 7.858722142089677e-05, "loss": 0.0352, "step": 4770 }, { "epoch": 0.6135282266706674, "grad_norm": 0.1904296875, "learning_rate": 7.857892891035198e-05, "loss": 0.0459, "step": 4771 }, { "epoch": 0.6136568219812251, "grad_norm": 0.1533203125, "learning_rate": 7.857063523210557e-05, "loss": 0.0354, "step": 4772 }, { "epoch": 0.6137854172917827, "grad_norm": 0.1962890625, "learning_rate": 7.856234038649641e-05, "loss": 0.0516, "step": 4773 }, { "epoch": 0.6139140126023405, "grad_norm": 0.15625, "learning_rate": 7.855404437386345e-05, "loss": 0.0414, "step": 4774 }, { "epoch": 0.6140426079128981, "grad_norm": 0.1640625, "learning_rate": 7.854574719454559e-05, "loss": 0.0359, "step": 4775 }, { "epoch": 0.6141712032234558, "grad_norm": 0.1943359375, "learning_rate": 7.853744884888187e-05, "loss": 0.0499, "step": 4776 }, { "epoch": 0.6142997985340135, "grad_norm": 0.2138671875, "learning_rate": 7.852914933721139e-05, "loss": 0.0501, "step": 4777 }, { "epoch": 0.6144283938445712, "grad_norm": 0.1943359375, "learning_rate": 7.85208486598732e-05, "loss": 0.0425, "step": 4778 }, { "epoch": 0.6145569891551288, "grad_norm": 0.1767578125, "learning_rate": 7.851254681720647e-05, "loss": 0.0473, "step": 4779 }, { "epoch": 0.6146855844656864, "grad_norm": 0.169921875, "learning_rate": 7.850424380955044e-05, "loss": 0.0431, "step": 4780 }, { "epoch": 0.6148141797762442, "grad_norm": 0.1826171875, "learning_rate": 7.849593963724431e-05, "loss": 0.0484, "step": 4781 }, { "epoch": 0.6149427750868018, "grad_norm": 0.1552734375, "learning_rate": 7.848763430062743e-05, "loss": 0.0365, "step": 4782 }, { "epoch": 0.6150713703973595, "grad_norm": 0.177734375, "learning_rate": 7.847932780003911e-05, "loss": 0.047, "step": 4783 }, { "epoch": 0.6151999657079172, "grad_norm": 0.1767578125, "learning_rate": 7.847102013581875e-05, "loss": 0.0401, "step": 4784 }, { "epoch": 0.6153285610184749, "grad_norm": 0.171875, "learning_rate": 7.84627113083058e-05, "loss": 0.0461, "step": 4785 }, { "epoch": 0.6154571563290325, "grad_norm": 0.177734375, "learning_rate": 7.845440131783976e-05, "loss": 0.043, "step": 4786 }, { "epoch": 0.6155857516395902, "grad_norm": 0.15625, "learning_rate": 7.844609016476014e-05, "loss": 0.0434, "step": 4787 }, { "epoch": 0.6157143469501479, "grad_norm": 0.1904296875, "learning_rate": 7.843777784940653e-05, "loss": 0.0456, "step": 4788 }, { "epoch": 0.6158429422607056, "grad_norm": 0.1689453125, "learning_rate": 7.842946437211858e-05, "loss": 0.0393, "step": 4789 }, { "epoch": 0.6159715375712632, "grad_norm": 0.2080078125, "learning_rate": 7.842114973323593e-05, "loss": 0.0541, "step": 4790 }, { "epoch": 0.6161001328818209, "grad_norm": 0.17578125, "learning_rate": 7.841283393309835e-05, "loss": 0.048, "step": 4791 }, { "epoch": 0.6162287281923786, "grad_norm": 0.189453125, "learning_rate": 7.84045169720456e-05, "loss": 0.0474, "step": 4792 }, { "epoch": 0.6163573235029363, "grad_norm": 0.1787109375, "learning_rate": 7.83961988504175e-05, "loss": 0.0349, "step": 4793 }, { "epoch": 0.6164859188134939, "grad_norm": 0.1748046875, "learning_rate": 7.838787956855392e-05, "loss": 0.0424, "step": 4794 }, { "epoch": 0.6166145141240517, "grad_norm": 0.150390625, "learning_rate": 7.837955912679475e-05, "loss": 0.0365, "step": 4795 }, { "epoch": 0.6167431094346093, "grad_norm": 0.1640625, "learning_rate": 7.837123752548e-05, "loss": 0.0405, "step": 4796 }, { "epoch": 0.616871704745167, "grad_norm": 0.23828125, "learning_rate": 7.836291476494964e-05, "loss": 0.0514, "step": 4797 }, { "epoch": 0.6170003000557246, "grad_norm": 0.2021484375, "learning_rate": 7.835459084554376e-05, "loss": 0.0504, "step": 4798 }, { "epoch": 0.6171288953662823, "grad_norm": 0.1728515625, "learning_rate": 7.834626576760242e-05, "loss": 0.0447, "step": 4799 }, { "epoch": 0.61725749067684, "grad_norm": 0.1708984375, "learning_rate": 7.833793953146584e-05, "loss": 0.0346, "step": 4800 }, { "epoch": 0.6173860859873976, "grad_norm": 0.1748046875, "learning_rate": 7.832961213747416e-05, "loss": 0.0445, "step": 4801 }, { "epoch": 0.6175146812979553, "grad_norm": 0.193359375, "learning_rate": 7.832128358596767e-05, "loss": 0.0559, "step": 4802 }, { "epoch": 0.617643276608513, "grad_norm": 0.185546875, "learning_rate": 7.831295387728664e-05, "loss": 0.0501, "step": 4803 }, { "epoch": 0.6177718719190707, "grad_norm": 0.1767578125, "learning_rate": 7.83046230117714e-05, "loss": 0.0425, "step": 4804 }, { "epoch": 0.6179004672296283, "grad_norm": 0.17578125, "learning_rate": 7.829629098976238e-05, "loss": 0.0389, "step": 4805 }, { "epoch": 0.6180290625401861, "grad_norm": 0.177734375, "learning_rate": 7.828795781159999e-05, "loss": 0.0432, "step": 4806 }, { "epoch": 0.6181576578507437, "grad_norm": 0.1962890625, "learning_rate": 7.827962347762473e-05, "loss": 0.0541, "step": 4807 }, { "epoch": 0.6182862531613014, "grad_norm": 0.166015625, "learning_rate": 7.827128798817708e-05, "loss": 0.041, "step": 4808 }, { "epoch": 0.618414848471859, "grad_norm": 0.193359375, "learning_rate": 7.82629513435977e-05, "loss": 0.0507, "step": 4809 }, { "epoch": 0.6185434437824168, "grad_norm": 0.1767578125, "learning_rate": 7.825461354422716e-05, "loss": 0.0453, "step": 4810 }, { "epoch": 0.6186720390929744, "grad_norm": 0.197265625, "learning_rate": 7.824627459040614e-05, "loss": 0.0498, "step": 4811 }, { "epoch": 0.6188006344035321, "grad_norm": 0.16796875, "learning_rate": 7.823793448247539e-05, "loss": 0.0435, "step": 4812 }, { "epoch": 0.6189292297140898, "grad_norm": 0.171875, "learning_rate": 7.822959322077563e-05, "loss": 0.0416, "step": 4813 }, { "epoch": 0.6190578250246475, "grad_norm": 0.173828125, "learning_rate": 7.82212508056477e-05, "loss": 0.0499, "step": 4814 }, { "epoch": 0.6191864203352051, "grad_norm": 0.162109375, "learning_rate": 7.821290723743245e-05, "loss": 0.0394, "step": 4815 }, { "epoch": 0.6193150156457627, "grad_norm": 0.158203125, "learning_rate": 7.820456251647082e-05, "loss": 0.034, "step": 4816 }, { "epoch": 0.6194436109563205, "grad_norm": 0.1806640625, "learning_rate": 7.819621664310372e-05, "loss": 0.0463, "step": 4817 }, { "epoch": 0.6195722062668781, "grad_norm": 0.181640625, "learning_rate": 7.81878696176722e-05, "loss": 0.0468, "step": 4818 }, { "epoch": 0.6197008015774358, "grad_norm": 0.154296875, "learning_rate": 7.817952144051727e-05, "loss": 0.0365, "step": 4819 }, { "epoch": 0.6198293968879934, "grad_norm": 0.171875, "learning_rate": 7.817117211198004e-05, "loss": 0.0461, "step": 4820 }, { "epoch": 0.6199579921985512, "grad_norm": 0.1689453125, "learning_rate": 7.816282163240167e-05, "loss": 0.0415, "step": 4821 }, { "epoch": 0.6200865875091088, "grad_norm": 0.1826171875, "learning_rate": 7.815447000212333e-05, "loss": 0.045, "step": 4822 }, { "epoch": 0.6202151828196665, "grad_norm": 0.162109375, "learning_rate": 7.814611722148625e-05, "loss": 0.0397, "step": 4823 }, { "epoch": 0.6203437781302242, "grad_norm": 0.146484375, "learning_rate": 7.813776329083176e-05, "loss": 0.0377, "step": 4824 }, { "epoch": 0.6204723734407819, "grad_norm": 0.1728515625, "learning_rate": 7.812940821050114e-05, "loss": 0.0415, "step": 4825 }, { "epoch": 0.6206009687513395, "grad_norm": 0.197265625, "learning_rate": 7.81210519808358e-05, "loss": 0.0567, "step": 4826 }, { "epoch": 0.6207295640618972, "grad_norm": 0.1865234375, "learning_rate": 7.811269460217714e-05, "loss": 0.0432, "step": 4827 }, { "epoch": 0.6208581593724549, "grad_norm": 0.2041015625, "learning_rate": 7.810433607486665e-05, "loss": 0.0443, "step": 4828 }, { "epoch": 0.6209867546830126, "grad_norm": 0.1806640625, "learning_rate": 7.809597639924586e-05, "loss": 0.0392, "step": 4829 }, { "epoch": 0.6211153499935702, "grad_norm": 0.1591796875, "learning_rate": 7.808761557565631e-05, "loss": 0.0385, "step": 4830 }, { "epoch": 0.621243945304128, "grad_norm": 0.189453125, "learning_rate": 7.807925360443964e-05, "loss": 0.049, "step": 4831 }, { "epoch": 0.6213725406146856, "grad_norm": 0.169921875, "learning_rate": 7.80708904859375e-05, "loss": 0.0412, "step": 4832 }, { "epoch": 0.6215011359252433, "grad_norm": 0.173828125, "learning_rate": 7.806252622049159e-05, "loss": 0.0411, "step": 4833 }, { "epoch": 0.6216297312358009, "grad_norm": 0.1611328125, "learning_rate": 7.805416080844368e-05, "loss": 0.034, "step": 4834 }, { "epoch": 0.6217583265463587, "grad_norm": 0.185546875, "learning_rate": 7.804579425013554e-05, "loss": 0.0416, "step": 4835 }, { "epoch": 0.6218869218569163, "grad_norm": 0.1728515625, "learning_rate": 7.803742654590907e-05, "loss": 0.0482, "step": 4836 }, { "epoch": 0.6220155171674739, "grad_norm": 0.173828125, "learning_rate": 7.80290576961061e-05, "loss": 0.0467, "step": 4837 }, { "epoch": 0.6221441124780316, "grad_norm": 0.1767578125, "learning_rate": 7.802068770106863e-05, "loss": 0.0409, "step": 4838 }, { "epoch": 0.6222727077885893, "grad_norm": 0.1787109375, "learning_rate": 7.801231656113861e-05, "loss": 0.0408, "step": 4839 }, { "epoch": 0.622401303099147, "grad_norm": 0.1728515625, "learning_rate": 7.800394427665808e-05, "loss": 0.0474, "step": 4840 }, { "epoch": 0.6225298984097046, "grad_norm": 0.2021484375, "learning_rate": 7.799557084796913e-05, "loss": 0.0482, "step": 4841 }, { "epoch": 0.6226584937202624, "grad_norm": 0.2578125, "learning_rate": 7.798719627541391e-05, "loss": 0.0563, "step": 4842 }, { "epoch": 0.62278708903082, "grad_norm": 0.185546875, "learning_rate": 7.797882055933457e-05, "loss": 0.0503, "step": 4843 }, { "epoch": 0.6229156843413777, "grad_norm": 0.177734375, "learning_rate": 7.797044370007332e-05, "loss": 0.0454, "step": 4844 }, { "epoch": 0.6230442796519353, "grad_norm": 0.17578125, "learning_rate": 7.796206569797245e-05, "loss": 0.0516, "step": 4845 }, { "epoch": 0.6231728749624931, "grad_norm": 0.1552734375, "learning_rate": 7.795368655337427e-05, "loss": 0.0355, "step": 4846 }, { "epoch": 0.6233014702730507, "grad_norm": 0.1552734375, "learning_rate": 7.794530626662114e-05, "loss": 0.0413, "step": 4847 }, { "epoch": 0.6234300655836084, "grad_norm": 0.154296875, "learning_rate": 7.793692483805548e-05, "loss": 0.0361, "step": 4848 }, { "epoch": 0.623558660894166, "grad_norm": 0.16796875, "learning_rate": 7.792854226801972e-05, "loss": 0.0374, "step": 4849 }, { "epoch": 0.6236872562047238, "grad_norm": 0.1943359375, "learning_rate": 7.792015855685639e-05, "loss": 0.057, "step": 4850 }, { "epoch": 0.6238158515152814, "grad_norm": 0.1845703125, "learning_rate": 7.791177370490802e-05, "loss": 0.0465, "step": 4851 }, { "epoch": 0.623944446825839, "grad_norm": 0.171875, "learning_rate": 7.790338771251722e-05, "loss": 0.0423, "step": 4852 }, { "epoch": 0.6240730421363968, "grad_norm": 0.2041015625, "learning_rate": 7.789500058002661e-05, "loss": 0.0387, "step": 4853 }, { "epoch": 0.6242016374469544, "grad_norm": 0.197265625, "learning_rate": 7.78866123077789e-05, "loss": 0.0547, "step": 4854 }, { "epoch": 0.6243302327575121, "grad_norm": 0.1640625, "learning_rate": 7.787822289611681e-05, "loss": 0.0424, "step": 4855 }, { "epoch": 0.6244588280680697, "grad_norm": 0.1953125, "learning_rate": 7.786983234538314e-05, "loss": 0.053, "step": 4856 }, { "epoch": 0.6245874233786275, "grad_norm": 0.1689453125, "learning_rate": 7.78614406559207e-05, "loss": 0.041, "step": 4857 }, { "epoch": 0.6247160186891851, "grad_norm": 0.1689453125, "learning_rate": 7.785304782807237e-05, "loss": 0.0394, "step": 4858 }, { "epoch": 0.6248446139997428, "grad_norm": 0.19140625, "learning_rate": 7.784465386218107e-05, "loss": 0.0476, "step": 4859 }, { "epoch": 0.6249732093103005, "grad_norm": 0.1875, "learning_rate": 7.783625875858977e-05, "loss": 0.0521, "step": 4860 }, { "epoch": 0.6251018046208582, "grad_norm": 0.1767578125, "learning_rate": 7.782786251764148e-05, "loss": 0.0439, "step": 4861 }, { "epoch": 0.6252303999314158, "grad_norm": 0.173828125, "learning_rate": 7.781946513967928e-05, "loss": 0.0445, "step": 4862 }, { "epoch": 0.6253589952419735, "grad_norm": 0.189453125, "learning_rate": 7.781106662504625e-05, "loss": 0.0494, "step": 4863 }, { "epoch": 0.6254875905525312, "grad_norm": 0.1767578125, "learning_rate": 7.780266697408555e-05, "loss": 0.048, "step": 4864 }, { "epoch": 0.6256161858630889, "grad_norm": 0.1982421875, "learning_rate": 7.779426618714038e-05, "loss": 0.0548, "step": 4865 }, { "epoch": 0.6257447811736465, "grad_norm": 0.193359375, "learning_rate": 7.778586426455401e-05, "loss": 0.0514, "step": 4866 }, { "epoch": 0.6258733764842042, "grad_norm": 0.1650390625, "learning_rate": 7.777746120666971e-05, "loss": 0.0411, "step": 4867 }, { "epoch": 0.6260019717947619, "grad_norm": 0.1513671875, "learning_rate": 7.776905701383081e-05, "loss": 0.0357, "step": 4868 }, { "epoch": 0.6261305671053196, "grad_norm": 0.1728515625, "learning_rate": 7.776065168638074e-05, "loss": 0.0415, "step": 4869 }, { "epoch": 0.6262591624158772, "grad_norm": 0.17578125, "learning_rate": 7.775224522466287e-05, "loss": 0.0451, "step": 4870 }, { "epoch": 0.626387757726435, "grad_norm": 0.1904296875, "learning_rate": 7.774383762902071e-05, "loss": 0.0455, "step": 4871 }, { "epoch": 0.6265163530369926, "grad_norm": 0.18359375, "learning_rate": 7.773542889979778e-05, "loss": 0.0448, "step": 4872 }, { "epoch": 0.6266449483475502, "grad_norm": 0.1875, "learning_rate": 7.772701903733767e-05, "loss": 0.0408, "step": 4873 }, { "epoch": 0.6267735436581079, "grad_norm": 0.16015625, "learning_rate": 7.771860804198398e-05, "loss": 0.0412, "step": 4874 }, { "epoch": 0.6269021389686656, "grad_norm": 0.1640625, "learning_rate": 7.771019591408037e-05, "loss": 0.0475, "step": 4875 }, { "epoch": 0.6270307342792233, "grad_norm": 0.1533203125, "learning_rate": 7.770178265397056e-05, "loss": 0.0396, "step": 4876 }, { "epoch": 0.6271593295897809, "grad_norm": 0.1826171875, "learning_rate": 7.769336826199829e-05, "loss": 0.0502, "step": 4877 }, { "epoch": 0.6272879249003387, "grad_norm": 0.1640625, "learning_rate": 7.768495273850737e-05, "loss": 0.0393, "step": 4878 }, { "epoch": 0.6274165202108963, "grad_norm": 0.150390625, "learning_rate": 7.767653608384167e-05, "loss": 0.0373, "step": 4879 }, { "epoch": 0.627545115521454, "grad_norm": 0.189453125, "learning_rate": 7.766811829834504e-05, "loss": 0.0544, "step": 4880 }, { "epoch": 0.6276737108320116, "grad_norm": 0.173828125, "learning_rate": 7.765969938236147e-05, "loss": 0.0407, "step": 4881 }, { "epoch": 0.6278023061425694, "grad_norm": 0.158203125, "learning_rate": 7.765127933623492e-05, "loss": 0.0415, "step": 4882 }, { "epoch": 0.627930901453127, "grad_norm": 0.1826171875, "learning_rate": 7.76428581603094e-05, "loss": 0.0367, "step": 4883 }, { "epoch": 0.6280594967636847, "grad_norm": 0.2109375, "learning_rate": 7.763443585492905e-05, "loss": 0.0438, "step": 4884 }, { "epoch": 0.6281880920742423, "grad_norm": 0.1689453125, "learning_rate": 7.762601242043795e-05, "loss": 0.0439, "step": 4885 }, { "epoch": 0.6283166873848001, "grad_norm": 0.1845703125, "learning_rate": 7.761758785718028e-05, "loss": 0.0413, "step": 4886 }, { "epoch": 0.6284452826953577, "grad_norm": 0.171875, "learning_rate": 7.760916216550026e-05, "loss": 0.0396, "step": 4887 }, { "epoch": 0.6285738780059154, "grad_norm": 0.171875, "learning_rate": 7.760073534574215e-05, "loss": 0.0403, "step": 4888 }, { "epoch": 0.6287024733164731, "grad_norm": 0.177734375, "learning_rate": 7.759230739825027e-05, "loss": 0.0392, "step": 4889 }, { "epoch": 0.6288310686270308, "grad_norm": 0.1904296875, "learning_rate": 7.758387832336897e-05, "loss": 0.0559, "step": 4890 }, { "epoch": 0.6289596639375884, "grad_norm": 0.203125, "learning_rate": 7.757544812144267e-05, "loss": 0.0497, "step": 4891 }, { "epoch": 0.629088259248146, "grad_norm": 0.1669921875, "learning_rate": 7.756701679281578e-05, "loss": 0.0383, "step": 4892 }, { "epoch": 0.6292168545587038, "grad_norm": 0.1904296875, "learning_rate": 7.755858433783285e-05, "loss": 0.0474, "step": 4893 }, { "epoch": 0.6293454498692614, "grad_norm": 0.1650390625, "learning_rate": 7.755015075683834e-05, "loss": 0.0431, "step": 4894 }, { "epoch": 0.6294740451798191, "grad_norm": 0.1884765625, "learning_rate": 7.754171605017691e-05, "loss": 0.0461, "step": 4895 }, { "epoch": 0.6296026404903767, "grad_norm": 0.193359375, "learning_rate": 7.753328021819315e-05, "loss": 0.0412, "step": 4896 }, { "epoch": 0.6297312358009345, "grad_norm": 0.16796875, "learning_rate": 7.752484326123176e-05, "loss": 0.0415, "step": 4897 }, { "epoch": 0.6298598311114921, "grad_norm": 0.1767578125, "learning_rate": 7.751640517963746e-05, "loss": 0.0444, "step": 4898 }, { "epoch": 0.6299884264220498, "grad_norm": 0.216796875, "learning_rate": 7.750796597375502e-05, "loss": 0.0497, "step": 4899 }, { "epoch": 0.6301170217326075, "grad_norm": 0.1591796875, "learning_rate": 7.749952564392923e-05, "loss": 0.0394, "step": 4900 }, { "epoch": 0.6302456170431652, "grad_norm": 0.18359375, "learning_rate": 7.7491084190505e-05, "loss": 0.0382, "step": 4901 }, { "epoch": 0.6303742123537228, "grad_norm": 0.1484375, "learning_rate": 7.748264161382721e-05, "loss": 0.0354, "step": 4902 }, { "epoch": 0.6305028076642805, "grad_norm": 0.173828125, "learning_rate": 7.74741979142408e-05, "loss": 0.0409, "step": 4903 }, { "epoch": 0.6306314029748382, "grad_norm": 0.1640625, "learning_rate": 7.746575309209081e-05, "loss": 0.0429, "step": 4904 }, { "epoch": 0.6307599982853959, "grad_norm": 0.171875, "learning_rate": 7.745730714772225e-05, "loss": 0.0408, "step": 4905 }, { "epoch": 0.6308885935959535, "grad_norm": 0.1904296875, "learning_rate": 7.744886008148023e-05, "loss": 0.0537, "step": 4906 }, { "epoch": 0.6310171889065113, "grad_norm": 0.17578125, "learning_rate": 7.744041189370987e-05, "loss": 0.0373, "step": 4907 }, { "epoch": 0.6311457842170689, "grad_norm": 0.169921875, "learning_rate": 7.743196258475638e-05, "loss": 0.0368, "step": 4908 }, { "epoch": 0.6312743795276266, "grad_norm": 0.185546875, "learning_rate": 7.742351215496496e-05, "loss": 0.0405, "step": 4909 }, { "epoch": 0.6314029748381842, "grad_norm": 0.1640625, "learning_rate": 7.741506060468089e-05, "loss": 0.0358, "step": 4910 }, { "epoch": 0.631531570148742, "grad_norm": 0.162109375, "learning_rate": 7.740660793424952e-05, "loss": 0.0362, "step": 4911 }, { "epoch": 0.6316601654592996, "grad_norm": 0.1923828125, "learning_rate": 7.739815414401618e-05, "loss": 0.0487, "step": 4912 }, { "epoch": 0.6317887607698572, "grad_norm": 0.1796875, "learning_rate": 7.73896992343263e-05, "loss": 0.0483, "step": 4913 }, { "epoch": 0.6319173560804149, "grad_norm": 0.158203125, "learning_rate": 7.738124320552534e-05, "loss": 0.0368, "step": 4914 }, { "epoch": 0.6320459513909726, "grad_norm": 0.173828125, "learning_rate": 7.737278605795877e-05, "loss": 0.0409, "step": 4915 }, { "epoch": 0.6321745467015303, "grad_norm": 0.1796875, "learning_rate": 7.736432779197221e-05, "loss": 0.048, "step": 4916 }, { "epoch": 0.6323031420120879, "grad_norm": 0.1728515625, "learning_rate": 7.735586840791118e-05, "loss": 0.038, "step": 4917 }, { "epoch": 0.6324317373226457, "grad_norm": 0.16796875, "learning_rate": 7.734740790612136e-05, "loss": 0.041, "step": 4918 }, { "epoch": 0.6325603326332033, "grad_norm": 0.1708984375, "learning_rate": 7.733894628694843e-05, "loss": 0.043, "step": 4919 }, { "epoch": 0.632688927943761, "grad_norm": 0.1630859375, "learning_rate": 7.733048355073811e-05, "loss": 0.0431, "step": 4920 }, { "epoch": 0.6328175232543186, "grad_norm": 0.177734375, "learning_rate": 7.732201969783619e-05, "loss": 0.0414, "step": 4921 }, { "epoch": 0.6329461185648764, "grad_norm": 0.150390625, "learning_rate": 7.73135547285885e-05, "loss": 0.0302, "step": 4922 }, { "epoch": 0.633074713875434, "grad_norm": 0.181640625, "learning_rate": 7.73050886433409e-05, "loss": 0.0421, "step": 4923 }, { "epoch": 0.6332033091859917, "grad_norm": 0.1630859375, "learning_rate": 7.729662144243928e-05, "loss": 0.0369, "step": 4924 }, { "epoch": 0.6333319044965494, "grad_norm": 0.1748046875, "learning_rate": 7.728815312622964e-05, "loss": 0.0436, "step": 4925 }, { "epoch": 0.6334604998071071, "grad_norm": 0.1796875, "learning_rate": 7.727968369505798e-05, "loss": 0.0447, "step": 4926 }, { "epoch": 0.6335890951176647, "grad_norm": 0.19140625, "learning_rate": 7.727121314927033e-05, "loss": 0.0472, "step": 4927 }, { "epoch": 0.6337176904282223, "grad_norm": 0.1728515625, "learning_rate": 7.72627414892128e-05, "loss": 0.04, "step": 4928 }, { "epoch": 0.6338462857387801, "grad_norm": 0.17578125, "learning_rate": 7.725426871523153e-05, "loss": 0.0495, "step": 4929 }, { "epoch": 0.6339748810493377, "grad_norm": 0.171875, "learning_rate": 7.72457948276727e-05, "loss": 0.039, "step": 4930 }, { "epoch": 0.6341034763598954, "grad_norm": 0.1943359375, "learning_rate": 7.723731982688255e-05, "loss": 0.0431, "step": 4931 }, { "epoch": 0.634232071670453, "grad_norm": 0.1708984375, "learning_rate": 7.722884371320737e-05, "loss": 0.0427, "step": 4932 }, { "epoch": 0.6343606669810108, "grad_norm": 0.166015625, "learning_rate": 7.722036648699346e-05, "loss": 0.0393, "step": 4933 }, { "epoch": 0.6344892622915684, "grad_norm": 0.1748046875, "learning_rate": 7.72118881485872e-05, "loss": 0.0466, "step": 4934 }, { "epoch": 0.6346178576021261, "grad_norm": 0.263671875, "learning_rate": 7.720340869833502e-05, "loss": 0.0402, "step": 4935 }, { "epoch": 0.6347464529126838, "grad_norm": 0.1884765625, "learning_rate": 7.719492813658336e-05, "loss": 0.056, "step": 4936 }, { "epoch": 0.6348750482232415, "grad_norm": 0.1708984375, "learning_rate": 7.718644646367873e-05, "loss": 0.0489, "step": 4937 }, { "epoch": 0.6350036435337991, "grad_norm": 0.1884765625, "learning_rate": 7.717796367996768e-05, "loss": 0.0556, "step": 4938 }, { "epoch": 0.6351322388443568, "grad_norm": 0.1904296875, "learning_rate": 7.716947978579683e-05, "loss": 0.0441, "step": 4939 }, { "epoch": 0.6352608341549145, "grad_norm": 0.1884765625, "learning_rate": 7.716099478151279e-05, "loss": 0.0431, "step": 4940 }, { "epoch": 0.6353894294654722, "grad_norm": 0.193359375, "learning_rate": 7.715250866746226e-05, "loss": 0.0463, "step": 4941 }, { "epoch": 0.6355180247760298, "grad_norm": 0.18359375, "learning_rate": 7.714402144399197e-05, "loss": 0.0423, "step": 4942 }, { "epoch": 0.6356466200865875, "grad_norm": 0.1591796875, "learning_rate": 7.713553311144872e-05, "loss": 0.0363, "step": 4943 }, { "epoch": 0.6357752153971452, "grad_norm": 0.1748046875, "learning_rate": 7.712704367017928e-05, "loss": 0.0467, "step": 4944 }, { "epoch": 0.6359038107077029, "grad_norm": 0.1513671875, "learning_rate": 7.71185531205306e-05, "loss": 0.0341, "step": 4945 }, { "epoch": 0.6360324060182605, "grad_norm": 0.1845703125, "learning_rate": 7.711006146284951e-05, "loss": 0.0465, "step": 4946 }, { "epoch": 0.6361610013288183, "grad_norm": 0.1728515625, "learning_rate": 7.710156869748302e-05, "loss": 0.0406, "step": 4947 }, { "epoch": 0.6362895966393759, "grad_norm": 0.1865234375, "learning_rate": 7.709307482477812e-05, "loss": 0.0452, "step": 4948 }, { "epoch": 0.6364181919499335, "grad_norm": 0.185546875, "learning_rate": 7.708457984508184e-05, "loss": 0.0503, "step": 4949 }, { "epoch": 0.6365467872604912, "grad_norm": 0.16015625, "learning_rate": 7.707608375874132e-05, "loss": 0.0372, "step": 4950 }, { "epoch": 0.6366753825710489, "grad_norm": 0.1826171875, "learning_rate": 7.706758656610367e-05, "loss": 0.0399, "step": 4951 }, { "epoch": 0.6368039778816066, "grad_norm": 0.166015625, "learning_rate": 7.70590882675161e-05, "loss": 0.0394, "step": 4952 }, { "epoch": 0.6369325731921642, "grad_norm": 0.162109375, "learning_rate": 7.705058886332579e-05, "loss": 0.0425, "step": 4953 }, { "epoch": 0.637061168502722, "grad_norm": 0.1533203125, "learning_rate": 7.704208835388006e-05, "loss": 0.035, "step": 4954 }, { "epoch": 0.6371897638132796, "grad_norm": 0.1962890625, "learning_rate": 7.70335867395262e-05, "loss": 0.0423, "step": 4955 }, { "epoch": 0.6373183591238373, "grad_norm": 0.1767578125, "learning_rate": 7.702508402061162e-05, "loss": 0.0443, "step": 4956 }, { "epoch": 0.6374469544343949, "grad_norm": 0.1689453125, "learning_rate": 7.70165801974837e-05, "loss": 0.0477, "step": 4957 }, { "epoch": 0.6375755497449527, "grad_norm": 0.1826171875, "learning_rate": 7.700807527048989e-05, "loss": 0.0455, "step": 4958 }, { "epoch": 0.6377041450555103, "grad_norm": 0.205078125, "learning_rate": 7.69995692399777e-05, "loss": 0.0401, "step": 4959 }, { "epoch": 0.637832740366068, "grad_norm": 0.1630859375, "learning_rate": 7.699106210629471e-05, "loss": 0.0402, "step": 4960 }, { "epoch": 0.6379613356766256, "grad_norm": 0.1689453125, "learning_rate": 7.698255386978846e-05, "loss": 0.0411, "step": 4961 }, { "epoch": 0.6380899309871834, "grad_norm": 0.189453125, "learning_rate": 7.69740445308066e-05, "loss": 0.0475, "step": 4962 }, { "epoch": 0.638218526297741, "grad_norm": 0.1728515625, "learning_rate": 7.696553408969683e-05, "loss": 0.0457, "step": 4963 }, { "epoch": 0.6383471216082987, "grad_norm": 0.1787109375, "learning_rate": 7.695702254680687e-05, "loss": 0.0415, "step": 4964 }, { "epoch": 0.6384757169188564, "grad_norm": 0.1748046875, "learning_rate": 7.694850990248448e-05, "loss": 0.0477, "step": 4965 }, { "epoch": 0.638604312229414, "grad_norm": 0.169921875, "learning_rate": 7.693999615707747e-05, "loss": 0.0427, "step": 4966 }, { "epoch": 0.6387329075399717, "grad_norm": 0.1689453125, "learning_rate": 7.693148131093373e-05, "loss": 0.0426, "step": 4967 }, { "epoch": 0.6388615028505293, "grad_norm": 0.166015625, "learning_rate": 7.692296536440116e-05, "loss": 0.0407, "step": 4968 }, { "epoch": 0.6389900981610871, "grad_norm": 0.169921875, "learning_rate": 7.69144483178277e-05, "loss": 0.0443, "step": 4969 }, { "epoch": 0.6391186934716447, "grad_norm": 0.1806640625, "learning_rate": 7.690593017156135e-05, "loss": 0.0456, "step": 4970 }, { "epoch": 0.6392472887822024, "grad_norm": 0.1796875, "learning_rate": 7.689741092595015e-05, "loss": 0.038, "step": 4971 }, { "epoch": 0.6393758840927601, "grad_norm": 0.1630859375, "learning_rate": 7.688889058134219e-05, "loss": 0.0393, "step": 4972 }, { "epoch": 0.6395044794033178, "grad_norm": 0.1767578125, "learning_rate": 7.68803691380856e-05, "loss": 0.0427, "step": 4973 }, { "epoch": 0.6396330747138754, "grad_norm": 0.19140625, "learning_rate": 7.687184659652855e-05, "loss": 0.046, "step": 4974 }, { "epoch": 0.6397616700244331, "grad_norm": 0.177734375, "learning_rate": 7.686332295701927e-05, "loss": 0.0417, "step": 4975 }, { "epoch": 0.6398902653349908, "grad_norm": 0.17578125, "learning_rate": 7.685479821990602e-05, "loss": 0.0398, "step": 4976 }, { "epoch": 0.6400188606455485, "grad_norm": 0.1884765625, "learning_rate": 7.684627238553713e-05, "loss": 0.054, "step": 4977 }, { "epoch": 0.6401474559561061, "grad_norm": 0.16015625, "learning_rate": 7.683774545426092e-05, "loss": 0.0349, "step": 4978 }, { "epoch": 0.6402760512666638, "grad_norm": 0.173828125, "learning_rate": 7.682921742642581e-05, "loss": 0.0367, "step": 4979 }, { "epoch": 0.6404046465772215, "grad_norm": 0.173828125, "learning_rate": 7.682068830238026e-05, "loss": 0.0384, "step": 4980 }, { "epoch": 0.6405332418877792, "grad_norm": 0.1689453125, "learning_rate": 7.681215808247273e-05, "loss": 0.0414, "step": 4981 }, { "epoch": 0.6406618371983368, "grad_norm": 0.1982421875, "learning_rate": 7.68036267670518e-05, "loss": 0.0486, "step": 4982 }, { "epoch": 0.6407904325088946, "grad_norm": 0.16796875, "learning_rate": 7.679509435646599e-05, "loss": 0.0409, "step": 4983 }, { "epoch": 0.6409190278194522, "grad_norm": 0.173828125, "learning_rate": 7.678656085106396e-05, "loss": 0.0389, "step": 4984 }, { "epoch": 0.6410476231300098, "grad_norm": 0.1943359375, "learning_rate": 7.677802625119436e-05, "loss": 0.044, "step": 4985 }, { "epoch": 0.6411762184405675, "grad_norm": 0.1611328125, "learning_rate": 7.676949055720595e-05, "loss": 0.0414, "step": 4986 }, { "epoch": 0.6413048137511252, "grad_norm": 0.1923828125, "learning_rate": 7.676095376944744e-05, "loss": 0.0549, "step": 4987 }, { "epoch": 0.6414334090616829, "grad_norm": 0.15234375, "learning_rate": 7.675241588826763e-05, "loss": 0.0314, "step": 4988 }, { "epoch": 0.6415620043722405, "grad_norm": 0.1728515625, "learning_rate": 7.674387691401541e-05, "loss": 0.0443, "step": 4989 }, { "epoch": 0.6416905996827982, "grad_norm": 0.177734375, "learning_rate": 7.673533684703964e-05, "loss": 0.041, "step": 4990 }, { "epoch": 0.6418191949933559, "grad_norm": 0.1708984375, "learning_rate": 7.672679568768928e-05, "loss": 0.044, "step": 4991 }, { "epoch": 0.6419477903039136, "grad_norm": 0.171875, "learning_rate": 7.671825343631327e-05, "loss": 0.0445, "step": 4992 }, { "epoch": 0.6420763856144712, "grad_norm": 0.17578125, "learning_rate": 7.670971009326068e-05, "loss": 0.0486, "step": 4993 }, { "epoch": 0.642204980925029, "grad_norm": 0.17578125, "learning_rate": 7.670116565888057e-05, "loss": 0.0445, "step": 4994 }, { "epoch": 0.6423335762355866, "grad_norm": 0.169921875, "learning_rate": 7.669262013352202e-05, "loss": 0.0456, "step": 4995 }, { "epoch": 0.6424621715461443, "grad_norm": 0.1748046875, "learning_rate": 7.668407351753426e-05, "loss": 0.0479, "step": 4996 }, { "epoch": 0.6425907668567019, "grad_norm": 0.16796875, "learning_rate": 7.667552581126644e-05, "loss": 0.0419, "step": 4997 }, { "epoch": 0.6427193621672597, "grad_norm": 0.17578125, "learning_rate": 7.666697701506784e-05, "loss": 0.042, "step": 4998 }, { "epoch": 0.6428479574778173, "grad_norm": 0.1474609375, "learning_rate": 7.665842712928772e-05, "loss": 0.0342, "step": 4999 }, { "epoch": 0.642976552788375, "grad_norm": 0.1748046875, "learning_rate": 7.664987615427544e-05, "loss": 0.0455, "step": 5000 }, { "epoch": 0.642976552788375, "eval_loss": 0.04285893589258194, "eval_runtime": 1043.6235, "eval_samples_per_second": 94.12, "eval_steps_per_second": 1.177, "step": 5000 }, { "epoch": 0.6431051480989327, "grad_norm": 0.171875, "learning_rate": 7.664132409038039e-05, "loss": 0.0489, "step": 5001 }, { "epoch": 0.6432337434094904, "grad_norm": 0.1767578125, "learning_rate": 7.663277093795195e-05, "loss": 0.0404, "step": 5002 }, { "epoch": 0.643362338720048, "grad_norm": 0.171875, "learning_rate": 7.662421669733968e-05, "loss": 0.052, "step": 5003 }, { "epoch": 0.6434909340306056, "grad_norm": 0.1982421875, "learning_rate": 7.661566136889303e-05, "loss": 0.0549, "step": 5004 }, { "epoch": 0.6436195293411634, "grad_norm": 0.1923828125, "learning_rate": 7.660710495296154e-05, "loss": 0.0478, "step": 5005 }, { "epoch": 0.643748124651721, "grad_norm": 0.1796875, "learning_rate": 7.659854744989489e-05, "loss": 0.0438, "step": 5006 }, { "epoch": 0.6438767199622787, "grad_norm": 0.177734375, "learning_rate": 7.658998886004269e-05, "loss": 0.0449, "step": 5007 }, { "epoch": 0.6440053152728363, "grad_norm": 0.1669921875, "learning_rate": 7.658142918375462e-05, "loss": 0.0341, "step": 5008 }, { "epoch": 0.6441339105833941, "grad_norm": 0.1875, "learning_rate": 7.657286842138043e-05, "loss": 0.0435, "step": 5009 }, { "epoch": 0.6442625058939517, "grad_norm": 0.1611328125, "learning_rate": 7.656430657326991e-05, "loss": 0.0346, "step": 5010 }, { "epoch": 0.6443911012045094, "grad_norm": 0.1728515625, "learning_rate": 7.65557436397729e-05, "loss": 0.0431, "step": 5011 }, { "epoch": 0.6445196965150671, "grad_norm": 0.169921875, "learning_rate": 7.654717962123925e-05, "loss": 0.0392, "step": 5012 }, { "epoch": 0.6446482918256248, "grad_norm": 0.1767578125, "learning_rate": 7.653861451801887e-05, "loss": 0.0418, "step": 5013 }, { "epoch": 0.6447768871361824, "grad_norm": 0.1826171875, "learning_rate": 7.653004833046173e-05, "loss": 0.043, "step": 5014 }, { "epoch": 0.6449054824467401, "grad_norm": 0.1630859375, "learning_rate": 7.652148105891783e-05, "loss": 0.0397, "step": 5015 }, { "epoch": 0.6450340777572978, "grad_norm": 0.162109375, "learning_rate": 7.651291270373724e-05, "loss": 0.0387, "step": 5016 }, { "epoch": 0.6451626730678555, "grad_norm": 0.1845703125, "learning_rate": 7.650434326527002e-05, "loss": 0.0514, "step": 5017 }, { "epoch": 0.6452912683784131, "grad_norm": 0.173828125, "learning_rate": 7.649577274386634e-05, "loss": 0.0434, "step": 5018 }, { "epoch": 0.6454198636889709, "grad_norm": 0.185546875, "learning_rate": 7.648720113987634e-05, "loss": 0.0504, "step": 5019 }, { "epoch": 0.6455484589995285, "grad_norm": 0.1708984375, "learning_rate": 7.64786284536503e-05, "loss": 0.0412, "step": 5020 }, { "epoch": 0.6456770543100862, "grad_norm": 0.1728515625, "learning_rate": 7.647005468553842e-05, "loss": 0.0383, "step": 5021 }, { "epoch": 0.6458056496206438, "grad_norm": 0.1875, "learning_rate": 7.64614798358911e-05, "loss": 0.0501, "step": 5022 }, { "epoch": 0.6459342449312016, "grad_norm": 0.1689453125, "learning_rate": 7.645290390505864e-05, "loss": 0.0394, "step": 5023 }, { "epoch": 0.6460628402417592, "grad_norm": 0.171875, "learning_rate": 7.644432689339145e-05, "loss": 0.0426, "step": 5024 }, { "epoch": 0.6461914355523168, "grad_norm": 0.234375, "learning_rate": 7.643574880123998e-05, "loss": 0.0444, "step": 5025 }, { "epoch": 0.6463200308628745, "grad_norm": 0.1865234375, "learning_rate": 7.642716962895473e-05, "loss": 0.0473, "step": 5026 }, { "epoch": 0.6464486261734322, "grad_norm": 0.1767578125, "learning_rate": 7.641858937688623e-05, "loss": 0.0494, "step": 5027 }, { "epoch": 0.6465772214839899, "grad_norm": 0.1767578125, "learning_rate": 7.641000804538505e-05, "loss": 0.0432, "step": 5028 }, { "epoch": 0.6467058167945475, "grad_norm": 0.1875, "learning_rate": 7.640142563480182e-05, "loss": 0.0485, "step": 5029 }, { "epoch": 0.6468344121051053, "grad_norm": 0.1689453125, "learning_rate": 7.639284214548721e-05, "loss": 0.0393, "step": 5030 }, { "epoch": 0.6469630074156629, "grad_norm": 0.169921875, "learning_rate": 7.638425757779192e-05, "loss": 0.0404, "step": 5031 }, { "epoch": 0.6470916027262206, "grad_norm": 0.1708984375, "learning_rate": 7.637567193206672e-05, "loss": 0.0433, "step": 5032 }, { "epoch": 0.6472201980367782, "grad_norm": 0.169921875, "learning_rate": 7.63670852086624e-05, "loss": 0.0412, "step": 5033 }, { "epoch": 0.647348793347336, "grad_norm": 0.1630859375, "learning_rate": 7.635849740792983e-05, "loss": 0.0366, "step": 5034 }, { "epoch": 0.6474773886578936, "grad_norm": 0.2177734375, "learning_rate": 7.634990853021986e-05, "loss": 0.0592, "step": 5035 }, { "epoch": 0.6476059839684513, "grad_norm": 0.1640625, "learning_rate": 7.634131857588343e-05, "loss": 0.0373, "step": 5036 }, { "epoch": 0.6477345792790089, "grad_norm": 0.2021484375, "learning_rate": 7.633272754527153e-05, "loss": 0.0516, "step": 5037 }, { "epoch": 0.6478631745895667, "grad_norm": 0.19140625, "learning_rate": 7.632413543873516e-05, "loss": 0.0468, "step": 5038 }, { "epoch": 0.6479917699001243, "grad_norm": 0.173828125, "learning_rate": 7.631554225662542e-05, "loss": 0.0453, "step": 5039 }, { "epoch": 0.648120365210682, "grad_norm": 0.1982421875, "learning_rate": 7.630694799929336e-05, "loss": 0.0506, "step": 5040 }, { "epoch": 0.6482489605212397, "grad_norm": 0.1787109375, "learning_rate": 7.629835266709019e-05, "loss": 0.0468, "step": 5041 }, { "epoch": 0.6483775558317973, "grad_norm": 0.1767578125, "learning_rate": 7.628975626036706e-05, "loss": 0.0382, "step": 5042 }, { "epoch": 0.648506151142355, "grad_norm": 0.1953125, "learning_rate": 7.628115877947524e-05, "loss": 0.0495, "step": 5043 }, { "epoch": 0.6486347464529126, "grad_norm": 0.193359375, "learning_rate": 7.627256022476598e-05, "loss": 0.0439, "step": 5044 }, { "epoch": 0.6487633417634704, "grad_norm": 0.19921875, "learning_rate": 7.626396059659065e-05, "loss": 0.0536, "step": 5045 }, { "epoch": 0.648891937074028, "grad_norm": 0.1796875, "learning_rate": 7.62553598953006e-05, "loss": 0.0408, "step": 5046 }, { "epoch": 0.6490205323845857, "grad_norm": 0.173828125, "learning_rate": 7.624675812124725e-05, "loss": 0.0477, "step": 5047 }, { "epoch": 0.6491491276951434, "grad_norm": 0.19140625, "learning_rate": 7.623815527478202e-05, "loss": 0.0526, "step": 5048 }, { "epoch": 0.6492777230057011, "grad_norm": 0.185546875, "learning_rate": 7.622955135625647e-05, "loss": 0.0414, "step": 5049 }, { "epoch": 0.6494063183162587, "grad_norm": 0.16796875, "learning_rate": 7.62209463660221e-05, "loss": 0.0432, "step": 5050 }, { "epoch": 0.6495349136268164, "grad_norm": 0.169921875, "learning_rate": 7.621234030443053e-05, "loss": 0.0484, "step": 5051 }, { "epoch": 0.6496635089373741, "grad_norm": 0.1533203125, "learning_rate": 7.620373317183339e-05, "loss": 0.0344, "step": 5052 }, { "epoch": 0.6497921042479318, "grad_norm": 0.18359375, "learning_rate": 7.619512496858233e-05, "loss": 0.0461, "step": 5053 }, { "epoch": 0.6499206995584894, "grad_norm": 0.1943359375, "learning_rate": 7.61865156950291e-05, "loss": 0.0466, "step": 5054 }, { "epoch": 0.6500492948690471, "grad_norm": 0.1767578125, "learning_rate": 7.617790535152546e-05, "loss": 0.0458, "step": 5055 }, { "epoch": 0.6501778901796048, "grad_norm": 0.16796875, "learning_rate": 7.616929393842323e-05, "loss": 0.0386, "step": 5056 }, { "epoch": 0.6503064854901625, "grad_norm": 0.1845703125, "learning_rate": 7.616068145607422e-05, "loss": 0.043, "step": 5057 }, { "epoch": 0.6504350808007201, "grad_norm": 0.1689453125, "learning_rate": 7.615206790483037e-05, "loss": 0.0449, "step": 5058 }, { "epoch": 0.6505636761112779, "grad_norm": 0.158203125, "learning_rate": 7.61434532850436e-05, "loss": 0.0355, "step": 5059 }, { "epoch": 0.6506922714218355, "grad_norm": 0.15625, "learning_rate": 7.61348375970659e-05, "loss": 0.0393, "step": 5060 }, { "epoch": 0.6508208667323931, "grad_norm": 0.189453125, "learning_rate": 7.612622084124928e-05, "loss": 0.0569, "step": 5061 }, { "epoch": 0.6509494620429508, "grad_norm": 0.1572265625, "learning_rate": 7.611760301794584e-05, "loss": 0.0318, "step": 5062 }, { "epoch": 0.6510780573535085, "grad_norm": 0.1865234375, "learning_rate": 7.610898412750766e-05, "loss": 0.0473, "step": 5063 }, { "epoch": 0.6512066526640662, "grad_norm": 0.1806640625, "learning_rate": 7.610036417028694e-05, "loss": 0.0435, "step": 5064 }, { "epoch": 0.6513352479746238, "grad_norm": 0.181640625, "learning_rate": 7.609174314663584e-05, "loss": 0.0489, "step": 5065 }, { "epoch": 0.6514638432851816, "grad_norm": 0.1689453125, "learning_rate": 7.608312105690663e-05, "loss": 0.0405, "step": 5066 }, { "epoch": 0.6515924385957392, "grad_norm": 0.185546875, "learning_rate": 7.607449790145159e-05, "loss": 0.0422, "step": 5067 }, { "epoch": 0.6517210339062969, "grad_norm": 0.1875, "learning_rate": 7.606587368062303e-05, "loss": 0.0466, "step": 5068 }, { "epoch": 0.6518496292168545, "grad_norm": 0.173828125, "learning_rate": 7.605724839477337e-05, "loss": 0.0455, "step": 5069 }, { "epoch": 0.6519782245274123, "grad_norm": 0.162109375, "learning_rate": 7.6048622044255e-05, "loss": 0.0357, "step": 5070 }, { "epoch": 0.6521068198379699, "grad_norm": 0.1796875, "learning_rate": 7.603999462942039e-05, "loss": 0.0417, "step": 5071 }, { "epoch": 0.6522354151485276, "grad_norm": 0.189453125, "learning_rate": 7.603136615062205e-05, "loss": 0.0423, "step": 5072 }, { "epoch": 0.6523640104590852, "grad_norm": 0.169921875, "learning_rate": 7.602273660821253e-05, "loss": 0.0368, "step": 5073 }, { "epoch": 0.652492605769643, "grad_norm": 0.1650390625, "learning_rate": 7.601410600254441e-05, "loss": 0.0361, "step": 5074 }, { "epoch": 0.6526212010802006, "grad_norm": 0.1650390625, "learning_rate": 7.600547433397035e-05, "loss": 0.0359, "step": 5075 }, { "epoch": 0.6527497963907583, "grad_norm": 0.18359375, "learning_rate": 7.5996841602843e-05, "loss": 0.0431, "step": 5076 }, { "epoch": 0.652878391701316, "grad_norm": 0.19140625, "learning_rate": 7.59882078095151e-05, "loss": 0.0502, "step": 5077 }, { "epoch": 0.6530069870118737, "grad_norm": 0.189453125, "learning_rate": 7.597957295433942e-05, "loss": 0.0499, "step": 5078 }, { "epoch": 0.6531355823224313, "grad_norm": 0.1552734375, "learning_rate": 7.597093703766875e-05, "loss": 0.0365, "step": 5079 }, { "epoch": 0.6532641776329889, "grad_norm": 0.1787109375, "learning_rate": 7.596230005985598e-05, "loss": 0.0503, "step": 5080 }, { "epoch": 0.6533927729435467, "grad_norm": 0.1767578125, "learning_rate": 7.595366202125396e-05, "loss": 0.05, "step": 5081 }, { "epoch": 0.6535213682541043, "grad_norm": 0.205078125, "learning_rate": 7.594502292221569e-05, "loss": 0.0512, "step": 5082 }, { "epoch": 0.653649963564662, "grad_norm": 0.18359375, "learning_rate": 7.593638276309409e-05, "loss": 0.0428, "step": 5083 }, { "epoch": 0.6537785588752196, "grad_norm": 0.169921875, "learning_rate": 7.592774154424222e-05, "loss": 0.0448, "step": 5084 }, { "epoch": 0.6539071541857774, "grad_norm": 0.193359375, "learning_rate": 7.591909926601316e-05, "loss": 0.0533, "step": 5085 }, { "epoch": 0.654035749496335, "grad_norm": 0.17578125, "learning_rate": 7.591045592875999e-05, "loss": 0.044, "step": 5086 }, { "epoch": 0.6541643448068927, "grad_norm": 0.193359375, "learning_rate": 7.59018115328359e-05, "loss": 0.0541, "step": 5087 }, { "epoch": 0.6542929401174504, "grad_norm": 0.17578125, "learning_rate": 7.589316607859407e-05, "loss": 0.0473, "step": 5088 }, { "epoch": 0.6544215354280081, "grad_norm": 0.2080078125, "learning_rate": 7.588451956638776e-05, "loss": 0.0431, "step": 5089 }, { "epoch": 0.6545501307385657, "grad_norm": 0.23046875, "learning_rate": 7.587587199657022e-05, "loss": 0.0513, "step": 5090 }, { "epoch": 0.6546787260491234, "grad_norm": 0.1611328125, "learning_rate": 7.586722336949483e-05, "loss": 0.0351, "step": 5091 }, { "epoch": 0.6548073213596811, "grad_norm": 0.1728515625, "learning_rate": 7.585857368551492e-05, "loss": 0.0472, "step": 5092 }, { "epoch": 0.6549359166702388, "grad_norm": 0.16796875, "learning_rate": 7.584992294498392e-05, "loss": 0.0422, "step": 5093 }, { "epoch": 0.6550645119807964, "grad_norm": 0.16015625, "learning_rate": 7.584127114825532e-05, "loss": 0.0406, "step": 5094 }, { "epoch": 0.6551931072913542, "grad_norm": 0.1630859375, "learning_rate": 7.583261829568257e-05, "loss": 0.0376, "step": 5095 }, { "epoch": 0.6553217026019118, "grad_norm": 0.1923828125, "learning_rate": 7.582396438761926e-05, "loss": 0.0442, "step": 5096 }, { "epoch": 0.6554502979124694, "grad_norm": 0.1845703125, "learning_rate": 7.581530942441894e-05, "loss": 0.049, "step": 5097 }, { "epoch": 0.6555788932230271, "grad_norm": 0.1748046875, "learning_rate": 7.580665340643527e-05, "loss": 0.0442, "step": 5098 }, { "epoch": 0.6557074885335848, "grad_norm": 0.173828125, "learning_rate": 7.579799633402191e-05, "loss": 0.0436, "step": 5099 }, { "epoch": 0.6558360838441425, "grad_norm": 0.162109375, "learning_rate": 7.57893382075326e-05, "loss": 0.0337, "step": 5100 }, { "epoch": 0.6559646791547001, "grad_norm": 0.1640625, "learning_rate": 7.578067902732106e-05, "loss": 0.0453, "step": 5101 }, { "epoch": 0.6560932744652578, "grad_norm": 0.1630859375, "learning_rate": 7.577201879374115e-05, "loss": 0.0397, "step": 5102 }, { "epoch": 0.6562218697758155, "grad_norm": 0.1728515625, "learning_rate": 7.576335750714665e-05, "loss": 0.0435, "step": 5103 }, { "epoch": 0.6563504650863732, "grad_norm": 0.15625, "learning_rate": 7.57546951678915e-05, "loss": 0.0374, "step": 5104 }, { "epoch": 0.6564790603969308, "grad_norm": 0.189453125, "learning_rate": 7.574603177632962e-05, "loss": 0.0447, "step": 5105 }, { "epoch": 0.6566076557074886, "grad_norm": 0.1748046875, "learning_rate": 7.5737367332815e-05, "loss": 0.0373, "step": 5106 }, { "epoch": 0.6567362510180462, "grad_norm": 0.1650390625, "learning_rate": 7.572870183770162e-05, "loss": 0.0438, "step": 5107 }, { "epoch": 0.6568648463286039, "grad_norm": 0.173828125, "learning_rate": 7.572003529134358e-05, "loss": 0.0423, "step": 5108 }, { "epoch": 0.6569934416391615, "grad_norm": 0.2265625, "learning_rate": 7.571136769409495e-05, "loss": 0.0549, "step": 5109 }, { "epoch": 0.6571220369497193, "grad_norm": 0.1787109375, "learning_rate": 7.570269904630991e-05, "loss": 0.0417, "step": 5110 }, { "epoch": 0.6572506322602769, "grad_norm": 0.173828125, "learning_rate": 7.569402934834264e-05, "loss": 0.0449, "step": 5111 }, { "epoch": 0.6573792275708346, "grad_norm": 0.1376953125, "learning_rate": 7.568535860054737e-05, "loss": 0.0342, "step": 5112 }, { "epoch": 0.6575078228813923, "grad_norm": 0.197265625, "learning_rate": 7.567668680327839e-05, "loss": 0.0505, "step": 5113 }, { "epoch": 0.65763641819195, "grad_norm": 0.1845703125, "learning_rate": 7.566801395689e-05, "loss": 0.0432, "step": 5114 }, { "epoch": 0.6577650135025076, "grad_norm": 0.1806640625, "learning_rate": 7.565934006173657e-05, "loss": 0.0435, "step": 5115 }, { "epoch": 0.6578936088130652, "grad_norm": 0.1728515625, "learning_rate": 7.56506651181725e-05, "loss": 0.0367, "step": 5116 }, { "epoch": 0.658022204123623, "grad_norm": 0.173828125, "learning_rate": 7.564198912655227e-05, "loss": 0.0405, "step": 5117 }, { "epoch": 0.6581507994341806, "grad_norm": 0.1884765625, "learning_rate": 7.563331208723033e-05, "loss": 0.0405, "step": 5118 }, { "epoch": 0.6582793947447383, "grad_norm": 0.1982421875, "learning_rate": 7.562463400056122e-05, "loss": 0.0465, "step": 5119 }, { "epoch": 0.6584079900552959, "grad_norm": 0.1669921875, "learning_rate": 7.561595486689954e-05, "loss": 0.0425, "step": 5120 }, { "epoch": 0.6585365853658537, "grad_norm": 0.20703125, "learning_rate": 7.560727468659988e-05, "loss": 0.0607, "step": 5121 }, { "epoch": 0.6586651806764113, "grad_norm": 0.16796875, "learning_rate": 7.559859346001693e-05, "loss": 0.0426, "step": 5122 }, { "epoch": 0.658793775986969, "grad_norm": 0.1943359375, "learning_rate": 7.558991118750538e-05, "loss": 0.0432, "step": 5123 }, { "epoch": 0.6589223712975267, "grad_norm": 0.181640625, "learning_rate": 7.558122786941997e-05, "loss": 0.0434, "step": 5124 }, { "epoch": 0.6590509666080844, "grad_norm": 0.158203125, "learning_rate": 7.557254350611551e-05, "loss": 0.0343, "step": 5125 }, { "epoch": 0.659179561918642, "grad_norm": 0.1806640625, "learning_rate": 7.556385809794682e-05, "loss": 0.0463, "step": 5126 }, { "epoch": 0.6593081572291997, "grad_norm": 0.1923828125, "learning_rate": 7.555517164526878e-05, "loss": 0.0464, "step": 5127 }, { "epoch": 0.6594367525397574, "grad_norm": 0.2080078125, "learning_rate": 7.554648414843628e-05, "loss": 0.0387, "step": 5128 }, { "epoch": 0.6595653478503151, "grad_norm": 0.17578125, "learning_rate": 7.553779560780433e-05, "loss": 0.0426, "step": 5129 }, { "epoch": 0.6596939431608727, "grad_norm": 0.1904296875, "learning_rate": 7.552910602372793e-05, "loss": 0.0427, "step": 5130 }, { "epoch": 0.6598225384714304, "grad_norm": 0.2119140625, "learning_rate": 7.552041539656207e-05, "loss": 0.053, "step": 5131 }, { "epoch": 0.6599511337819881, "grad_norm": 0.1552734375, "learning_rate": 7.551172372666191e-05, "loss": 0.0323, "step": 5132 }, { "epoch": 0.6600797290925458, "grad_norm": 0.201171875, "learning_rate": 7.550303101438252e-05, "loss": 0.0432, "step": 5133 }, { "epoch": 0.6602083244031034, "grad_norm": 0.166015625, "learning_rate": 7.549433726007912e-05, "loss": 0.0354, "step": 5134 }, { "epoch": 0.6603369197136612, "grad_norm": 0.1689453125, "learning_rate": 7.54856424641069e-05, "loss": 0.0371, "step": 5135 }, { "epoch": 0.6604655150242188, "grad_norm": 0.1826171875, "learning_rate": 7.547694662682113e-05, "loss": 0.0477, "step": 5136 }, { "epoch": 0.6605941103347764, "grad_norm": 0.1708984375, "learning_rate": 7.546824974857709e-05, "loss": 0.0385, "step": 5137 }, { "epoch": 0.6607227056453341, "grad_norm": 0.16015625, "learning_rate": 7.545955182973016e-05, "loss": 0.0387, "step": 5138 }, { "epoch": 0.6608513009558918, "grad_norm": 0.21875, "learning_rate": 7.545085287063572e-05, "loss": 0.0559, "step": 5139 }, { "epoch": 0.6609798962664495, "grad_norm": 0.1982421875, "learning_rate": 7.544215287164918e-05, "loss": 0.0498, "step": 5140 }, { "epoch": 0.6611084915770071, "grad_norm": 0.177734375, "learning_rate": 7.543345183312602e-05, "loss": 0.0432, "step": 5141 }, { "epoch": 0.6612370868875649, "grad_norm": 0.1875, "learning_rate": 7.542474975542176e-05, "loss": 0.0523, "step": 5142 }, { "epoch": 0.6613656821981225, "grad_norm": 0.173828125, "learning_rate": 7.541604663889194e-05, "loss": 0.0476, "step": 5143 }, { "epoch": 0.6614942775086802, "grad_norm": 0.1533203125, "learning_rate": 7.540734248389218e-05, "loss": 0.0337, "step": 5144 }, { "epoch": 0.6616228728192378, "grad_norm": 0.1640625, "learning_rate": 7.539863729077809e-05, "loss": 0.0366, "step": 5145 }, { "epoch": 0.6617514681297956, "grad_norm": 0.18359375, "learning_rate": 7.538993105990542e-05, "loss": 0.0493, "step": 5146 }, { "epoch": 0.6618800634403532, "grad_norm": 0.1806640625, "learning_rate": 7.538122379162982e-05, "loss": 0.0466, "step": 5147 }, { "epoch": 0.6620086587509109, "grad_norm": 0.1630859375, "learning_rate": 7.53725154863071e-05, "loss": 0.0407, "step": 5148 }, { "epoch": 0.6621372540614685, "grad_norm": 0.181640625, "learning_rate": 7.536380614429307e-05, "loss": 0.0477, "step": 5149 }, { "epoch": 0.6622658493720263, "grad_norm": 0.1962890625, "learning_rate": 7.535509576594357e-05, "loss": 0.0484, "step": 5150 }, { "epoch": 0.6623944446825839, "grad_norm": 0.158203125, "learning_rate": 7.534638435161452e-05, "loss": 0.0375, "step": 5151 }, { "epoch": 0.6625230399931415, "grad_norm": 0.18359375, "learning_rate": 7.53376719016618e-05, "loss": 0.0345, "step": 5152 }, { "epoch": 0.6626516353036993, "grad_norm": 0.1708984375, "learning_rate": 7.532895841644147e-05, "loss": 0.0404, "step": 5153 }, { "epoch": 0.662780230614257, "grad_norm": 0.1650390625, "learning_rate": 7.532024389630951e-05, "loss": 0.0372, "step": 5154 }, { "epoch": 0.6629088259248146, "grad_norm": 0.1552734375, "learning_rate": 7.531152834162198e-05, "loss": 0.0375, "step": 5155 }, { "epoch": 0.6630374212353722, "grad_norm": 0.173828125, "learning_rate": 7.530281175273501e-05, "loss": 0.0466, "step": 5156 }, { "epoch": 0.66316601654593, "grad_norm": 0.15234375, "learning_rate": 7.529409413000472e-05, "loss": 0.0329, "step": 5157 }, { "epoch": 0.6632946118564876, "grad_norm": 0.1904296875, "learning_rate": 7.528537547378734e-05, "loss": 0.0458, "step": 5158 }, { "epoch": 0.6634232071670453, "grad_norm": 0.1748046875, "learning_rate": 7.527665578443906e-05, "loss": 0.0454, "step": 5159 }, { "epoch": 0.663551802477603, "grad_norm": 0.1826171875, "learning_rate": 7.526793506231617e-05, "loss": 0.0436, "step": 5160 }, { "epoch": 0.6636803977881607, "grad_norm": 0.2021484375, "learning_rate": 7.525921330777501e-05, "loss": 0.0543, "step": 5161 }, { "epoch": 0.6638089930987183, "grad_norm": 0.208984375, "learning_rate": 7.525049052117193e-05, "loss": 0.0452, "step": 5162 }, { "epoch": 0.663937588409276, "grad_norm": 0.1787109375, "learning_rate": 7.524176670286334e-05, "loss": 0.0379, "step": 5163 }, { "epoch": 0.6640661837198337, "grad_norm": 0.17578125, "learning_rate": 7.523304185320565e-05, "loss": 0.0462, "step": 5164 }, { "epoch": 0.6641947790303914, "grad_norm": 0.1611328125, "learning_rate": 7.52243159725554e-05, "loss": 0.0381, "step": 5165 }, { "epoch": 0.664323374340949, "grad_norm": 0.162109375, "learning_rate": 7.521558906126909e-05, "loss": 0.0369, "step": 5166 }, { "epoch": 0.6644519696515067, "grad_norm": 0.1513671875, "learning_rate": 7.520686111970328e-05, "loss": 0.0318, "step": 5167 }, { "epoch": 0.6645805649620644, "grad_norm": 0.193359375, "learning_rate": 7.51981321482146e-05, "loss": 0.0525, "step": 5168 }, { "epoch": 0.6647091602726221, "grad_norm": 0.1591796875, "learning_rate": 7.51894021471597e-05, "loss": 0.0403, "step": 5169 }, { "epoch": 0.6648377555831797, "grad_norm": 0.1728515625, "learning_rate": 7.518067111689529e-05, "loss": 0.035, "step": 5170 }, { "epoch": 0.6649663508937375, "grad_norm": 0.1787109375, "learning_rate": 7.517193905777809e-05, "loss": 0.0526, "step": 5171 }, { "epoch": 0.6650949462042951, "grad_norm": 0.1904296875, "learning_rate": 7.51632059701649e-05, "loss": 0.0446, "step": 5172 }, { "epoch": 0.6652235415148527, "grad_norm": 0.17578125, "learning_rate": 7.515447185441252e-05, "loss": 0.0432, "step": 5173 }, { "epoch": 0.6653521368254104, "grad_norm": 0.267578125, "learning_rate": 7.514573671087784e-05, "loss": 0.0413, "step": 5174 }, { "epoch": 0.6654807321359681, "grad_norm": 0.1572265625, "learning_rate": 7.513700053991777e-05, "loss": 0.0351, "step": 5175 }, { "epoch": 0.6656093274465258, "grad_norm": 0.177734375, "learning_rate": 7.512826334188923e-05, "loss": 0.0432, "step": 5176 }, { "epoch": 0.6657379227570834, "grad_norm": 0.1728515625, "learning_rate": 7.511952511714925e-05, "loss": 0.0406, "step": 5177 }, { "epoch": 0.6658665180676412, "grad_norm": 0.1748046875, "learning_rate": 7.511078586605484e-05, "loss": 0.0424, "step": 5178 }, { "epoch": 0.6659951133781988, "grad_norm": 0.185546875, "learning_rate": 7.510204558896307e-05, "loss": 0.0413, "step": 5179 }, { "epoch": 0.6661237086887565, "grad_norm": 0.1640625, "learning_rate": 7.509330428623107e-05, "loss": 0.0369, "step": 5180 }, { "epoch": 0.6662523039993141, "grad_norm": 0.1689453125, "learning_rate": 7.508456195821599e-05, "loss": 0.0402, "step": 5181 }, { "epoch": 0.6663808993098719, "grad_norm": 0.21484375, "learning_rate": 7.507581860527506e-05, "loss": 0.0502, "step": 5182 }, { "epoch": 0.6665094946204295, "grad_norm": 0.1669921875, "learning_rate": 7.506707422776547e-05, "loss": 0.0432, "step": 5183 }, { "epoch": 0.6666380899309872, "grad_norm": 0.185546875, "learning_rate": 7.505832882604456e-05, "loss": 0.0468, "step": 5184 }, { "epoch": 0.6667666852415448, "grad_norm": 0.1611328125, "learning_rate": 7.504958240046963e-05, "loss": 0.0398, "step": 5185 }, { "epoch": 0.6668952805521026, "grad_norm": 0.1875, "learning_rate": 7.504083495139803e-05, "loss": 0.0389, "step": 5186 }, { "epoch": 0.6670238758626602, "grad_norm": 0.17578125, "learning_rate": 7.503208647918722e-05, "loss": 0.0493, "step": 5187 }, { "epoch": 0.6671524711732179, "grad_norm": 0.197265625, "learning_rate": 7.502333698419461e-05, "loss": 0.0484, "step": 5188 }, { "epoch": 0.6672810664837756, "grad_norm": 0.291015625, "learning_rate": 7.501458646677773e-05, "loss": 0.0518, "step": 5189 }, { "epoch": 0.6674096617943333, "grad_norm": 0.1865234375, "learning_rate": 7.500583492729405e-05, "loss": 0.0455, "step": 5190 }, { "epoch": 0.6675382571048909, "grad_norm": 0.1640625, "learning_rate": 7.499708236610122e-05, "loss": 0.037, "step": 5191 }, { "epoch": 0.6676668524154485, "grad_norm": 0.1796875, "learning_rate": 7.498832878355684e-05, "loss": 0.0399, "step": 5192 }, { "epoch": 0.6677954477260063, "grad_norm": 0.1787109375, "learning_rate": 7.497957418001856e-05, "loss": 0.0382, "step": 5193 }, { "epoch": 0.6679240430365639, "grad_norm": 0.1943359375, "learning_rate": 7.497081855584407e-05, "loss": 0.0545, "step": 5194 }, { "epoch": 0.6680526383471216, "grad_norm": 0.173828125, "learning_rate": 7.496206191139114e-05, "loss": 0.0426, "step": 5195 }, { "epoch": 0.6681812336576792, "grad_norm": 0.173828125, "learning_rate": 7.495330424701754e-05, "loss": 0.0392, "step": 5196 }, { "epoch": 0.668309828968237, "grad_norm": 0.193359375, "learning_rate": 7.49445455630811e-05, "loss": 0.0413, "step": 5197 }, { "epoch": 0.6684384242787946, "grad_norm": 0.17578125, "learning_rate": 7.493578585993972e-05, "loss": 0.0447, "step": 5198 }, { "epoch": 0.6685670195893523, "grad_norm": 0.185546875, "learning_rate": 7.492702513795127e-05, "loss": 0.047, "step": 5199 }, { "epoch": 0.66869561489991, "grad_norm": 0.1865234375, "learning_rate": 7.49182633974737e-05, "loss": 0.038, "step": 5200 }, { "epoch": 0.6688242102104677, "grad_norm": 0.18359375, "learning_rate": 7.490950063886506e-05, "loss": 0.0474, "step": 5201 }, { "epoch": 0.6689528055210253, "grad_norm": 0.1845703125, "learning_rate": 7.490073686248332e-05, "loss": 0.0504, "step": 5202 }, { "epoch": 0.669081400831583, "grad_norm": 0.1787109375, "learning_rate": 7.489197206868658e-05, "loss": 0.0412, "step": 5203 }, { "epoch": 0.6692099961421407, "grad_norm": 0.158203125, "learning_rate": 7.488320625783299e-05, "loss": 0.0379, "step": 5204 }, { "epoch": 0.6693385914526984, "grad_norm": 0.1787109375, "learning_rate": 7.487443943028067e-05, "loss": 0.0464, "step": 5205 }, { "epoch": 0.669467186763256, "grad_norm": 0.1923828125, "learning_rate": 7.486567158638786e-05, "loss": 0.0489, "step": 5206 }, { "epoch": 0.6695957820738138, "grad_norm": 0.2177734375, "learning_rate": 7.485690272651275e-05, "loss": 0.0403, "step": 5207 }, { "epoch": 0.6697243773843714, "grad_norm": 0.154296875, "learning_rate": 7.484813285101369e-05, "loss": 0.0358, "step": 5208 }, { "epoch": 0.669852972694929, "grad_norm": 0.181640625, "learning_rate": 7.483936196024894e-05, "loss": 0.0413, "step": 5209 }, { "epoch": 0.6699815680054867, "grad_norm": 0.166015625, "learning_rate": 7.483059005457694e-05, "loss": 0.046, "step": 5210 }, { "epoch": 0.6701101633160444, "grad_norm": 0.1669921875, "learning_rate": 7.482181713435604e-05, "loss": 0.0408, "step": 5211 }, { "epoch": 0.6702387586266021, "grad_norm": 0.16796875, "learning_rate": 7.481304319994471e-05, "loss": 0.0489, "step": 5212 }, { "epoch": 0.6703673539371597, "grad_norm": 0.158203125, "learning_rate": 7.480426825170148e-05, "loss": 0.0362, "step": 5213 }, { "epoch": 0.6704959492477174, "grad_norm": 0.1689453125, "learning_rate": 7.479549228998481e-05, "loss": 0.0415, "step": 5214 }, { "epoch": 0.6706245445582751, "grad_norm": 0.19921875, "learning_rate": 7.478671531515333e-05, "loss": 0.0454, "step": 5215 }, { "epoch": 0.6707531398688328, "grad_norm": 0.177734375, "learning_rate": 7.477793732756565e-05, "loss": 0.0361, "step": 5216 }, { "epoch": 0.6708817351793904, "grad_norm": 0.1767578125, "learning_rate": 7.476915832758042e-05, "loss": 0.0421, "step": 5217 }, { "epoch": 0.6710103304899482, "grad_norm": 0.201171875, "learning_rate": 7.476037831555634e-05, "loss": 0.0493, "step": 5218 }, { "epoch": 0.6711389258005058, "grad_norm": 0.1767578125, "learning_rate": 7.475159729185214e-05, "loss": 0.0478, "step": 5219 }, { "epoch": 0.6712675211110635, "grad_norm": 0.166015625, "learning_rate": 7.474281525682664e-05, "loss": 0.0462, "step": 5220 }, { "epoch": 0.6713961164216211, "grad_norm": 0.1787109375, "learning_rate": 7.473403221083862e-05, "loss": 0.0405, "step": 5221 }, { "epoch": 0.6715247117321789, "grad_norm": 0.1806640625, "learning_rate": 7.472524815424698e-05, "loss": 0.0398, "step": 5222 }, { "epoch": 0.6716533070427365, "grad_norm": 0.1640625, "learning_rate": 7.471646308741058e-05, "loss": 0.0381, "step": 5223 }, { "epoch": 0.6717819023532942, "grad_norm": 0.150390625, "learning_rate": 7.470767701068841e-05, "loss": 0.0371, "step": 5224 }, { "epoch": 0.6719104976638519, "grad_norm": 0.1875, "learning_rate": 7.469888992443947e-05, "loss": 0.049, "step": 5225 }, { "epoch": 0.6720390929744096, "grad_norm": 0.181640625, "learning_rate": 7.469010182902274e-05, "loss": 0.0398, "step": 5226 }, { "epoch": 0.6721676882849672, "grad_norm": 0.2080078125, "learning_rate": 7.468131272479732e-05, "loss": 0.0372, "step": 5227 }, { "epoch": 0.6722962835955248, "grad_norm": 0.1708984375, "learning_rate": 7.467252261212233e-05, "loss": 0.0484, "step": 5228 }, { "epoch": 0.6724248789060826, "grad_norm": 0.1650390625, "learning_rate": 7.466373149135691e-05, "loss": 0.0424, "step": 5229 }, { "epoch": 0.6725534742166402, "grad_norm": 0.1572265625, "learning_rate": 7.465493936286025e-05, "loss": 0.0367, "step": 5230 }, { "epoch": 0.6726820695271979, "grad_norm": 0.1982421875, "learning_rate": 7.464614622699159e-05, "loss": 0.0399, "step": 5231 }, { "epoch": 0.6728106648377555, "grad_norm": 0.166015625, "learning_rate": 7.463735208411025e-05, "loss": 0.0425, "step": 5232 }, { "epoch": 0.6729392601483133, "grad_norm": 0.158203125, "learning_rate": 7.462855693457546e-05, "loss": 0.0324, "step": 5233 }, { "epoch": 0.6730678554588709, "grad_norm": 0.185546875, "learning_rate": 7.461976077874667e-05, "loss": 0.0477, "step": 5234 }, { "epoch": 0.6731964507694286, "grad_norm": 0.1650390625, "learning_rate": 7.461096361698322e-05, "loss": 0.0398, "step": 5235 }, { "epoch": 0.6733250460799863, "grad_norm": 0.177734375, "learning_rate": 7.460216544964457e-05, "loss": 0.0405, "step": 5236 }, { "epoch": 0.673453641390544, "grad_norm": 0.1787109375, "learning_rate": 7.459336627709022e-05, "loss": 0.0416, "step": 5237 }, { "epoch": 0.6735822367011016, "grad_norm": 0.1943359375, "learning_rate": 7.458456609967968e-05, "loss": 0.0547, "step": 5238 }, { "epoch": 0.6737108320116593, "grad_norm": 0.1650390625, "learning_rate": 7.457576491777251e-05, "loss": 0.0441, "step": 5239 }, { "epoch": 0.673839427322217, "grad_norm": 0.173828125, "learning_rate": 7.456696273172831e-05, "loss": 0.0444, "step": 5240 }, { "epoch": 0.6739680226327747, "grad_norm": 0.205078125, "learning_rate": 7.455815954190673e-05, "loss": 0.044, "step": 5241 }, { "epoch": 0.6740966179433323, "grad_norm": 0.1640625, "learning_rate": 7.454935534866748e-05, "loss": 0.0401, "step": 5242 }, { "epoch": 0.67422521325389, "grad_norm": 0.177734375, "learning_rate": 7.454055015237028e-05, "loss": 0.0422, "step": 5243 }, { "epoch": 0.6743538085644477, "grad_norm": 0.17578125, "learning_rate": 7.453174395337488e-05, "loss": 0.0406, "step": 5244 }, { "epoch": 0.6744824038750054, "grad_norm": 0.1796875, "learning_rate": 7.452293675204109e-05, "loss": 0.051, "step": 5245 }, { "epoch": 0.674610999185563, "grad_norm": 0.1611328125, "learning_rate": 7.451412854872882e-05, "loss": 0.0353, "step": 5246 }, { "epoch": 0.6747395944961208, "grad_norm": 0.1806640625, "learning_rate": 7.450531934379788e-05, "loss": 0.0493, "step": 5247 }, { "epoch": 0.6748681898066784, "grad_norm": 0.1728515625, "learning_rate": 7.449650913760827e-05, "loss": 0.0369, "step": 5248 }, { "epoch": 0.674996785117236, "grad_norm": 0.1689453125, "learning_rate": 7.448769793051993e-05, "loss": 0.0416, "step": 5249 }, { "epoch": 0.6751253804277937, "grad_norm": 0.1640625, "learning_rate": 7.447888572289287e-05, "loss": 0.0418, "step": 5250 }, { "epoch": 0.6752539757383514, "grad_norm": 0.1875, "learning_rate": 7.44700725150872e-05, "loss": 0.0489, "step": 5251 }, { "epoch": 0.6753825710489091, "grad_norm": 0.1865234375, "learning_rate": 7.446125830746293e-05, "loss": 0.0517, "step": 5252 }, { "epoch": 0.6755111663594667, "grad_norm": 0.1767578125, "learning_rate": 7.445244310038027e-05, "loss": 0.0446, "step": 5253 }, { "epoch": 0.6756397616700245, "grad_norm": 0.177734375, "learning_rate": 7.444362689419939e-05, "loss": 0.0435, "step": 5254 }, { "epoch": 0.6757683569805821, "grad_norm": 0.1806640625, "learning_rate": 7.443480968928047e-05, "loss": 0.0483, "step": 5255 }, { "epoch": 0.6758969522911398, "grad_norm": 0.19140625, "learning_rate": 7.442599148598379e-05, "loss": 0.0489, "step": 5256 }, { "epoch": 0.6760255476016974, "grad_norm": 0.1796875, "learning_rate": 7.441717228466967e-05, "loss": 0.0472, "step": 5257 }, { "epoch": 0.6761541429122552, "grad_norm": 0.1572265625, "learning_rate": 7.440835208569846e-05, "loss": 0.0352, "step": 5258 }, { "epoch": 0.6762827382228128, "grad_norm": 0.193359375, "learning_rate": 7.43995308894305e-05, "loss": 0.0448, "step": 5259 }, { "epoch": 0.6764113335333705, "grad_norm": 0.166015625, "learning_rate": 7.439070869622623e-05, "loss": 0.0443, "step": 5260 }, { "epoch": 0.6765399288439281, "grad_norm": 0.1865234375, "learning_rate": 7.438188550644614e-05, "loss": 0.0374, "step": 5261 }, { "epoch": 0.6766685241544859, "grad_norm": 0.169921875, "learning_rate": 7.43730613204507e-05, "loss": 0.0385, "step": 5262 }, { "epoch": 0.6767971194650435, "grad_norm": 0.1572265625, "learning_rate": 7.436423613860049e-05, "loss": 0.0391, "step": 5263 }, { "epoch": 0.6769257147756012, "grad_norm": 0.1826171875, "learning_rate": 7.435540996125605e-05, "loss": 0.0412, "step": 5264 }, { "epoch": 0.6770543100861589, "grad_norm": 0.2021484375, "learning_rate": 7.434658278877806e-05, "loss": 0.057, "step": 5265 }, { "epoch": 0.6771829053967166, "grad_norm": 0.1884765625, "learning_rate": 7.433775462152716e-05, "loss": 0.0471, "step": 5266 }, { "epoch": 0.6773115007072742, "grad_norm": 0.181640625, "learning_rate": 7.432892545986405e-05, "loss": 0.0505, "step": 5267 }, { "epoch": 0.6774400960178318, "grad_norm": 0.16015625, "learning_rate": 7.43200953041495e-05, "loss": 0.039, "step": 5268 }, { "epoch": 0.6775686913283896, "grad_norm": 0.1767578125, "learning_rate": 7.431126415474429e-05, "loss": 0.0404, "step": 5269 }, { "epoch": 0.6776972866389472, "grad_norm": 0.1728515625, "learning_rate": 7.430243201200926e-05, "loss": 0.0353, "step": 5270 }, { "epoch": 0.6778258819495049, "grad_norm": 0.169921875, "learning_rate": 7.429359887630526e-05, "loss": 0.0384, "step": 5271 }, { "epoch": 0.6779544772600626, "grad_norm": 0.171875, "learning_rate": 7.428476474799321e-05, "loss": 0.0428, "step": 5272 }, { "epoch": 0.6780830725706203, "grad_norm": 0.16796875, "learning_rate": 7.427592962743409e-05, "loss": 0.0371, "step": 5273 }, { "epoch": 0.6782116678811779, "grad_norm": 0.189453125, "learning_rate": 7.426709351498886e-05, "loss": 0.0479, "step": 5274 }, { "epoch": 0.6783402631917356, "grad_norm": 0.1962890625, "learning_rate": 7.425825641101855e-05, "loss": 0.0483, "step": 5275 }, { "epoch": 0.6784688585022933, "grad_norm": 0.1611328125, "learning_rate": 7.424941831588425e-05, "loss": 0.0372, "step": 5276 }, { "epoch": 0.678597453812851, "grad_norm": 0.1728515625, "learning_rate": 7.42405792299471e-05, "loss": 0.0437, "step": 5277 }, { "epoch": 0.6787260491234086, "grad_norm": 0.1611328125, "learning_rate": 7.423173915356819e-05, "loss": 0.0394, "step": 5278 }, { "epoch": 0.6788546444339663, "grad_norm": 0.189453125, "learning_rate": 7.422289808710876e-05, "loss": 0.0465, "step": 5279 }, { "epoch": 0.678983239744524, "grad_norm": 0.1826171875, "learning_rate": 7.421405603093004e-05, "loss": 0.0442, "step": 5280 }, { "epoch": 0.6791118350550817, "grad_norm": 0.16796875, "learning_rate": 7.420521298539329e-05, "loss": 0.0451, "step": 5281 }, { "epoch": 0.6792404303656393, "grad_norm": 0.193359375, "learning_rate": 7.419636895085986e-05, "loss": 0.0505, "step": 5282 }, { "epoch": 0.6793690256761971, "grad_norm": 0.1484375, "learning_rate": 7.418752392769107e-05, "loss": 0.0312, "step": 5283 }, { "epoch": 0.6794976209867547, "grad_norm": 0.1650390625, "learning_rate": 7.417867791624835e-05, "loss": 0.0405, "step": 5284 }, { "epoch": 0.6796262162973123, "grad_norm": 0.177734375, "learning_rate": 7.41698309168931e-05, "loss": 0.0409, "step": 5285 }, { "epoch": 0.67975481160787, "grad_norm": 0.177734375, "learning_rate": 7.416098292998683e-05, "loss": 0.0365, "step": 5286 }, { "epoch": 0.6798834069184277, "grad_norm": 0.16796875, "learning_rate": 7.415213395589103e-05, "loss": 0.0433, "step": 5287 }, { "epoch": 0.6800120022289854, "grad_norm": 0.1796875, "learning_rate": 7.41432839949673e-05, "loss": 0.0474, "step": 5288 }, { "epoch": 0.680140597539543, "grad_norm": 0.15234375, "learning_rate": 7.41344330475772e-05, "loss": 0.0334, "step": 5289 }, { "epoch": 0.6802691928501007, "grad_norm": 0.1865234375, "learning_rate": 7.412558111408239e-05, "loss": 0.0388, "step": 5290 }, { "epoch": 0.6803977881606584, "grad_norm": 0.1826171875, "learning_rate": 7.411672819484453e-05, "loss": 0.0445, "step": 5291 }, { "epoch": 0.6805263834712161, "grad_norm": 0.177734375, "learning_rate": 7.410787429022537e-05, "loss": 0.0453, "step": 5292 }, { "epoch": 0.6806549787817737, "grad_norm": 0.1826171875, "learning_rate": 7.409901940058664e-05, "loss": 0.0467, "step": 5293 }, { "epoch": 0.6807835740923315, "grad_norm": 0.1591796875, "learning_rate": 7.409016352629017e-05, "loss": 0.0381, "step": 5294 }, { "epoch": 0.6809121694028891, "grad_norm": 0.1689453125, "learning_rate": 7.408130666769778e-05, "loss": 0.0375, "step": 5295 }, { "epoch": 0.6810407647134468, "grad_norm": 0.169921875, "learning_rate": 7.407244882517136e-05, "loss": 0.0401, "step": 5296 }, { "epoch": 0.6811693600240044, "grad_norm": 0.193359375, "learning_rate": 7.406358999907283e-05, "loss": 0.052, "step": 5297 }, { "epoch": 0.6812979553345622, "grad_norm": 0.1591796875, "learning_rate": 7.405473018976411e-05, "loss": 0.0309, "step": 5298 }, { "epoch": 0.6814265506451198, "grad_norm": 0.1796875, "learning_rate": 7.404586939760729e-05, "loss": 0.0477, "step": 5299 }, { "epoch": 0.6815551459556775, "grad_norm": 0.1611328125, "learning_rate": 7.403700762296434e-05, "loss": 0.0337, "step": 5300 }, { "epoch": 0.6816837412662352, "grad_norm": 0.1865234375, "learning_rate": 7.402814486619737e-05, "loss": 0.0407, "step": 5301 }, { "epoch": 0.6818123365767929, "grad_norm": 0.1650390625, "learning_rate": 7.401928112766848e-05, "loss": 0.0398, "step": 5302 }, { "epoch": 0.6819409318873505, "grad_norm": 0.1767578125, "learning_rate": 7.401041640773985e-05, "loss": 0.039, "step": 5303 }, { "epoch": 0.6820695271979081, "grad_norm": 0.16796875, "learning_rate": 7.400155070677369e-05, "loss": 0.0452, "step": 5304 }, { "epoch": 0.6821981225084659, "grad_norm": 0.1689453125, "learning_rate": 7.399268402513222e-05, "loss": 0.0424, "step": 5305 }, { "epoch": 0.6823267178190235, "grad_norm": 0.185546875, "learning_rate": 7.398381636317775e-05, "loss": 0.045, "step": 5306 }, { "epoch": 0.6824553131295812, "grad_norm": 0.1728515625, "learning_rate": 7.397494772127257e-05, "loss": 0.0385, "step": 5307 }, { "epoch": 0.6825839084401388, "grad_norm": 0.166015625, "learning_rate": 7.396607809977907e-05, "loss": 0.0337, "step": 5308 }, { "epoch": 0.6827125037506966, "grad_norm": 0.185546875, "learning_rate": 7.395720749905964e-05, "loss": 0.0468, "step": 5309 }, { "epoch": 0.6828410990612542, "grad_norm": 0.158203125, "learning_rate": 7.394833591947671e-05, "loss": 0.0345, "step": 5310 }, { "epoch": 0.6829696943718119, "grad_norm": 0.181640625, "learning_rate": 7.393946336139278e-05, "loss": 0.0504, "step": 5311 }, { "epoch": 0.6830982896823696, "grad_norm": 0.185546875, "learning_rate": 7.393058982517036e-05, "loss": 0.0463, "step": 5312 }, { "epoch": 0.6832268849929273, "grad_norm": 0.2119140625, "learning_rate": 7.392171531117204e-05, "loss": 0.0502, "step": 5313 }, { "epoch": 0.6833554803034849, "grad_norm": 0.15234375, "learning_rate": 7.391283981976039e-05, "loss": 0.0327, "step": 5314 }, { "epoch": 0.6834840756140426, "grad_norm": 0.1806640625, "learning_rate": 7.390396335129807e-05, "loss": 0.0513, "step": 5315 }, { "epoch": 0.6836126709246003, "grad_norm": 0.1767578125, "learning_rate": 7.389508590614774e-05, "loss": 0.041, "step": 5316 }, { "epoch": 0.683741266235158, "grad_norm": 0.171875, "learning_rate": 7.388620748467216e-05, "loss": 0.0385, "step": 5317 }, { "epoch": 0.6838698615457156, "grad_norm": 0.173828125, "learning_rate": 7.387732808723405e-05, "loss": 0.0413, "step": 5318 }, { "epoch": 0.6839984568562734, "grad_norm": 0.169921875, "learning_rate": 7.386844771419625e-05, "loss": 0.0398, "step": 5319 }, { "epoch": 0.684127052166831, "grad_norm": 0.1826171875, "learning_rate": 7.385956636592157e-05, "loss": 0.0463, "step": 5320 }, { "epoch": 0.6842556474773887, "grad_norm": 0.169921875, "learning_rate": 7.385068404277292e-05, "loss": 0.0398, "step": 5321 }, { "epoch": 0.6843842427879463, "grad_norm": 0.169921875, "learning_rate": 7.38418007451132e-05, "loss": 0.0369, "step": 5322 }, { "epoch": 0.684512838098504, "grad_norm": 0.1826171875, "learning_rate": 7.383291647330537e-05, "loss": 0.046, "step": 5323 }, { "epoch": 0.6846414334090617, "grad_norm": 0.2138671875, "learning_rate": 7.382403122771245e-05, "loss": 0.0483, "step": 5324 }, { "epoch": 0.6847700287196193, "grad_norm": 0.1689453125, "learning_rate": 7.381514500869747e-05, "loss": 0.0376, "step": 5325 }, { "epoch": 0.684898624030177, "grad_norm": 0.17578125, "learning_rate": 7.38062578166235e-05, "loss": 0.0401, "step": 5326 }, { "epoch": 0.6850272193407347, "grad_norm": 0.181640625, "learning_rate": 7.379736965185368e-05, "loss": 0.0416, "step": 5327 }, { "epoch": 0.6851558146512924, "grad_norm": 0.19921875, "learning_rate": 7.378848051475116e-05, "loss": 0.053, "step": 5328 }, { "epoch": 0.68528440996185, "grad_norm": 0.1640625, "learning_rate": 7.377959040567914e-05, "loss": 0.0402, "step": 5329 }, { "epoch": 0.6854130052724078, "grad_norm": 0.169921875, "learning_rate": 7.377069932500084e-05, "loss": 0.0383, "step": 5330 }, { "epoch": 0.6855416005829654, "grad_norm": 0.1728515625, "learning_rate": 7.376180727307957e-05, "loss": 0.0465, "step": 5331 }, { "epoch": 0.6856701958935231, "grad_norm": 0.1396484375, "learning_rate": 7.375291425027867e-05, "loss": 0.029, "step": 5332 }, { "epoch": 0.6857987912040807, "grad_norm": 0.19140625, "learning_rate": 7.374402025696144e-05, "loss": 0.0489, "step": 5333 }, { "epoch": 0.6859273865146385, "grad_norm": 0.2275390625, "learning_rate": 7.373512529349129e-05, "loss": 0.0618, "step": 5334 }, { "epoch": 0.6860559818251961, "grad_norm": 0.189453125, "learning_rate": 7.372622936023169e-05, "loss": 0.0395, "step": 5335 }, { "epoch": 0.6861845771357538, "grad_norm": 0.1904296875, "learning_rate": 7.371733245754608e-05, "loss": 0.0509, "step": 5336 }, { "epoch": 0.6863131724463114, "grad_norm": 0.1953125, "learning_rate": 7.370843458579802e-05, "loss": 0.0525, "step": 5337 }, { "epoch": 0.6864417677568692, "grad_norm": 0.1865234375, "learning_rate": 7.369953574535104e-05, "loss": 0.0529, "step": 5338 }, { "epoch": 0.6865703630674268, "grad_norm": 0.18359375, "learning_rate": 7.369063593656873e-05, "loss": 0.048, "step": 5339 }, { "epoch": 0.6866989583779844, "grad_norm": 0.17578125, "learning_rate": 7.368173515981474e-05, "loss": 0.0441, "step": 5340 }, { "epoch": 0.6868275536885422, "grad_norm": 0.1787109375, "learning_rate": 7.367283341545273e-05, "loss": 0.0475, "step": 5341 }, { "epoch": 0.6869561489990998, "grad_norm": 0.1796875, "learning_rate": 7.366393070384645e-05, "loss": 0.0413, "step": 5342 }, { "epoch": 0.6870847443096575, "grad_norm": 0.1806640625, "learning_rate": 7.365502702535961e-05, "loss": 0.0378, "step": 5343 }, { "epoch": 0.6872133396202151, "grad_norm": 0.166015625, "learning_rate": 7.364612238035605e-05, "loss": 0.0429, "step": 5344 }, { "epoch": 0.6873419349307729, "grad_norm": 0.173828125, "learning_rate": 7.363721676919954e-05, "loss": 0.0473, "step": 5345 }, { "epoch": 0.6874705302413305, "grad_norm": 0.16015625, "learning_rate": 7.3628310192254e-05, "loss": 0.0408, "step": 5346 }, { "epoch": 0.6875991255518882, "grad_norm": 0.173828125, "learning_rate": 7.361940264988334e-05, "loss": 0.0446, "step": 5347 }, { "epoch": 0.6877277208624459, "grad_norm": 0.150390625, "learning_rate": 7.36104941424515e-05, "loss": 0.0335, "step": 5348 }, { "epoch": 0.6878563161730036, "grad_norm": 0.20703125, "learning_rate": 7.360158467032248e-05, "loss": 0.0456, "step": 5349 }, { "epoch": 0.6879849114835612, "grad_norm": 0.185546875, "learning_rate": 7.35926742338603e-05, "loss": 0.0401, "step": 5350 }, { "epoch": 0.6881135067941189, "grad_norm": 0.1806640625, "learning_rate": 7.358376283342904e-05, "loss": 0.044, "step": 5351 }, { "epoch": 0.6882421021046766, "grad_norm": 0.1650390625, "learning_rate": 7.35748504693928e-05, "loss": 0.042, "step": 5352 }, { "epoch": 0.6883706974152343, "grad_norm": 0.171875, "learning_rate": 7.356593714211573e-05, "loss": 0.0434, "step": 5353 }, { "epoch": 0.6884992927257919, "grad_norm": 0.173828125, "learning_rate": 7.355702285196204e-05, "loss": 0.036, "step": 5354 }, { "epoch": 0.6886278880363496, "grad_norm": 0.150390625, "learning_rate": 7.354810759929594e-05, "loss": 0.0365, "step": 5355 }, { "epoch": 0.6887564833469073, "grad_norm": 0.1572265625, "learning_rate": 7.353919138448168e-05, "loss": 0.0374, "step": 5356 }, { "epoch": 0.688885078657465, "grad_norm": 0.17578125, "learning_rate": 7.353027420788358e-05, "loss": 0.0416, "step": 5357 }, { "epoch": 0.6890136739680226, "grad_norm": 0.171875, "learning_rate": 7.3521356069866e-05, "loss": 0.0368, "step": 5358 }, { "epoch": 0.6891422692785804, "grad_norm": 0.16015625, "learning_rate": 7.351243697079331e-05, "loss": 0.0327, "step": 5359 }, { "epoch": 0.689270864589138, "grad_norm": 0.177734375, "learning_rate": 7.350351691102993e-05, "loss": 0.0496, "step": 5360 }, { "epoch": 0.6893994598996956, "grad_norm": 0.1845703125, "learning_rate": 7.349459589094035e-05, "loss": 0.0454, "step": 5361 }, { "epoch": 0.6895280552102533, "grad_norm": 0.1689453125, "learning_rate": 7.348567391088903e-05, "loss": 0.0489, "step": 5362 }, { "epoch": 0.689656650520811, "grad_norm": 0.18359375, "learning_rate": 7.347675097124054e-05, "loss": 0.0452, "step": 5363 }, { "epoch": 0.6897852458313687, "grad_norm": 0.1826171875, "learning_rate": 7.346782707235946e-05, "loss": 0.0439, "step": 5364 }, { "epoch": 0.6899138411419263, "grad_norm": 0.1875, "learning_rate": 7.345890221461041e-05, "loss": 0.0431, "step": 5365 }, { "epoch": 0.6900424364524841, "grad_norm": 0.15234375, "learning_rate": 7.344997639835802e-05, "loss": 0.0342, "step": 5366 }, { "epoch": 0.6901710317630417, "grad_norm": 0.158203125, "learning_rate": 7.344104962396704e-05, "loss": 0.0336, "step": 5367 }, { "epoch": 0.6902996270735994, "grad_norm": 0.177734375, "learning_rate": 7.343212189180216e-05, "loss": 0.048, "step": 5368 }, { "epoch": 0.690428222384157, "grad_norm": 0.181640625, "learning_rate": 7.342319320222819e-05, "loss": 0.048, "step": 5369 }, { "epoch": 0.6905568176947148, "grad_norm": 0.1875, "learning_rate": 7.341426355560993e-05, "loss": 0.0505, "step": 5370 }, { "epoch": 0.6906854130052724, "grad_norm": 0.185546875, "learning_rate": 7.340533295231222e-05, "loss": 0.0453, "step": 5371 }, { "epoch": 0.6908140083158301, "grad_norm": 0.1611328125, "learning_rate": 7.339640139269999e-05, "loss": 0.0389, "step": 5372 }, { "epoch": 0.6909426036263877, "grad_norm": 0.1728515625, "learning_rate": 7.338746887713816e-05, "loss": 0.0385, "step": 5373 }, { "epoch": 0.6910711989369455, "grad_norm": 0.201171875, "learning_rate": 7.337853540599168e-05, "loss": 0.0473, "step": 5374 }, { "epoch": 0.6911997942475031, "grad_norm": 0.2138671875, "learning_rate": 7.33696009796256e-05, "loss": 0.0505, "step": 5375 }, { "epoch": 0.6913283895580608, "grad_norm": 0.1796875, "learning_rate": 7.336066559840493e-05, "loss": 0.0469, "step": 5376 }, { "epoch": 0.6914569848686185, "grad_norm": 0.1552734375, "learning_rate": 7.335172926269478e-05, "loss": 0.0434, "step": 5377 }, { "epoch": 0.6915855801791762, "grad_norm": 0.1611328125, "learning_rate": 7.334279197286027e-05, "loss": 0.0416, "step": 5378 }, { "epoch": 0.6917141754897338, "grad_norm": 0.16015625, "learning_rate": 7.333385372926657e-05, "loss": 0.0422, "step": 5379 }, { "epoch": 0.6918427708002914, "grad_norm": 0.1767578125, "learning_rate": 7.332491453227892e-05, "loss": 0.048, "step": 5380 }, { "epoch": 0.6919713661108492, "grad_norm": 0.181640625, "learning_rate": 7.331597438226251e-05, "loss": 0.0439, "step": 5381 }, { "epoch": 0.6920999614214068, "grad_norm": 0.19140625, "learning_rate": 7.330703327958266e-05, "loss": 0.0433, "step": 5382 }, { "epoch": 0.6922285567319645, "grad_norm": 0.158203125, "learning_rate": 7.329809122460465e-05, "loss": 0.0418, "step": 5383 }, { "epoch": 0.6923571520425221, "grad_norm": 0.158203125, "learning_rate": 7.32891482176939e-05, "loss": 0.0374, "step": 5384 }, { "epoch": 0.6924857473530799, "grad_norm": 0.1748046875, "learning_rate": 7.328020425921579e-05, "loss": 0.0402, "step": 5385 }, { "epoch": 0.6926143426636375, "grad_norm": 0.1845703125, "learning_rate": 7.327125934953574e-05, "loss": 0.0512, "step": 5386 }, { "epoch": 0.6927429379741952, "grad_norm": 0.1865234375, "learning_rate": 7.326231348901924e-05, "loss": 0.0466, "step": 5387 }, { "epoch": 0.6928715332847529, "grad_norm": 0.1748046875, "learning_rate": 7.325336667803182e-05, "loss": 0.0369, "step": 5388 }, { "epoch": 0.6930001285953106, "grad_norm": 0.181640625, "learning_rate": 7.324441891693903e-05, "loss": 0.0396, "step": 5389 }, { "epoch": 0.6931287239058682, "grad_norm": 0.18359375, "learning_rate": 7.323547020610643e-05, "loss": 0.0492, "step": 5390 }, { "epoch": 0.6932573192164259, "grad_norm": 0.173828125, "learning_rate": 7.322652054589973e-05, "loss": 0.0388, "step": 5391 }, { "epoch": 0.6933859145269836, "grad_norm": 0.1796875, "learning_rate": 7.321756993668454e-05, "loss": 0.0463, "step": 5392 }, { "epoch": 0.6935145098375413, "grad_norm": 0.181640625, "learning_rate": 7.320861837882657e-05, "loss": 0.0446, "step": 5393 }, { "epoch": 0.6936431051480989, "grad_norm": 0.1865234375, "learning_rate": 7.319966587269161e-05, "loss": 0.0401, "step": 5394 }, { "epoch": 0.6937717004586567, "grad_norm": 0.17578125, "learning_rate": 7.319071241864541e-05, "loss": 0.0473, "step": 5395 }, { "epoch": 0.6939002957692143, "grad_norm": 0.1796875, "learning_rate": 7.318175801705382e-05, "loss": 0.0387, "step": 5396 }, { "epoch": 0.694028891079772, "grad_norm": 0.171875, "learning_rate": 7.317280266828271e-05, "loss": 0.0434, "step": 5397 }, { "epoch": 0.6941574863903296, "grad_norm": 0.173828125, "learning_rate": 7.316384637269798e-05, "loss": 0.0395, "step": 5398 }, { "epoch": 0.6942860817008873, "grad_norm": 0.181640625, "learning_rate": 7.315488913066558e-05, "loss": 0.0444, "step": 5399 }, { "epoch": 0.694414677011445, "grad_norm": 0.1884765625, "learning_rate": 7.314593094255146e-05, "loss": 0.041, "step": 5400 }, { "epoch": 0.6945432723220026, "grad_norm": 0.1591796875, "learning_rate": 7.313697180872167e-05, "loss": 0.0352, "step": 5401 }, { "epoch": 0.6946718676325603, "grad_norm": 0.173828125, "learning_rate": 7.312801172954227e-05, "loss": 0.0415, "step": 5402 }, { "epoch": 0.694800462943118, "grad_norm": 0.1875, "learning_rate": 7.311905070537934e-05, "loss": 0.0476, "step": 5403 }, { "epoch": 0.6949290582536757, "grad_norm": 0.181640625, "learning_rate": 7.311008873659905e-05, "loss": 0.0387, "step": 5404 }, { "epoch": 0.6950576535642333, "grad_norm": 0.1962890625, "learning_rate": 7.310112582356754e-05, "loss": 0.0317, "step": 5405 }, { "epoch": 0.6951862488747911, "grad_norm": 0.1845703125, "learning_rate": 7.309216196665105e-05, "loss": 0.045, "step": 5406 }, { "epoch": 0.6953148441853487, "grad_norm": 0.1669921875, "learning_rate": 7.308319716621581e-05, "loss": 0.0427, "step": 5407 }, { "epoch": 0.6954434394959064, "grad_norm": 0.1748046875, "learning_rate": 7.307423142262814e-05, "loss": 0.0449, "step": 5408 }, { "epoch": 0.695572034806464, "grad_norm": 0.1865234375, "learning_rate": 7.306526473625433e-05, "loss": 0.0451, "step": 5409 }, { "epoch": 0.6957006301170218, "grad_norm": 0.185546875, "learning_rate": 7.305629710746079e-05, "loss": 0.046, "step": 5410 }, { "epoch": 0.6958292254275794, "grad_norm": 0.1826171875, "learning_rate": 7.30473285366139e-05, "loss": 0.0424, "step": 5411 }, { "epoch": 0.6959578207381371, "grad_norm": 0.19921875, "learning_rate": 7.30383590240801e-05, "loss": 0.0602, "step": 5412 }, { "epoch": 0.6960864160486948, "grad_norm": 0.1865234375, "learning_rate": 7.30293885702259e-05, "loss": 0.0437, "step": 5413 }, { "epoch": 0.6962150113592525, "grad_norm": 0.1806640625, "learning_rate": 7.302041717541782e-05, "loss": 0.0448, "step": 5414 }, { "epoch": 0.6963436066698101, "grad_norm": 0.1669921875, "learning_rate": 7.30114448400224e-05, "loss": 0.0414, "step": 5415 }, { "epoch": 0.6964722019803677, "grad_norm": 0.1640625, "learning_rate": 7.300247156440626e-05, "loss": 0.0391, "step": 5416 }, { "epoch": 0.6966007972909255, "grad_norm": 0.185546875, "learning_rate": 7.299349734893601e-05, "loss": 0.0434, "step": 5417 }, { "epoch": 0.6967293926014831, "grad_norm": 0.17578125, "learning_rate": 7.298452219397836e-05, "loss": 0.0464, "step": 5418 }, { "epoch": 0.6968579879120408, "grad_norm": 0.171875, "learning_rate": 7.297554609989999e-05, "loss": 0.0419, "step": 5419 }, { "epoch": 0.6969865832225984, "grad_norm": 0.1494140625, "learning_rate": 7.296656906706768e-05, "loss": 0.0367, "step": 5420 }, { "epoch": 0.6971151785331562, "grad_norm": 0.1552734375, "learning_rate": 7.29575910958482e-05, "loss": 0.037, "step": 5421 }, { "epoch": 0.6972437738437138, "grad_norm": 0.1826171875, "learning_rate": 7.29486121866084e-05, "loss": 0.0403, "step": 5422 }, { "epoch": 0.6973723691542715, "grad_norm": 0.189453125, "learning_rate": 7.293963233971514e-05, "loss": 0.0476, "step": 5423 }, { "epoch": 0.6975009644648292, "grad_norm": 0.1748046875, "learning_rate": 7.293065155553531e-05, "loss": 0.0366, "step": 5424 }, { "epoch": 0.6976295597753869, "grad_norm": 0.177734375, "learning_rate": 7.292166983443589e-05, "loss": 0.0435, "step": 5425 }, { "epoch": 0.6977581550859445, "grad_norm": 0.232421875, "learning_rate": 7.291268717678383e-05, "loss": 0.0502, "step": 5426 }, { "epoch": 0.6978867503965022, "grad_norm": 0.1943359375, "learning_rate": 7.290370358294615e-05, "loss": 0.0514, "step": 5427 }, { "epoch": 0.6980153457070599, "grad_norm": 0.185546875, "learning_rate": 7.289471905328996e-05, "loss": 0.0482, "step": 5428 }, { "epoch": 0.6981439410176176, "grad_norm": 0.1748046875, "learning_rate": 7.288573358818227e-05, "loss": 0.0459, "step": 5429 }, { "epoch": 0.6982725363281752, "grad_norm": 0.166015625, "learning_rate": 7.28767471879903e-05, "loss": 0.0433, "step": 5430 }, { "epoch": 0.6984011316387329, "grad_norm": 0.181640625, "learning_rate": 7.286775985308116e-05, "loss": 0.0476, "step": 5431 }, { "epoch": 0.6985297269492906, "grad_norm": 0.169921875, "learning_rate": 7.28587715838221e-05, "loss": 0.04, "step": 5432 }, { "epoch": 0.6986583222598483, "grad_norm": 0.1767578125, "learning_rate": 7.284978238058037e-05, "loss": 0.0425, "step": 5433 }, { "epoch": 0.6987869175704059, "grad_norm": 0.1943359375, "learning_rate": 7.284079224372323e-05, "loss": 0.0545, "step": 5434 }, { "epoch": 0.6989155128809637, "grad_norm": 0.1806640625, "learning_rate": 7.283180117361804e-05, "loss": 0.0392, "step": 5435 }, { "epoch": 0.6990441081915213, "grad_norm": 0.1689453125, "learning_rate": 7.282280917063214e-05, "loss": 0.0403, "step": 5436 }, { "epoch": 0.6991727035020789, "grad_norm": 0.1669921875, "learning_rate": 7.281381623513296e-05, "loss": 0.0385, "step": 5437 }, { "epoch": 0.6993012988126366, "grad_norm": 0.1650390625, "learning_rate": 7.280482236748791e-05, "loss": 0.0405, "step": 5438 }, { "epoch": 0.6994298941231943, "grad_norm": 0.17578125, "learning_rate": 7.279582756806447e-05, "loss": 0.0433, "step": 5439 }, { "epoch": 0.699558489433752, "grad_norm": 0.189453125, "learning_rate": 7.278683183723019e-05, "loss": 0.045, "step": 5440 }, { "epoch": 0.6996870847443096, "grad_norm": 0.17578125, "learning_rate": 7.27778351753526e-05, "loss": 0.037, "step": 5441 }, { "epoch": 0.6998156800548674, "grad_norm": 0.1708984375, "learning_rate": 7.276883758279929e-05, "loss": 0.0419, "step": 5442 }, { "epoch": 0.699944275365425, "grad_norm": 0.203125, "learning_rate": 7.27598390599379e-05, "loss": 0.0579, "step": 5443 }, { "epoch": 0.7000728706759827, "grad_norm": 0.2080078125, "learning_rate": 7.27508396071361e-05, "loss": 0.0471, "step": 5444 }, { "epoch": 0.7002014659865403, "grad_norm": 0.1796875, "learning_rate": 7.274183922476158e-05, "loss": 0.0374, "step": 5445 }, { "epoch": 0.7003300612970981, "grad_norm": 0.166015625, "learning_rate": 7.273283791318211e-05, "loss": 0.0411, "step": 5446 }, { "epoch": 0.7004586566076557, "grad_norm": 0.171875, "learning_rate": 7.272383567276549e-05, "loss": 0.0419, "step": 5447 }, { "epoch": 0.7005872519182134, "grad_norm": 0.203125, "learning_rate": 7.271483250387947e-05, "loss": 0.0526, "step": 5448 }, { "epoch": 0.700715847228771, "grad_norm": 0.171875, "learning_rate": 7.270582840689197e-05, "loss": 0.0391, "step": 5449 }, { "epoch": 0.7008444425393288, "grad_norm": 0.177734375, "learning_rate": 7.269682338217087e-05, "loss": 0.0432, "step": 5450 }, { "epoch": 0.7009730378498864, "grad_norm": 0.1533203125, "learning_rate": 7.26878174300841e-05, "loss": 0.035, "step": 5451 }, { "epoch": 0.701101633160444, "grad_norm": 0.1962890625, "learning_rate": 7.267881055099965e-05, "loss": 0.0566, "step": 5452 }, { "epoch": 0.7012302284710018, "grad_norm": 0.177734375, "learning_rate": 7.26698027452855e-05, "loss": 0.0429, "step": 5453 }, { "epoch": 0.7013588237815594, "grad_norm": 0.1865234375, "learning_rate": 7.266079401330974e-05, "loss": 0.0531, "step": 5454 }, { "epoch": 0.7014874190921171, "grad_norm": 0.1552734375, "learning_rate": 7.265178435544041e-05, "loss": 0.0346, "step": 5455 }, { "epoch": 0.7016160144026747, "grad_norm": 0.16796875, "learning_rate": 7.264277377204566e-05, "loss": 0.0415, "step": 5456 }, { "epoch": 0.7017446097132325, "grad_norm": 0.158203125, "learning_rate": 7.263376226349365e-05, "loss": 0.0393, "step": 5457 }, { "epoch": 0.7018732050237901, "grad_norm": 0.173828125, "learning_rate": 7.26247498301526e-05, "loss": 0.0438, "step": 5458 }, { "epoch": 0.7020018003343478, "grad_norm": 0.19140625, "learning_rate": 7.261573647239068e-05, "loss": 0.0474, "step": 5459 }, { "epoch": 0.7021303956449055, "grad_norm": 0.185546875, "learning_rate": 7.260672219057624e-05, "loss": 0.0385, "step": 5460 }, { "epoch": 0.7022589909554632, "grad_norm": 0.1435546875, "learning_rate": 7.259770698507756e-05, "loss": 0.0325, "step": 5461 }, { "epoch": 0.7023875862660208, "grad_norm": 0.1884765625, "learning_rate": 7.2588690856263e-05, "loss": 0.0352, "step": 5462 }, { "epoch": 0.7025161815765785, "grad_norm": 0.1904296875, "learning_rate": 7.257967380450092e-05, "loss": 0.0417, "step": 5463 }, { "epoch": 0.7026447768871362, "grad_norm": 0.1669921875, "learning_rate": 7.25706558301598e-05, "loss": 0.041, "step": 5464 }, { "epoch": 0.7027733721976939, "grad_norm": 0.197265625, "learning_rate": 7.256163693360804e-05, "loss": 0.0516, "step": 5465 }, { "epoch": 0.7029019675082515, "grad_norm": 0.1826171875, "learning_rate": 7.25526171152142e-05, "loss": 0.0435, "step": 5466 }, { "epoch": 0.7030305628188092, "grad_norm": 0.16015625, "learning_rate": 7.254359637534678e-05, "loss": 0.0387, "step": 5467 }, { "epoch": 0.7031591581293669, "grad_norm": 0.1728515625, "learning_rate": 7.253457471437438e-05, "loss": 0.0403, "step": 5468 }, { "epoch": 0.7032877534399246, "grad_norm": 0.173828125, "learning_rate": 7.25255521326656e-05, "loss": 0.0474, "step": 5469 }, { "epoch": 0.7034163487504822, "grad_norm": 0.171875, "learning_rate": 7.25165286305891e-05, "loss": 0.0365, "step": 5470 }, { "epoch": 0.70354494406104, "grad_norm": 0.177734375, "learning_rate": 7.250750420851355e-05, "loss": 0.042, "step": 5471 }, { "epoch": 0.7036735393715976, "grad_norm": 0.185546875, "learning_rate": 7.249847886680772e-05, "loss": 0.0386, "step": 5472 }, { "epoch": 0.7038021346821552, "grad_norm": 0.162109375, "learning_rate": 7.248945260584033e-05, "loss": 0.0396, "step": 5473 }, { "epoch": 0.7039307299927129, "grad_norm": 0.1767578125, "learning_rate": 7.248042542598021e-05, "loss": 0.0386, "step": 5474 }, { "epoch": 0.7040593253032706, "grad_norm": 0.1669921875, "learning_rate": 7.24713973275962e-05, "loss": 0.0332, "step": 5475 }, { "epoch": 0.7041879206138283, "grad_norm": 0.1728515625, "learning_rate": 7.246236831105716e-05, "loss": 0.04, "step": 5476 }, { "epoch": 0.7043165159243859, "grad_norm": 0.1806640625, "learning_rate": 7.245333837673201e-05, "loss": 0.0434, "step": 5477 }, { "epoch": 0.7044451112349436, "grad_norm": 0.1865234375, "learning_rate": 7.244430752498971e-05, "loss": 0.0463, "step": 5478 }, { "epoch": 0.7045737065455013, "grad_norm": 0.171875, "learning_rate": 7.243527575619925e-05, "loss": 0.0425, "step": 5479 }, { "epoch": 0.704702301856059, "grad_norm": 0.1845703125, "learning_rate": 7.242624307072966e-05, "loss": 0.0423, "step": 5480 }, { "epoch": 0.7048308971666166, "grad_norm": 0.1728515625, "learning_rate": 7.241720946894999e-05, "loss": 0.0408, "step": 5481 }, { "epoch": 0.7049594924771744, "grad_norm": 0.1845703125, "learning_rate": 7.240817495122935e-05, "loss": 0.0474, "step": 5482 }, { "epoch": 0.705088087787732, "grad_norm": 0.1708984375, "learning_rate": 7.239913951793689e-05, "loss": 0.047, "step": 5483 }, { "epoch": 0.7052166830982897, "grad_norm": 0.189453125, "learning_rate": 7.239010316944177e-05, "loss": 0.0475, "step": 5484 }, { "epoch": 0.7053452784088473, "grad_norm": 0.1611328125, "learning_rate": 7.238106590611322e-05, "loss": 0.0402, "step": 5485 }, { "epoch": 0.7054738737194051, "grad_norm": 0.171875, "learning_rate": 7.237202772832048e-05, "loss": 0.0353, "step": 5486 }, { "epoch": 0.7056024690299627, "grad_norm": 0.1904296875, "learning_rate": 7.236298863643286e-05, "loss": 0.0442, "step": 5487 }, { "epoch": 0.7057310643405204, "grad_norm": 0.1904296875, "learning_rate": 7.235394863081965e-05, "loss": 0.0477, "step": 5488 }, { "epoch": 0.7058596596510781, "grad_norm": 0.1650390625, "learning_rate": 7.234490771185023e-05, "loss": 0.0468, "step": 5489 }, { "epoch": 0.7059882549616358, "grad_norm": 0.1650390625, "learning_rate": 7.233586587989403e-05, "loss": 0.042, "step": 5490 }, { "epoch": 0.7061168502721934, "grad_norm": 0.1640625, "learning_rate": 7.232682313532047e-05, "loss": 0.0366, "step": 5491 }, { "epoch": 0.706245445582751, "grad_norm": 0.17578125, "learning_rate": 7.2317779478499e-05, "loss": 0.0479, "step": 5492 }, { "epoch": 0.7063740408933088, "grad_norm": 0.1572265625, "learning_rate": 7.230873490979917e-05, "loss": 0.0384, "step": 5493 }, { "epoch": 0.7065026362038664, "grad_norm": 0.1572265625, "learning_rate": 7.229968942959052e-05, "loss": 0.0378, "step": 5494 }, { "epoch": 0.7066312315144241, "grad_norm": 0.1865234375, "learning_rate": 7.229064303824261e-05, "loss": 0.0462, "step": 5495 }, { "epoch": 0.7067598268249817, "grad_norm": 0.1611328125, "learning_rate": 7.228159573612511e-05, "loss": 0.0359, "step": 5496 }, { "epoch": 0.7068884221355395, "grad_norm": 0.177734375, "learning_rate": 7.227254752360763e-05, "loss": 0.0465, "step": 5497 }, { "epoch": 0.7070170174460971, "grad_norm": 0.1865234375, "learning_rate": 7.226349840105993e-05, "loss": 0.0467, "step": 5498 }, { "epoch": 0.7071456127566548, "grad_norm": 0.1787109375, "learning_rate": 7.225444836885172e-05, "loss": 0.0481, "step": 5499 }, { "epoch": 0.7072742080672125, "grad_norm": 0.197265625, "learning_rate": 7.224539742735276e-05, "loss": 0.0412, "step": 5500 }, { "epoch": 0.7072742080672125, "eval_loss": 0.041893765330314636, "eval_runtime": 1043.6395, "eval_samples_per_second": 94.119, "eval_steps_per_second": 1.177, "step": 5500 }, { "epoch": 0.7074028033777702, "grad_norm": 0.1591796875, "learning_rate": 7.223634557693286e-05, "loss": 0.0391, "step": 5501 }, { "epoch": 0.7075313986883278, "grad_norm": 0.1611328125, "learning_rate": 7.222729281796189e-05, "loss": 0.0368, "step": 5502 }, { "epoch": 0.7076599939988855, "grad_norm": 0.16796875, "learning_rate": 7.221823915080972e-05, "loss": 0.0405, "step": 5503 }, { "epoch": 0.7077885893094432, "grad_norm": 0.1787109375, "learning_rate": 7.220918457584628e-05, "loss": 0.046, "step": 5504 }, { "epoch": 0.7079171846200009, "grad_norm": 0.162109375, "learning_rate": 7.220012909344151e-05, "loss": 0.0397, "step": 5505 }, { "epoch": 0.7080457799305585, "grad_norm": 0.169921875, "learning_rate": 7.219107270396545e-05, "loss": 0.0376, "step": 5506 }, { "epoch": 0.7081743752411163, "grad_norm": 0.1875, "learning_rate": 7.218201540778809e-05, "loss": 0.0451, "step": 5507 }, { "epoch": 0.7083029705516739, "grad_norm": 0.1689453125, "learning_rate": 7.217295720527952e-05, "loss": 0.038, "step": 5508 }, { "epoch": 0.7084315658622315, "grad_norm": 0.1767578125, "learning_rate": 7.216389809680983e-05, "loss": 0.0428, "step": 5509 }, { "epoch": 0.7085601611727892, "grad_norm": 0.1640625, "learning_rate": 7.215483808274918e-05, "loss": 0.0429, "step": 5510 }, { "epoch": 0.708688756483347, "grad_norm": 0.1669921875, "learning_rate": 7.214577716346777e-05, "loss": 0.0421, "step": 5511 }, { "epoch": 0.7088173517939046, "grad_norm": 0.1689453125, "learning_rate": 7.21367153393358e-05, "loss": 0.0448, "step": 5512 }, { "epoch": 0.7089459471044622, "grad_norm": 0.1767578125, "learning_rate": 7.21276526107235e-05, "loss": 0.0423, "step": 5513 }, { "epoch": 0.7090745424150199, "grad_norm": 0.1650390625, "learning_rate": 7.211858897800121e-05, "loss": 0.0363, "step": 5514 }, { "epoch": 0.7092031377255776, "grad_norm": 0.16796875, "learning_rate": 7.21095244415392e-05, "loss": 0.0367, "step": 5515 }, { "epoch": 0.7093317330361353, "grad_norm": 0.1689453125, "learning_rate": 7.21004590017079e-05, "loss": 0.0426, "step": 5516 }, { "epoch": 0.7094603283466929, "grad_norm": 0.1904296875, "learning_rate": 7.209139265887767e-05, "loss": 0.046, "step": 5517 }, { "epoch": 0.7095889236572507, "grad_norm": 0.1708984375, "learning_rate": 7.2082325413419e-05, "loss": 0.0368, "step": 5518 }, { "epoch": 0.7097175189678083, "grad_norm": 0.177734375, "learning_rate": 7.20732572657023e-05, "loss": 0.0404, "step": 5519 }, { "epoch": 0.709846114278366, "grad_norm": 0.1826171875, "learning_rate": 7.206418821609812e-05, "loss": 0.0429, "step": 5520 }, { "epoch": 0.7099747095889236, "grad_norm": 0.1796875, "learning_rate": 7.205511826497702e-05, "loss": 0.0459, "step": 5521 }, { "epoch": 0.7101033048994814, "grad_norm": 0.1865234375, "learning_rate": 7.204604741270958e-05, "loss": 0.0474, "step": 5522 }, { "epoch": 0.710231900210039, "grad_norm": 0.1728515625, "learning_rate": 7.20369756596664e-05, "loss": 0.0409, "step": 5523 }, { "epoch": 0.7103604955205967, "grad_norm": 0.1689453125, "learning_rate": 7.202790300621816e-05, "loss": 0.0435, "step": 5524 }, { "epoch": 0.7104890908311543, "grad_norm": 0.18359375, "learning_rate": 7.201882945273558e-05, "loss": 0.0428, "step": 5525 }, { "epoch": 0.7106176861417121, "grad_norm": 0.2158203125, "learning_rate": 7.200975499958936e-05, "loss": 0.0547, "step": 5526 }, { "epoch": 0.7107462814522697, "grad_norm": 0.173828125, "learning_rate": 7.200067964715027e-05, "loss": 0.0416, "step": 5527 }, { "epoch": 0.7108748767628273, "grad_norm": 0.1640625, "learning_rate": 7.199160339578917e-05, "loss": 0.0371, "step": 5528 }, { "epoch": 0.7110034720733851, "grad_norm": 0.17578125, "learning_rate": 7.198252624587683e-05, "loss": 0.0352, "step": 5529 }, { "epoch": 0.7111320673839427, "grad_norm": 0.181640625, "learning_rate": 7.19734481977842e-05, "loss": 0.0372, "step": 5530 }, { "epoch": 0.7112606626945004, "grad_norm": 0.158203125, "learning_rate": 7.196436925188214e-05, "loss": 0.0381, "step": 5531 }, { "epoch": 0.711389258005058, "grad_norm": 0.173828125, "learning_rate": 7.195528940854166e-05, "loss": 0.0301, "step": 5532 }, { "epoch": 0.7115178533156158, "grad_norm": 0.1884765625, "learning_rate": 7.194620866813372e-05, "loss": 0.0453, "step": 5533 }, { "epoch": 0.7116464486261734, "grad_norm": 0.1650390625, "learning_rate": 7.193712703102934e-05, "loss": 0.0376, "step": 5534 }, { "epoch": 0.7117750439367311, "grad_norm": 0.1865234375, "learning_rate": 7.192804449759961e-05, "loss": 0.0438, "step": 5535 }, { "epoch": 0.7119036392472888, "grad_norm": 0.1865234375, "learning_rate": 7.191896106821562e-05, "loss": 0.0425, "step": 5536 }, { "epoch": 0.7120322345578465, "grad_norm": 0.1826171875, "learning_rate": 7.190987674324851e-05, "loss": 0.0494, "step": 5537 }, { "epoch": 0.7121608298684041, "grad_norm": 0.1796875, "learning_rate": 7.190079152306944e-05, "loss": 0.041, "step": 5538 }, { "epoch": 0.7122894251789618, "grad_norm": 0.185546875, "learning_rate": 7.189170540804963e-05, "loss": 0.0447, "step": 5539 }, { "epoch": 0.7124180204895195, "grad_norm": 0.197265625, "learning_rate": 7.188261839856036e-05, "loss": 0.0328, "step": 5540 }, { "epoch": 0.7125466158000772, "grad_norm": 0.169921875, "learning_rate": 7.187353049497287e-05, "loss": 0.0401, "step": 5541 }, { "epoch": 0.7126752111106348, "grad_norm": 0.2080078125, "learning_rate": 7.186444169765851e-05, "loss": 0.0536, "step": 5542 }, { "epoch": 0.7128038064211925, "grad_norm": 0.17578125, "learning_rate": 7.185535200698863e-05, "loss": 0.0467, "step": 5543 }, { "epoch": 0.7129324017317502, "grad_norm": 0.1708984375, "learning_rate": 7.184626142333462e-05, "loss": 0.032, "step": 5544 }, { "epoch": 0.7130609970423079, "grad_norm": 0.201171875, "learning_rate": 7.18371699470679e-05, "loss": 0.0526, "step": 5545 }, { "epoch": 0.7131895923528655, "grad_norm": 0.18359375, "learning_rate": 7.182807757855995e-05, "loss": 0.0455, "step": 5546 }, { "epoch": 0.7133181876634233, "grad_norm": 0.1689453125, "learning_rate": 7.181898431818228e-05, "loss": 0.0382, "step": 5547 }, { "epoch": 0.7134467829739809, "grad_norm": 0.17578125, "learning_rate": 7.180989016630642e-05, "loss": 0.0443, "step": 5548 }, { "epoch": 0.7135753782845385, "grad_norm": 0.1748046875, "learning_rate": 7.180079512330395e-05, "loss": 0.0428, "step": 5549 }, { "epoch": 0.7137039735950962, "grad_norm": 0.19140625, "learning_rate": 7.179169918954646e-05, "loss": 0.0463, "step": 5550 }, { "epoch": 0.7138325689056539, "grad_norm": 0.1552734375, "learning_rate": 7.178260236540564e-05, "loss": 0.0356, "step": 5551 }, { "epoch": 0.7139611642162116, "grad_norm": 0.18359375, "learning_rate": 7.177350465125314e-05, "loss": 0.0429, "step": 5552 }, { "epoch": 0.7140897595267692, "grad_norm": 0.1572265625, "learning_rate": 7.176440604746071e-05, "loss": 0.0334, "step": 5553 }, { "epoch": 0.714218354837327, "grad_norm": 0.1748046875, "learning_rate": 7.17553065544001e-05, "loss": 0.0395, "step": 5554 }, { "epoch": 0.7143469501478846, "grad_norm": 0.1689453125, "learning_rate": 7.174620617244309e-05, "loss": 0.0365, "step": 5555 }, { "epoch": 0.7144755454584423, "grad_norm": 0.169921875, "learning_rate": 7.173710490196156e-05, "loss": 0.0348, "step": 5556 }, { "epoch": 0.7146041407689999, "grad_norm": 0.150390625, "learning_rate": 7.17280027433273e-05, "loss": 0.0286, "step": 5557 }, { "epoch": 0.7147327360795577, "grad_norm": 0.16015625, "learning_rate": 7.171889969691226e-05, "loss": 0.0345, "step": 5558 }, { "epoch": 0.7148613313901153, "grad_norm": 0.1630859375, "learning_rate": 7.170979576308837e-05, "loss": 0.0366, "step": 5559 }, { "epoch": 0.714989926700673, "grad_norm": 0.166015625, "learning_rate": 7.170069094222761e-05, "loss": 0.0345, "step": 5560 }, { "epoch": 0.7151185220112306, "grad_norm": 0.1640625, "learning_rate": 7.1691585234702e-05, "loss": 0.0307, "step": 5561 }, { "epoch": 0.7152471173217884, "grad_norm": 0.1728515625, "learning_rate": 7.168247864088357e-05, "loss": 0.0396, "step": 5562 }, { "epoch": 0.715375712632346, "grad_norm": 0.1611328125, "learning_rate": 7.167337116114442e-05, "loss": 0.0383, "step": 5563 }, { "epoch": 0.7155043079429037, "grad_norm": 0.1708984375, "learning_rate": 7.166426279585665e-05, "loss": 0.0408, "step": 5564 }, { "epoch": 0.7156329032534614, "grad_norm": 0.20703125, "learning_rate": 7.165515354539246e-05, "loss": 0.0441, "step": 5565 }, { "epoch": 0.715761498564019, "grad_norm": 0.169921875, "learning_rate": 7.164604341012399e-05, "loss": 0.0393, "step": 5566 }, { "epoch": 0.7158900938745767, "grad_norm": 0.189453125, "learning_rate": 7.163693239042351e-05, "loss": 0.0422, "step": 5567 }, { "epoch": 0.7160186891851343, "grad_norm": 0.1796875, "learning_rate": 7.162782048666327e-05, "loss": 0.0422, "step": 5568 }, { "epoch": 0.7161472844956921, "grad_norm": 0.1904296875, "learning_rate": 7.161870769921557e-05, "loss": 0.0458, "step": 5569 }, { "epoch": 0.7162758798062497, "grad_norm": 0.2041015625, "learning_rate": 7.160959402845274e-05, "loss": 0.0451, "step": 5570 }, { "epoch": 0.7164044751168074, "grad_norm": 0.1591796875, "learning_rate": 7.160047947474718e-05, "loss": 0.0352, "step": 5571 }, { "epoch": 0.7165330704273651, "grad_norm": 0.162109375, "learning_rate": 7.159136403847127e-05, "loss": 0.0396, "step": 5572 }, { "epoch": 0.7166616657379228, "grad_norm": 0.1533203125, "learning_rate": 7.15822477199975e-05, "loss": 0.0356, "step": 5573 }, { "epoch": 0.7167902610484804, "grad_norm": 0.1640625, "learning_rate": 7.15731305196983e-05, "loss": 0.0357, "step": 5574 }, { "epoch": 0.7169188563590381, "grad_norm": 0.16796875, "learning_rate": 7.15640124379462e-05, "loss": 0.0401, "step": 5575 }, { "epoch": 0.7170474516695958, "grad_norm": 0.1796875, "learning_rate": 7.155489347511379e-05, "loss": 0.0442, "step": 5576 }, { "epoch": 0.7171760469801535, "grad_norm": 0.1982421875, "learning_rate": 7.15457736315736e-05, "loss": 0.0373, "step": 5577 }, { "epoch": 0.7173046422907111, "grad_norm": 0.1875, "learning_rate": 7.153665290769833e-05, "loss": 0.0445, "step": 5578 }, { "epoch": 0.7174332376012688, "grad_norm": 0.17578125, "learning_rate": 7.152753130386059e-05, "loss": 0.0431, "step": 5579 }, { "epoch": 0.7175618329118265, "grad_norm": 0.1669921875, "learning_rate": 7.151840882043307e-05, "loss": 0.0403, "step": 5580 }, { "epoch": 0.7176904282223842, "grad_norm": 0.18359375, "learning_rate": 7.150928545778856e-05, "loss": 0.0401, "step": 5581 }, { "epoch": 0.7178190235329418, "grad_norm": 0.177734375, "learning_rate": 7.150016121629978e-05, "loss": 0.0398, "step": 5582 }, { "epoch": 0.7179476188434996, "grad_norm": 0.18359375, "learning_rate": 7.149103609633955e-05, "loss": 0.0401, "step": 5583 }, { "epoch": 0.7180762141540572, "grad_norm": 0.1845703125, "learning_rate": 7.148191009828072e-05, "loss": 0.0469, "step": 5584 }, { "epoch": 0.7182048094646148, "grad_norm": 0.1826171875, "learning_rate": 7.147278322249615e-05, "loss": 0.0402, "step": 5585 }, { "epoch": 0.7183334047751725, "grad_norm": 0.1708984375, "learning_rate": 7.146365546935875e-05, "loss": 0.0398, "step": 5586 }, { "epoch": 0.7184620000857302, "grad_norm": 0.173828125, "learning_rate": 7.145452683924151e-05, "loss": 0.0394, "step": 5587 }, { "epoch": 0.7185905953962879, "grad_norm": 0.1865234375, "learning_rate": 7.144539733251738e-05, "loss": 0.0482, "step": 5588 }, { "epoch": 0.7187191907068455, "grad_norm": 0.1474609375, "learning_rate": 7.143626694955938e-05, "loss": 0.0342, "step": 5589 }, { "epoch": 0.7188477860174032, "grad_norm": 0.1884765625, "learning_rate": 7.142713569074059e-05, "loss": 0.0523, "step": 5590 }, { "epoch": 0.7189763813279609, "grad_norm": 0.166015625, "learning_rate": 7.141800355643408e-05, "loss": 0.0452, "step": 5591 }, { "epoch": 0.7191049766385186, "grad_norm": 0.169921875, "learning_rate": 7.1408870547013e-05, "loss": 0.0484, "step": 5592 }, { "epoch": 0.7192335719490762, "grad_norm": 0.1484375, "learning_rate": 7.13997366628505e-05, "loss": 0.0292, "step": 5593 }, { "epoch": 0.719362167259634, "grad_norm": 0.1728515625, "learning_rate": 7.139060190431975e-05, "loss": 0.0419, "step": 5594 }, { "epoch": 0.7194907625701916, "grad_norm": 0.16796875, "learning_rate": 7.138146627179406e-05, "loss": 0.0392, "step": 5595 }, { "epoch": 0.7196193578807493, "grad_norm": 0.177734375, "learning_rate": 7.137232976564663e-05, "loss": 0.0453, "step": 5596 }, { "epoch": 0.7197479531913069, "grad_norm": 0.1689453125, "learning_rate": 7.13631923862508e-05, "loss": 0.0401, "step": 5597 }, { "epoch": 0.7198765485018647, "grad_norm": 0.1572265625, "learning_rate": 7.135405413397991e-05, "loss": 0.0373, "step": 5598 }, { "epoch": 0.7200051438124223, "grad_norm": 0.234375, "learning_rate": 7.134491500920733e-05, "loss": 0.0511, "step": 5599 }, { "epoch": 0.72013373912298, "grad_norm": 0.171875, "learning_rate": 7.133577501230648e-05, "loss": 0.0395, "step": 5600 }, { "epoch": 0.7202623344335377, "grad_norm": 0.1767578125, "learning_rate": 7.132663414365081e-05, "loss": 0.041, "step": 5601 }, { "epoch": 0.7203909297440954, "grad_norm": 0.1953125, "learning_rate": 7.131749240361382e-05, "loss": 0.05, "step": 5602 }, { "epoch": 0.720519525054653, "grad_norm": 0.1611328125, "learning_rate": 7.130834979256899e-05, "loss": 0.0369, "step": 5603 }, { "epoch": 0.7206481203652106, "grad_norm": 0.1767578125, "learning_rate": 7.129920631088993e-05, "loss": 0.0476, "step": 5604 }, { "epoch": 0.7207767156757684, "grad_norm": 0.18359375, "learning_rate": 7.129006195895018e-05, "loss": 0.0454, "step": 5605 }, { "epoch": 0.720905310986326, "grad_norm": 0.197265625, "learning_rate": 7.128091673712341e-05, "loss": 0.0566, "step": 5606 }, { "epoch": 0.7210339062968837, "grad_norm": 0.2041015625, "learning_rate": 7.127177064578325e-05, "loss": 0.0514, "step": 5607 }, { "epoch": 0.7211625016074413, "grad_norm": 0.1669921875, "learning_rate": 7.12626236853034e-05, "loss": 0.043, "step": 5608 }, { "epoch": 0.7212910969179991, "grad_norm": 0.1923828125, "learning_rate": 7.125347585605763e-05, "loss": 0.0439, "step": 5609 }, { "epoch": 0.7214196922285567, "grad_norm": 0.1796875, "learning_rate": 7.124432715841968e-05, "loss": 0.0504, "step": 5610 }, { "epoch": 0.7215482875391144, "grad_norm": 0.158203125, "learning_rate": 7.123517759276336e-05, "loss": 0.0369, "step": 5611 }, { "epoch": 0.7216768828496721, "grad_norm": 0.189453125, "learning_rate": 7.122602715946252e-05, "loss": 0.0437, "step": 5612 }, { "epoch": 0.7218054781602298, "grad_norm": 0.1513671875, "learning_rate": 7.121687585889102e-05, "loss": 0.0339, "step": 5613 }, { "epoch": 0.7219340734707874, "grad_norm": 0.158203125, "learning_rate": 7.120772369142279e-05, "loss": 0.0354, "step": 5614 }, { "epoch": 0.7220626687813451, "grad_norm": 0.162109375, "learning_rate": 7.119857065743174e-05, "loss": 0.0436, "step": 5615 }, { "epoch": 0.7221912640919028, "grad_norm": 0.1787109375, "learning_rate": 7.118941675729191e-05, "loss": 0.0389, "step": 5616 }, { "epoch": 0.7223198594024605, "grad_norm": 0.177734375, "learning_rate": 7.118026199137728e-05, "loss": 0.0424, "step": 5617 }, { "epoch": 0.7224484547130181, "grad_norm": 0.1796875, "learning_rate": 7.11711063600619e-05, "loss": 0.0417, "step": 5618 }, { "epoch": 0.7225770500235759, "grad_norm": 0.173828125, "learning_rate": 7.116194986371987e-05, "loss": 0.0428, "step": 5619 }, { "epoch": 0.7227056453341335, "grad_norm": 0.1484375, "learning_rate": 7.11527925027253e-05, "loss": 0.04, "step": 5620 }, { "epoch": 0.7228342406446912, "grad_norm": 0.1611328125, "learning_rate": 7.114363427745237e-05, "loss": 0.0388, "step": 5621 }, { "epoch": 0.7229628359552488, "grad_norm": 0.1923828125, "learning_rate": 7.113447518827527e-05, "loss": 0.0498, "step": 5622 }, { "epoch": 0.7230914312658066, "grad_norm": 0.1767578125, "learning_rate": 7.112531523556822e-05, "loss": 0.0386, "step": 5623 }, { "epoch": 0.7232200265763642, "grad_norm": 0.1708984375, "learning_rate": 7.11161544197055e-05, "loss": 0.0449, "step": 5624 }, { "epoch": 0.7233486218869218, "grad_norm": 0.158203125, "learning_rate": 7.110699274106139e-05, "loss": 0.0428, "step": 5625 }, { "epoch": 0.7234772171974795, "grad_norm": 0.1572265625, "learning_rate": 7.109783020001023e-05, "loss": 0.0344, "step": 5626 }, { "epoch": 0.7236058125080372, "grad_norm": 0.1650390625, "learning_rate": 7.10886667969264e-05, "loss": 0.037, "step": 5627 }, { "epoch": 0.7237344078185949, "grad_norm": 0.166015625, "learning_rate": 7.107950253218431e-05, "loss": 0.0424, "step": 5628 }, { "epoch": 0.7238630031291525, "grad_norm": 0.169921875, "learning_rate": 7.107033740615839e-05, "loss": 0.0397, "step": 5629 }, { "epoch": 0.7239915984397103, "grad_norm": 0.1630859375, "learning_rate": 7.106117141922313e-05, "loss": 0.0361, "step": 5630 }, { "epoch": 0.7241201937502679, "grad_norm": 0.1689453125, "learning_rate": 7.105200457175301e-05, "loss": 0.0409, "step": 5631 }, { "epoch": 0.7242487890608256, "grad_norm": 0.193359375, "learning_rate": 7.10428368641226e-05, "loss": 0.0464, "step": 5632 }, { "epoch": 0.7243773843713832, "grad_norm": 0.1904296875, "learning_rate": 7.103366829670649e-05, "loss": 0.0462, "step": 5633 }, { "epoch": 0.724505979681941, "grad_norm": 0.17578125, "learning_rate": 7.102449886987927e-05, "loss": 0.0452, "step": 5634 }, { "epoch": 0.7246345749924986, "grad_norm": 0.181640625, "learning_rate": 7.101532858401562e-05, "loss": 0.0362, "step": 5635 }, { "epoch": 0.7247631703030563, "grad_norm": 0.17578125, "learning_rate": 7.100615743949021e-05, "loss": 0.0417, "step": 5636 }, { "epoch": 0.7248917656136139, "grad_norm": 0.1630859375, "learning_rate": 7.099698543667778e-05, "loss": 0.0368, "step": 5637 }, { "epoch": 0.7250203609241717, "grad_norm": 0.1611328125, "learning_rate": 7.098781257595307e-05, "loss": 0.0374, "step": 5638 }, { "epoch": 0.7251489562347293, "grad_norm": 0.19140625, "learning_rate": 7.097863885769088e-05, "loss": 0.0462, "step": 5639 }, { "epoch": 0.725277551545287, "grad_norm": 0.177734375, "learning_rate": 7.096946428226604e-05, "loss": 0.0435, "step": 5640 }, { "epoch": 0.7254061468558447, "grad_norm": 0.169921875, "learning_rate": 7.096028885005338e-05, "loss": 0.0374, "step": 5641 }, { "epoch": 0.7255347421664023, "grad_norm": 0.1767578125, "learning_rate": 7.095111256142786e-05, "loss": 0.0392, "step": 5642 }, { "epoch": 0.72566333747696, "grad_norm": 0.1845703125, "learning_rate": 7.094193541676436e-05, "loss": 0.0396, "step": 5643 }, { "epoch": 0.7257919327875176, "grad_norm": 0.1640625, "learning_rate": 7.093275741643786e-05, "loss": 0.0402, "step": 5644 }, { "epoch": 0.7259205280980754, "grad_norm": 0.201171875, "learning_rate": 7.092357856082338e-05, "loss": 0.0555, "step": 5645 }, { "epoch": 0.726049123408633, "grad_norm": 0.1796875, "learning_rate": 7.091439885029592e-05, "loss": 0.0397, "step": 5646 }, { "epoch": 0.7261777187191907, "grad_norm": 0.1572265625, "learning_rate": 7.09052182852306e-05, "loss": 0.033, "step": 5647 }, { "epoch": 0.7263063140297484, "grad_norm": 0.171875, "learning_rate": 7.08960368660025e-05, "loss": 0.0378, "step": 5648 }, { "epoch": 0.7264349093403061, "grad_norm": 0.1650390625, "learning_rate": 7.088685459298676e-05, "loss": 0.0359, "step": 5649 }, { "epoch": 0.7265635046508637, "grad_norm": 0.146484375, "learning_rate": 7.087767146655858e-05, "loss": 0.0286, "step": 5650 }, { "epoch": 0.7266920999614214, "grad_norm": 0.16796875, "learning_rate": 7.086848748709313e-05, "loss": 0.0424, "step": 5651 }, { "epoch": 0.7268206952719791, "grad_norm": 0.19921875, "learning_rate": 7.08593026549657e-05, "loss": 0.0529, "step": 5652 }, { "epoch": 0.7269492905825368, "grad_norm": 0.2158203125, "learning_rate": 7.085011697055153e-05, "loss": 0.0493, "step": 5653 }, { "epoch": 0.7270778858930944, "grad_norm": 0.15234375, "learning_rate": 7.084093043422598e-05, "loss": 0.0332, "step": 5654 }, { "epoch": 0.727206481203652, "grad_norm": 0.2158203125, "learning_rate": 7.083174304636437e-05, "loss": 0.0465, "step": 5655 }, { "epoch": 0.7273350765142098, "grad_norm": 0.19140625, "learning_rate": 7.08225548073421e-05, "loss": 0.0419, "step": 5656 }, { "epoch": 0.7274636718247675, "grad_norm": 0.1689453125, "learning_rate": 7.081336571753459e-05, "loss": 0.0461, "step": 5657 }, { "epoch": 0.7275922671353251, "grad_norm": 0.166015625, "learning_rate": 7.080417577731727e-05, "loss": 0.0372, "step": 5658 }, { "epoch": 0.7277208624458829, "grad_norm": 0.162109375, "learning_rate": 7.079498498706568e-05, "loss": 0.0373, "step": 5659 }, { "epoch": 0.7278494577564405, "grad_norm": 0.166015625, "learning_rate": 7.07857933471553e-05, "loss": 0.034, "step": 5660 }, { "epoch": 0.7279780530669981, "grad_norm": 0.2060546875, "learning_rate": 7.077660085796172e-05, "loss": 0.0469, "step": 5661 }, { "epoch": 0.7281066483775558, "grad_norm": 0.1943359375, "learning_rate": 7.07674075198605e-05, "loss": 0.0492, "step": 5662 }, { "epoch": 0.7282352436881135, "grad_norm": 0.16796875, "learning_rate": 7.075821333322732e-05, "loss": 0.0344, "step": 5663 }, { "epoch": 0.7283638389986712, "grad_norm": 0.1748046875, "learning_rate": 7.074901829843781e-05, "loss": 0.0399, "step": 5664 }, { "epoch": 0.7284924343092288, "grad_norm": 0.189453125, "learning_rate": 7.073982241586765e-05, "loss": 0.0453, "step": 5665 }, { "epoch": 0.7286210296197866, "grad_norm": 0.1806640625, "learning_rate": 7.073062568589262e-05, "loss": 0.0476, "step": 5666 }, { "epoch": 0.7287496249303442, "grad_norm": 0.16015625, "learning_rate": 7.072142810888844e-05, "loss": 0.0394, "step": 5667 }, { "epoch": 0.7288782202409019, "grad_norm": 0.173828125, "learning_rate": 7.071222968523095e-05, "loss": 0.0439, "step": 5668 }, { "epoch": 0.7290068155514595, "grad_norm": 0.166015625, "learning_rate": 7.070303041529596e-05, "loss": 0.0391, "step": 5669 }, { "epoch": 0.7291354108620173, "grad_norm": 0.1728515625, "learning_rate": 7.069383029945936e-05, "loss": 0.046, "step": 5670 }, { "epoch": 0.7292640061725749, "grad_norm": 0.166015625, "learning_rate": 7.068462933809706e-05, "loss": 0.0332, "step": 5671 }, { "epoch": 0.7293926014831326, "grad_norm": 0.1708984375, "learning_rate": 7.067542753158499e-05, "loss": 0.0396, "step": 5672 }, { "epoch": 0.7295211967936902, "grad_norm": 0.185546875, "learning_rate": 7.066622488029912e-05, "loss": 0.0453, "step": 5673 }, { "epoch": 0.729649792104248, "grad_norm": 0.1962890625, "learning_rate": 7.065702138461546e-05, "loss": 0.0466, "step": 5674 }, { "epoch": 0.7297783874148056, "grad_norm": 0.1640625, "learning_rate": 7.064781704491006e-05, "loss": 0.0384, "step": 5675 }, { "epoch": 0.7299069827253633, "grad_norm": 0.2060546875, "learning_rate": 7.0638611861559e-05, "loss": 0.0504, "step": 5676 }, { "epoch": 0.730035578035921, "grad_norm": 0.18359375, "learning_rate": 7.062940583493837e-05, "loss": 0.0494, "step": 5677 }, { "epoch": 0.7301641733464787, "grad_norm": 0.1875, "learning_rate": 7.062019896542438e-05, "loss": 0.0442, "step": 5678 }, { "epoch": 0.7302927686570363, "grad_norm": 0.1796875, "learning_rate": 7.061099125339313e-05, "loss": 0.0423, "step": 5679 }, { "epoch": 0.7304213639675939, "grad_norm": 0.1728515625, "learning_rate": 7.06017826992209e-05, "loss": 0.0333, "step": 5680 }, { "epoch": 0.7305499592781517, "grad_norm": 0.1748046875, "learning_rate": 7.05925733032839e-05, "loss": 0.0397, "step": 5681 }, { "epoch": 0.7306785545887093, "grad_norm": 0.16015625, "learning_rate": 7.058336306595844e-05, "loss": 0.0352, "step": 5682 }, { "epoch": 0.730807149899267, "grad_norm": 0.1708984375, "learning_rate": 7.057415198762085e-05, "loss": 0.039, "step": 5683 }, { "epoch": 0.7309357452098246, "grad_norm": 0.171875, "learning_rate": 7.056494006864743e-05, "loss": 0.0453, "step": 5684 }, { "epoch": 0.7310643405203824, "grad_norm": 0.1591796875, "learning_rate": 7.055572730941463e-05, "loss": 0.0313, "step": 5685 }, { "epoch": 0.73119293583094, "grad_norm": 0.1728515625, "learning_rate": 7.054651371029884e-05, "loss": 0.0455, "step": 5686 }, { "epoch": 0.7313215311414977, "grad_norm": 0.189453125, "learning_rate": 7.053729927167651e-05, "loss": 0.0488, "step": 5687 }, { "epoch": 0.7314501264520554, "grad_norm": 0.1767578125, "learning_rate": 7.052808399392417e-05, "loss": 0.0423, "step": 5688 }, { "epoch": 0.7315787217626131, "grad_norm": 0.15625, "learning_rate": 7.05188678774183e-05, "loss": 0.0386, "step": 5689 }, { "epoch": 0.7317073170731707, "grad_norm": 0.1953125, "learning_rate": 7.05096509225355e-05, "loss": 0.0518, "step": 5690 }, { "epoch": 0.7318359123837284, "grad_norm": 0.1640625, "learning_rate": 7.050043312965232e-05, "loss": 0.0397, "step": 5691 }, { "epoch": 0.7319645076942861, "grad_norm": 0.181640625, "learning_rate": 7.049121449914541e-05, "loss": 0.0433, "step": 5692 }, { "epoch": 0.7320931030048438, "grad_norm": 0.1826171875, "learning_rate": 7.048199503139146e-05, "loss": 0.0468, "step": 5693 }, { "epoch": 0.7322216983154014, "grad_norm": 0.16796875, "learning_rate": 7.047277472676713e-05, "loss": 0.0341, "step": 5694 }, { "epoch": 0.7323502936259592, "grad_norm": 0.19140625, "learning_rate": 7.046355358564916e-05, "loss": 0.0512, "step": 5695 }, { "epoch": 0.7324788889365168, "grad_norm": 0.1708984375, "learning_rate": 7.04543316084143e-05, "loss": 0.0484, "step": 5696 }, { "epoch": 0.7326074842470744, "grad_norm": 0.1708984375, "learning_rate": 7.044510879543938e-05, "loss": 0.0433, "step": 5697 }, { "epoch": 0.7327360795576321, "grad_norm": 0.1591796875, "learning_rate": 7.043588514710122e-05, "loss": 0.0326, "step": 5698 }, { "epoch": 0.7328646748681898, "grad_norm": 0.171875, "learning_rate": 7.04266606637767e-05, "loss": 0.0385, "step": 5699 }, { "epoch": 0.7329932701787475, "grad_norm": 0.1669921875, "learning_rate": 7.04174353458427e-05, "loss": 0.0407, "step": 5700 }, { "epoch": 0.7331218654893051, "grad_norm": 0.1787109375, "learning_rate": 7.040820919367616e-05, "loss": 0.0346, "step": 5701 }, { "epoch": 0.7332504607998628, "grad_norm": 0.181640625, "learning_rate": 7.039898220765405e-05, "loss": 0.045, "step": 5702 }, { "epoch": 0.7333790561104205, "grad_norm": 0.1669921875, "learning_rate": 7.038975438815338e-05, "loss": 0.037, "step": 5703 }, { "epoch": 0.7335076514209782, "grad_norm": 0.15625, "learning_rate": 7.038052573555119e-05, "loss": 0.0366, "step": 5704 }, { "epoch": 0.7336362467315358, "grad_norm": 0.193359375, "learning_rate": 7.037129625022454e-05, "loss": 0.0392, "step": 5705 }, { "epoch": 0.7337648420420936, "grad_norm": 0.1826171875, "learning_rate": 7.036206593255056e-05, "loss": 0.0393, "step": 5706 }, { "epoch": 0.7338934373526512, "grad_norm": 0.2333984375, "learning_rate": 7.035283478290635e-05, "loss": 0.0332, "step": 5707 }, { "epoch": 0.7340220326632089, "grad_norm": 0.166015625, "learning_rate": 7.034360280166912e-05, "loss": 0.039, "step": 5708 }, { "epoch": 0.7341506279737665, "grad_norm": 0.171875, "learning_rate": 7.033436998921607e-05, "loss": 0.0376, "step": 5709 }, { "epoch": 0.7342792232843243, "grad_norm": 0.1611328125, "learning_rate": 7.032513634592443e-05, "loss": 0.0348, "step": 5710 }, { "epoch": 0.7344078185948819, "grad_norm": 0.1884765625, "learning_rate": 7.031590187217151e-05, "loss": 0.0441, "step": 5711 }, { "epoch": 0.7345364139054396, "grad_norm": 0.1796875, "learning_rate": 7.030666656833457e-05, "loss": 0.0485, "step": 5712 }, { "epoch": 0.7346650092159973, "grad_norm": 0.169921875, "learning_rate": 7.029743043479097e-05, "loss": 0.0385, "step": 5713 }, { "epoch": 0.734793604526555, "grad_norm": 0.15234375, "learning_rate": 7.028819347191811e-05, "loss": 0.0356, "step": 5714 }, { "epoch": 0.7349221998371126, "grad_norm": 0.1806640625, "learning_rate": 7.027895568009338e-05, "loss": 0.0432, "step": 5715 }, { "epoch": 0.7350507951476702, "grad_norm": 0.169921875, "learning_rate": 7.026971705969423e-05, "loss": 0.0403, "step": 5716 }, { "epoch": 0.735179390458228, "grad_norm": 0.18359375, "learning_rate": 7.026047761109813e-05, "loss": 0.0411, "step": 5717 }, { "epoch": 0.7353079857687856, "grad_norm": 0.1826171875, "learning_rate": 7.025123733468262e-05, "loss": 0.0458, "step": 5718 }, { "epoch": 0.7354365810793433, "grad_norm": 0.18359375, "learning_rate": 7.024199623082522e-05, "loss": 0.0442, "step": 5719 }, { "epoch": 0.7355651763899009, "grad_norm": 0.16015625, "learning_rate": 7.023275429990353e-05, "loss": 0.0292, "step": 5720 }, { "epoch": 0.7356937717004587, "grad_norm": 0.1953125, "learning_rate": 7.022351154229514e-05, "loss": 0.0476, "step": 5721 }, { "epoch": 0.7358223670110163, "grad_norm": 0.1787109375, "learning_rate": 7.021426795837774e-05, "loss": 0.0448, "step": 5722 }, { "epoch": 0.735950962321574, "grad_norm": 0.177734375, "learning_rate": 7.020502354852898e-05, "loss": 0.0434, "step": 5723 }, { "epoch": 0.7360795576321317, "grad_norm": 0.2001953125, "learning_rate": 7.019577831312657e-05, "loss": 0.0434, "step": 5724 }, { "epoch": 0.7362081529426894, "grad_norm": 0.181640625, "learning_rate": 7.018653225254828e-05, "loss": 0.043, "step": 5725 }, { "epoch": 0.736336748253247, "grad_norm": 0.185546875, "learning_rate": 7.017728536717187e-05, "loss": 0.0426, "step": 5726 }, { "epoch": 0.7364653435638047, "grad_norm": 0.1923828125, "learning_rate": 7.016803765737518e-05, "loss": 0.0502, "step": 5727 }, { "epoch": 0.7365939388743624, "grad_norm": 0.177734375, "learning_rate": 7.015878912353606e-05, "loss": 0.0416, "step": 5728 }, { "epoch": 0.7367225341849201, "grad_norm": 0.1611328125, "learning_rate": 7.014953976603238e-05, "loss": 0.0371, "step": 5729 }, { "epoch": 0.7368511294954777, "grad_norm": 0.18359375, "learning_rate": 7.014028958524207e-05, "loss": 0.0463, "step": 5730 }, { "epoch": 0.7369797248060354, "grad_norm": 0.166015625, "learning_rate": 7.013103858154307e-05, "loss": 0.0365, "step": 5731 }, { "epoch": 0.7371083201165931, "grad_norm": 0.1748046875, "learning_rate": 7.012178675531337e-05, "loss": 0.0458, "step": 5732 }, { "epoch": 0.7372369154271508, "grad_norm": 0.1572265625, "learning_rate": 7.011253410693099e-05, "loss": 0.0399, "step": 5733 }, { "epoch": 0.7373655107377084, "grad_norm": 0.1513671875, "learning_rate": 7.010328063677398e-05, "loss": 0.0397, "step": 5734 }, { "epoch": 0.7374941060482662, "grad_norm": 0.1796875, "learning_rate": 7.009402634522045e-05, "loss": 0.0366, "step": 5735 }, { "epoch": 0.7376227013588238, "grad_norm": 0.166015625, "learning_rate": 7.008477123264848e-05, "loss": 0.0386, "step": 5736 }, { "epoch": 0.7377512966693814, "grad_norm": 0.1748046875, "learning_rate": 7.007551529943623e-05, "loss": 0.0406, "step": 5737 }, { "epoch": 0.7378798919799391, "grad_norm": 0.1953125, "learning_rate": 7.006625854596193e-05, "loss": 0.0492, "step": 5738 }, { "epoch": 0.7380084872904968, "grad_norm": 0.189453125, "learning_rate": 7.005700097260374e-05, "loss": 0.0504, "step": 5739 }, { "epoch": 0.7381370826010545, "grad_norm": 0.193359375, "learning_rate": 7.004774257973996e-05, "loss": 0.0487, "step": 5740 }, { "epoch": 0.7382656779116121, "grad_norm": 0.1982421875, "learning_rate": 7.003848336774883e-05, "loss": 0.0469, "step": 5741 }, { "epoch": 0.7383942732221699, "grad_norm": 0.185546875, "learning_rate": 7.002922333700872e-05, "loss": 0.0476, "step": 5742 }, { "epoch": 0.7385228685327275, "grad_norm": 0.1689453125, "learning_rate": 7.001996248789796e-05, "loss": 0.0356, "step": 5743 }, { "epoch": 0.7386514638432852, "grad_norm": 0.1630859375, "learning_rate": 7.001070082079495e-05, "loss": 0.0414, "step": 5744 }, { "epoch": 0.7387800591538428, "grad_norm": 0.2021484375, "learning_rate": 7.000143833607808e-05, "loss": 0.0533, "step": 5745 }, { "epoch": 0.7389086544644006, "grad_norm": 0.1796875, "learning_rate": 6.999217503412583e-05, "loss": 0.0425, "step": 5746 }, { "epoch": 0.7390372497749582, "grad_norm": 0.1875, "learning_rate": 6.99829109153167e-05, "loss": 0.0488, "step": 5747 }, { "epoch": 0.7391658450855159, "grad_norm": 0.162109375, "learning_rate": 6.99736459800292e-05, "loss": 0.0356, "step": 5748 }, { "epoch": 0.7392944403960735, "grad_norm": 0.171875, "learning_rate": 6.996438022864186e-05, "loss": 0.0428, "step": 5749 }, { "epoch": 0.7394230357066313, "grad_norm": 0.1650390625, "learning_rate": 6.995511366153327e-05, "loss": 0.0371, "step": 5750 }, { "epoch": 0.7395516310171889, "grad_norm": 0.1669921875, "learning_rate": 6.99458462790821e-05, "loss": 0.0418, "step": 5751 }, { "epoch": 0.7396802263277465, "grad_norm": 0.1650390625, "learning_rate": 6.993657808166696e-05, "loss": 0.0378, "step": 5752 }, { "epoch": 0.7398088216383043, "grad_norm": 0.1748046875, "learning_rate": 6.992730906966654e-05, "loss": 0.0436, "step": 5753 }, { "epoch": 0.739937416948862, "grad_norm": 0.1767578125, "learning_rate": 6.991803924345959e-05, "loss": 0.0407, "step": 5754 }, { "epoch": 0.7400660122594196, "grad_norm": 0.15625, "learning_rate": 6.990876860342484e-05, "loss": 0.0375, "step": 5755 }, { "epoch": 0.7401946075699772, "grad_norm": 0.173828125, "learning_rate": 6.989949714994108e-05, "loss": 0.0465, "step": 5756 }, { "epoch": 0.740323202880535, "grad_norm": 0.16796875, "learning_rate": 6.989022488338712e-05, "loss": 0.0405, "step": 5757 }, { "epoch": 0.7404517981910926, "grad_norm": 0.1689453125, "learning_rate": 6.988095180414184e-05, "loss": 0.0394, "step": 5758 }, { "epoch": 0.7405803935016503, "grad_norm": 0.1650390625, "learning_rate": 6.987167791258412e-05, "loss": 0.0367, "step": 5759 }, { "epoch": 0.740708988812208, "grad_norm": 0.169921875, "learning_rate": 6.986240320909287e-05, "loss": 0.0423, "step": 5760 }, { "epoch": 0.7408375841227657, "grad_norm": 0.15625, "learning_rate": 6.985312769404705e-05, "loss": 0.0395, "step": 5761 }, { "epoch": 0.7409661794333233, "grad_norm": 0.166015625, "learning_rate": 6.984385136782563e-05, "loss": 0.0407, "step": 5762 }, { "epoch": 0.741094774743881, "grad_norm": 0.1748046875, "learning_rate": 6.983457423080766e-05, "loss": 0.0423, "step": 5763 }, { "epoch": 0.7412233700544387, "grad_norm": 0.171875, "learning_rate": 6.982529628337218e-05, "loss": 0.0369, "step": 5764 }, { "epoch": 0.7413519653649964, "grad_norm": 0.1796875, "learning_rate": 6.981601752589826e-05, "loss": 0.0377, "step": 5765 }, { "epoch": 0.741480560675554, "grad_norm": 0.17578125, "learning_rate": 6.980673795876505e-05, "loss": 0.0449, "step": 5766 }, { "epoch": 0.7416091559861117, "grad_norm": 0.185546875, "learning_rate": 6.979745758235167e-05, "loss": 0.0424, "step": 5767 }, { "epoch": 0.7417377512966694, "grad_norm": 0.177734375, "learning_rate": 6.978817639703733e-05, "loss": 0.0426, "step": 5768 }, { "epoch": 0.7418663466072271, "grad_norm": 0.18359375, "learning_rate": 6.977889440320124e-05, "loss": 0.0452, "step": 5769 }, { "epoch": 0.7419949419177847, "grad_norm": 0.1884765625, "learning_rate": 6.976961160122264e-05, "loss": 0.0414, "step": 5770 }, { "epoch": 0.7421235372283425, "grad_norm": 0.17578125, "learning_rate": 6.976032799148084e-05, "loss": 0.0434, "step": 5771 }, { "epoch": 0.7422521325389001, "grad_norm": 0.173828125, "learning_rate": 6.975104357435513e-05, "loss": 0.0452, "step": 5772 }, { "epoch": 0.7423807278494577, "grad_norm": 0.1875, "learning_rate": 6.97417583502249e-05, "loss": 0.0438, "step": 5773 }, { "epoch": 0.7425093231600154, "grad_norm": 0.16796875, "learning_rate": 6.973247231946947e-05, "loss": 0.0385, "step": 5774 }, { "epoch": 0.7426379184705731, "grad_norm": 0.1748046875, "learning_rate": 6.97231854824683e-05, "loss": 0.046, "step": 5775 }, { "epoch": 0.7427665137811308, "grad_norm": 0.1826171875, "learning_rate": 6.971389783960084e-05, "loss": 0.0456, "step": 5776 }, { "epoch": 0.7428951090916884, "grad_norm": 0.1591796875, "learning_rate": 6.970460939124657e-05, "loss": 0.036, "step": 5777 }, { "epoch": 0.7430237044022461, "grad_norm": 0.1611328125, "learning_rate": 6.969532013778499e-05, "loss": 0.0368, "step": 5778 }, { "epoch": 0.7431522997128038, "grad_norm": 0.1708984375, "learning_rate": 6.968603007959566e-05, "loss": 0.0402, "step": 5779 }, { "epoch": 0.7432808950233615, "grad_norm": 0.1845703125, "learning_rate": 6.967673921705816e-05, "loss": 0.0482, "step": 5780 }, { "epoch": 0.7434094903339191, "grad_norm": 0.177734375, "learning_rate": 6.96674475505521e-05, "loss": 0.042, "step": 5781 }, { "epoch": 0.7435380856444769, "grad_norm": 0.171875, "learning_rate": 6.965815508045713e-05, "loss": 0.0408, "step": 5782 }, { "epoch": 0.7436666809550345, "grad_norm": 0.1767578125, "learning_rate": 6.964886180715294e-05, "loss": 0.0435, "step": 5783 }, { "epoch": 0.7437952762655922, "grad_norm": 0.171875, "learning_rate": 6.963956773101921e-05, "loss": 0.0425, "step": 5784 }, { "epoch": 0.7439238715761498, "grad_norm": 0.1748046875, "learning_rate": 6.963027285243571e-05, "loss": 0.0469, "step": 5785 }, { "epoch": 0.7440524668867076, "grad_norm": 0.169921875, "learning_rate": 6.962097717178221e-05, "loss": 0.0359, "step": 5786 }, { "epoch": 0.7441810621972652, "grad_norm": 0.19140625, "learning_rate": 6.961168068943853e-05, "loss": 0.0519, "step": 5787 }, { "epoch": 0.7443096575078229, "grad_norm": 0.1845703125, "learning_rate": 6.960238340578451e-05, "loss": 0.0461, "step": 5788 }, { "epoch": 0.7444382528183806, "grad_norm": 0.185546875, "learning_rate": 6.959308532120001e-05, "loss": 0.046, "step": 5789 }, { "epoch": 0.7445668481289383, "grad_norm": 0.1728515625, "learning_rate": 6.958378643606498e-05, "loss": 0.0439, "step": 5790 }, { "epoch": 0.7446954434394959, "grad_norm": 0.16796875, "learning_rate": 6.957448675075931e-05, "loss": 0.0376, "step": 5791 }, { "epoch": 0.7448240387500535, "grad_norm": 0.171875, "learning_rate": 6.956518626566301e-05, "loss": 0.0426, "step": 5792 }, { "epoch": 0.7449526340606113, "grad_norm": 0.19921875, "learning_rate": 6.955588498115606e-05, "loss": 0.0497, "step": 5793 }, { "epoch": 0.7450812293711689, "grad_norm": 0.1640625, "learning_rate": 6.954658289761853e-05, "loss": 0.0435, "step": 5794 }, { "epoch": 0.7452098246817266, "grad_norm": 0.177734375, "learning_rate": 6.953728001543049e-05, "loss": 0.0424, "step": 5795 }, { "epoch": 0.7453384199922842, "grad_norm": 0.1455078125, "learning_rate": 6.9527976334972e-05, "loss": 0.0328, "step": 5796 }, { "epoch": 0.745467015302842, "grad_norm": 0.1708984375, "learning_rate": 6.951867185662325e-05, "loss": 0.0422, "step": 5797 }, { "epoch": 0.7455956106133996, "grad_norm": 0.2060546875, "learning_rate": 6.950936658076438e-05, "loss": 0.0458, "step": 5798 }, { "epoch": 0.7457242059239573, "grad_norm": 0.181640625, "learning_rate": 6.950006050777561e-05, "loss": 0.0463, "step": 5799 }, { "epoch": 0.745852801234515, "grad_norm": 0.21875, "learning_rate": 6.949075363803716e-05, "loss": 0.0487, "step": 5800 }, { "epoch": 0.7459813965450727, "grad_norm": 0.171875, "learning_rate": 6.94814459719293e-05, "loss": 0.0386, "step": 5801 }, { "epoch": 0.7461099918556303, "grad_norm": 0.169921875, "learning_rate": 6.947213750983235e-05, "loss": 0.0434, "step": 5802 }, { "epoch": 0.746238587166188, "grad_norm": 0.177734375, "learning_rate": 6.946282825212663e-05, "loss": 0.0499, "step": 5803 }, { "epoch": 0.7463671824767457, "grad_norm": 0.189453125, "learning_rate": 6.945351819919249e-05, "loss": 0.0432, "step": 5804 }, { "epoch": 0.7464957777873034, "grad_norm": 0.1826171875, "learning_rate": 6.944420735141036e-05, "loss": 0.043, "step": 5805 }, { "epoch": 0.746624373097861, "grad_norm": 0.17578125, "learning_rate": 6.943489570916064e-05, "loss": 0.0388, "step": 5806 }, { "epoch": 0.7467529684084188, "grad_norm": 0.1650390625, "learning_rate": 6.94255832728238e-05, "loss": 0.0371, "step": 5807 }, { "epoch": 0.7468815637189764, "grad_norm": 0.166015625, "learning_rate": 6.941627004278035e-05, "loss": 0.0391, "step": 5808 }, { "epoch": 0.747010159029534, "grad_norm": 0.162109375, "learning_rate": 6.94069560194108e-05, "loss": 0.0371, "step": 5809 }, { "epoch": 0.7471387543400917, "grad_norm": 0.1767578125, "learning_rate": 6.939764120309573e-05, "loss": 0.0445, "step": 5810 }, { "epoch": 0.7472673496506494, "grad_norm": 0.1630859375, "learning_rate": 6.938832559421571e-05, "loss": 0.0375, "step": 5811 }, { "epoch": 0.7473959449612071, "grad_norm": 0.1572265625, "learning_rate": 6.937900919315138e-05, "loss": 0.037, "step": 5812 }, { "epoch": 0.7475245402717647, "grad_norm": 0.173828125, "learning_rate": 6.936969200028338e-05, "loss": 0.0387, "step": 5813 }, { "epoch": 0.7476531355823224, "grad_norm": 0.1494140625, "learning_rate": 6.936037401599243e-05, "loss": 0.0346, "step": 5814 }, { "epoch": 0.7477817308928801, "grad_norm": 0.181640625, "learning_rate": 6.935105524065922e-05, "loss": 0.0354, "step": 5815 }, { "epoch": 0.7479103262034378, "grad_norm": 0.166015625, "learning_rate": 6.934173567466454e-05, "loss": 0.0325, "step": 5816 }, { "epoch": 0.7480389215139954, "grad_norm": 0.1708984375, "learning_rate": 6.933241531838913e-05, "loss": 0.0376, "step": 5817 }, { "epoch": 0.7481675168245532, "grad_norm": 0.16796875, "learning_rate": 6.932309417221385e-05, "loss": 0.0372, "step": 5818 }, { "epoch": 0.7482961121351108, "grad_norm": 0.193359375, "learning_rate": 6.931377223651954e-05, "loss": 0.042, "step": 5819 }, { "epoch": 0.7484247074456685, "grad_norm": 0.171875, "learning_rate": 6.930444951168707e-05, "loss": 0.0366, "step": 5820 }, { "epoch": 0.7485533027562261, "grad_norm": 0.1669921875, "learning_rate": 6.929512599809735e-05, "loss": 0.0421, "step": 5821 }, { "epoch": 0.7486818980667839, "grad_norm": 0.1611328125, "learning_rate": 6.928580169613135e-05, "loss": 0.033, "step": 5822 }, { "epoch": 0.7488104933773415, "grad_norm": 0.244140625, "learning_rate": 6.927647660617006e-05, "loss": 0.0469, "step": 5823 }, { "epoch": 0.7489390886878992, "grad_norm": 0.1787109375, "learning_rate": 6.926715072859445e-05, "loss": 0.0342, "step": 5824 }, { "epoch": 0.7490676839984568, "grad_norm": 0.1640625, "learning_rate": 6.92578240637856e-05, "loss": 0.0366, "step": 5825 }, { "epoch": 0.7491962793090146, "grad_norm": 0.1748046875, "learning_rate": 6.924849661212458e-05, "loss": 0.0434, "step": 5826 }, { "epoch": 0.7493248746195722, "grad_norm": 0.1591796875, "learning_rate": 6.923916837399249e-05, "loss": 0.0333, "step": 5827 }, { "epoch": 0.7494534699301298, "grad_norm": 0.19140625, "learning_rate": 6.922983934977047e-05, "loss": 0.0433, "step": 5828 }, { "epoch": 0.7495820652406876, "grad_norm": 0.17578125, "learning_rate": 6.92205095398397e-05, "loss": 0.0373, "step": 5829 }, { "epoch": 0.7497106605512452, "grad_norm": 0.1943359375, "learning_rate": 6.92111789445814e-05, "loss": 0.0393, "step": 5830 }, { "epoch": 0.7498392558618029, "grad_norm": 0.1630859375, "learning_rate": 6.920184756437677e-05, "loss": 0.0376, "step": 5831 }, { "epoch": 0.7499678511723605, "grad_norm": 0.1494140625, "learning_rate": 6.91925153996071e-05, "loss": 0.0333, "step": 5832 }, { "epoch": 0.7500964464829183, "grad_norm": 0.2099609375, "learning_rate": 6.918318245065371e-05, "loss": 0.0482, "step": 5833 }, { "epoch": 0.7502250417934759, "grad_norm": 0.21484375, "learning_rate": 6.917384871789789e-05, "loss": 0.0565, "step": 5834 }, { "epoch": 0.7503536371040336, "grad_norm": 0.16015625, "learning_rate": 6.916451420172106e-05, "loss": 0.0366, "step": 5835 }, { "epoch": 0.7504822324145913, "grad_norm": 0.1689453125, "learning_rate": 6.915517890250456e-05, "loss": 0.0344, "step": 5836 }, { "epoch": 0.750610827725149, "grad_norm": 0.1884765625, "learning_rate": 6.914584282062986e-05, "loss": 0.0412, "step": 5837 }, { "epoch": 0.7507394230357066, "grad_norm": 0.1484375, "learning_rate": 6.91365059564784e-05, "loss": 0.0393, "step": 5838 }, { "epoch": 0.7508680183462643, "grad_norm": 0.173828125, "learning_rate": 6.912716831043166e-05, "loss": 0.042, "step": 5839 }, { "epoch": 0.750996613656822, "grad_norm": 0.1552734375, "learning_rate": 6.911782988287123e-05, "loss": 0.0335, "step": 5840 }, { "epoch": 0.7511252089673797, "grad_norm": 0.158203125, "learning_rate": 6.910849067417859e-05, "loss": 0.0344, "step": 5841 }, { "epoch": 0.7512538042779373, "grad_norm": 0.1708984375, "learning_rate": 6.90991506847354e-05, "loss": 0.0392, "step": 5842 }, { "epoch": 0.751382399588495, "grad_norm": 0.19140625, "learning_rate": 6.908980991492322e-05, "loss": 0.0403, "step": 5843 }, { "epoch": 0.7515109948990527, "grad_norm": 0.158203125, "learning_rate": 6.908046836512373e-05, "loss": 0.0372, "step": 5844 }, { "epoch": 0.7516395902096104, "grad_norm": 0.1728515625, "learning_rate": 6.907112603571861e-05, "loss": 0.0424, "step": 5845 }, { "epoch": 0.751768185520168, "grad_norm": 0.1806640625, "learning_rate": 6.906178292708958e-05, "loss": 0.0434, "step": 5846 }, { "epoch": 0.7518967808307258, "grad_norm": 0.177734375, "learning_rate": 6.905243903961838e-05, "loss": 0.0436, "step": 5847 }, { "epoch": 0.7520253761412834, "grad_norm": 0.177734375, "learning_rate": 6.904309437368681e-05, "loss": 0.0428, "step": 5848 }, { "epoch": 0.752153971451841, "grad_norm": 0.1708984375, "learning_rate": 6.903374892967667e-05, "loss": 0.0405, "step": 5849 }, { "epoch": 0.7522825667623987, "grad_norm": 0.1767578125, "learning_rate": 6.902440270796979e-05, "loss": 0.0397, "step": 5850 }, { "epoch": 0.7524111620729564, "grad_norm": 0.1806640625, "learning_rate": 6.901505570894806e-05, "loss": 0.044, "step": 5851 }, { "epoch": 0.7525397573835141, "grad_norm": 0.1923828125, "learning_rate": 6.90057079329934e-05, "loss": 0.0539, "step": 5852 }, { "epoch": 0.7526683526940717, "grad_norm": 0.1865234375, "learning_rate": 6.899635938048772e-05, "loss": 0.0494, "step": 5853 }, { "epoch": 0.7527969480046295, "grad_norm": 0.17578125, "learning_rate": 6.898701005181303e-05, "loss": 0.0463, "step": 5854 }, { "epoch": 0.7529255433151871, "grad_norm": 0.1923828125, "learning_rate": 6.89776599473513e-05, "loss": 0.0432, "step": 5855 }, { "epoch": 0.7530541386257448, "grad_norm": 0.1845703125, "learning_rate": 6.896830906748456e-05, "loss": 0.0454, "step": 5856 }, { "epoch": 0.7531827339363024, "grad_norm": 0.1650390625, "learning_rate": 6.895895741259491e-05, "loss": 0.0433, "step": 5857 }, { "epoch": 0.7533113292468602, "grad_norm": 0.16015625, "learning_rate": 6.89496049830644e-05, "loss": 0.0366, "step": 5858 }, { "epoch": 0.7534399245574178, "grad_norm": 0.16796875, "learning_rate": 6.89402517792752e-05, "loss": 0.0396, "step": 5859 }, { "epoch": 0.7535685198679755, "grad_norm": 0.142578125, "learning_rate": 6.893089780160946e-05, "loss": 0.0361, "step": 5860 }, { "epoch": 0.7536971151785331, "grad_norm": 0.1923828125, "learning_rate": 6.892154305044938e-05, "loss": 0.0513, "step": 5861 }, { "epoch": 0.7538257104890909, "grad_norm": 0.1845703125, "learning_rate": 6.891218752617716e-05, "loss": 0.046, "step": 5862 }, { "epoch": 0.7539543057996485, "grad_norm": 0.1806640625, "learning_rate": 6.890283122917506e-05, "loss": 0.0542, "step": 5863 }, { "epoch": 0.7540829011102061, "grad_norm": 0.193359375, "learning_rate": 6.88934741598254e-05, "loss": 0.0368, "step": 5864 }, { "epoch": 0.7542114964207639, "grad_norm": 0.1572265625, "learning_rate": 6.888411631851047e-05, "loss": 0.0397, "step": 5865 }, { "epoch": 0.7543400917313215, "grad_norm": 0.1953125, "learning_rate": 6.887475770561264e-05, "loss": 0.051, "step": 5866 }, { "epoch": 0.7544686870418792, "grad_norm": 0.171875, "learning_rate": 6.886539832151425e-05, "loss": 0.042, "step": 5867 }, { "epoch": 0.7545972823524368, "grad_norm": 0.1962890625, "learning_rate": 6.885603816659776e-05, "loss": 0.0448, "step": 5868 }, { "epoch": 0.7547258776629946, "grad_norm": 0.173828125, "learning_rate": 6.88466772412456e-05, "loss": 0.047, "step": 5869 }, { "epoch": 0.7548544729735522, "grad_norm": 0.162109375, "learning_rate": 6.883731554584024e-05, "loss": 0.0414, "step": 5870 }, { "epoch": 0.7549830682841099, "grad_norm": 0.154296875, "learning_rate": 6.882795308076417e-05, "loss": 0.0347, "step": 5871 }, { "epoch": 0.7551116635946675, "grad_norm": 0.14453125, "learning_rate": 6.881858984639997e-05, "loss": 0.0338, "step": 5872 }, { "epoch": 0.7552402589052253, "grad_norm": 0.16796875, "learning_rate": 6.88092258431302e-05, "loss": 0.0388, "step": 5873 }, { "epoch": 0.7553688542157829, "grad_norm": 0.1787109375, "learning_rate": 6.879986107133745e-05, "loss": 0.044, "step": 5874 }, { "epoch": 0.7554974495263406, "grad_norm": 0.171875, "learning_rate": 6.879049553140435e-05, "loss": 0.0459, "step": 5875 }, { "epoch": 0.7556260448368983, "grad_norm": 0.162109375, "learning_rate": 6.878112922371358e-05, "loss": 0.0379, "step": 5876 }, { "epoch": 0.755754640147456, "grad_norm": 0.203125, "learning_rate": 6.877176214864782e-05, "loss": 0.0471, "step": 5877 }, { "epoch": 0.7558832354580136, "grad_norm": 0.1748046875, "learning_rate": 6.876239430658982e-05, "loss": 0.0419, "step": 5878 }, { "epoch": 0.7560118307685713, "grad_norm": 0.1708984375, "learning_rate": 6.875302569792233e-05, "loss": 0.0396, "step": 5879 }, { "epoch": 0.756140426079129, "grad_norm": 0.1787109375, "learning_rate": 6.87436563230281e-05, "loss": 0.0435, "step": 5880 }, { "epoch": 0.7562690213896867, "grad_norm": 0.189453125, "learning_rate": 6.873428618229003e-05, "loss": 0.0417, "step": 5881 }, { "epoch": 0.7563976167002443, "grad_norm": 0.1806640625, "learning_rate": 6.872491527609091e-05, "loss": 0.0439, "step": 5882 }, { "epoch": 0.7565262120108021, "grad_norm": 0.1904296875, "learning_rate": 6.871554360481366e-05, "loss": 0.0427, "step": 5883 }, { "epoch": 0.7566548073213597, "grad_norm": 0.1796875, "learning_rate": 6.870617116884116e-05, "loss": 0.0452, "step": 5884 }, { "epoch": 0.7567834026319173, "grad_norm": 0.1787109375, "learning_rate": 6.869679796855639e-05, "loss": 0.0405, "step": 5885 }, { "epoch": 0.756911997942475, "grad_norm": 0.16015625, "learning_rate": 6.868742400434231e-05, "loss": 0.0347, "step": 5886 }, { "epoch": 0.7570405932530327, "grad_norm": 0.1689453125, "learning_rate": 6.867804927658194e-05, "loss": 0.0403, "step": 5887 }, { "epoch": 0.7571691885635904, "grad_norm": 0.185546875, "learning_rate": 6.866867378565832e-05, "loss": 0.0428, "step": 5888 }, { "epoch": 0.757297783874148, "grad_norm": 0.19140625, "learning_rate": 6.865929753195451e-05, "loss": 0.0447, "step": 5889 }, { "epoch": 0.7574263791847057, "grad_norm": 0.1865234375, "learning_rate": 6.864992051585363e-05, "loss": 0.0431, "step": 5890 }, { "epoch": 0.7575549744952634, "grad_norm": 0.296875, "learning_rate": 6.864054273773879e-05, "loss": 0.0368, "step": 5891 }, { "epoch": 0.7576835698058211, "grad_norm": 0.2109375, "learning_rate": 6.863116419799317e-05, "loss": 0.0609, "step": 5892 }, { "epoch": 0.7578121651163787, "grad_norm": 0.1845703125, "learning_rate": 6.862178489699998e-05, "loss": 0.0459, "step": 5893 }, { "epoch": 0.7579407604269365, "grad_norm": 0.177734375, "learning_rate": 6.86124048351424e-05, "loss": 0.0434, "step": 5894 }, { "epoch": 0.7580693557374941, "grad_norm": 0.1591796875, "learning_rate": 6.860302401280376e-05, "loss": 0.0403, "step": 5895 }, { "epoch": 0.7581979510480518, "grad_norm": 0.15234375, "learning_rate": 6.859364243036727e-05, "loss": 0.0346, "step": 5896 }, { "epoch": 0.7583265463586094, "grad_norm": 0.1728515625, "learning_rate": 6.858426008821632e-05, "loss": 0.0368, "step": 5897 }, { "epoch": 0.7584551416691672, "grad_norm": 0.18359375, "learning_rate": 6.857487698673422e-05, "loss": 0.0457, "step": 5898 }, { "epoch": 0.7585837369797248, "grad_norm": 0.173828125, "learning_rate": 6.856549312630438e-05, "loss": 0.0372, "step": 5899 }, { "epoch": 0.7587123322902825, "grad_norm": 0.1708984375, "learning_rate": 6.855610850731018e-05, "loss": 0.0433, "step": 5900 }, { "epoch": 0.7588409276008402, "grad_norm": 0.1689453125, "learning_rate": 6.85467231301351e-05, "loss": 0.0441, "step": 5901 }, { "epoch": 0.7589695229113979, "grad_norm": 0.166015625, "learning_rate": 6.853733699516258e-05, "loss": 0.0384, "step": 5902 }, { "epoch": 0.7590981182219555, "grad_norm": 0.169921875, "learning_rate": 6.852795010277615e-05, "loss": 0.0424, "step": 5903 }, { "epoch": 0.7592267135325131, "grad_norm": 0.1728515625, "learning_rate": 6.851856245335934e-05, "loss": 0.0404, "step": 5904 }, { "epoch": 0.7593553088430709, "grad_norm": 0.17578125, "learning_rate": 6.850917404729573e-05, "loss": 0.0453, "step": 5905 }, { "epoch": 0.7594839041536285, "grad_norm": 0.169921875, "learning_rate": 6.84997848849689e-05, "loss": 0.0443, "step": 5906 }, { "epoch": 0.7596124994641862, "grad_norm": 0.1826171875, "learning_rate": 6.849039496676249e-05, "loss": 0.0427, "step": 5907 }, { "epoch": 0.7597410947747438, "grad_norm": 0.1728515625, "learning_rate": 6.848100429306015e-05, "loss": 0.0512, "step": 5908 }, { "epoch": 0.7598696900853016, "grad_norm": 0.197265625, "learning_rate": 6.847161286424562e-05, "loss": 0.0472, "step": 5909 }, { "epoch": 0.7599982853958592, "grad_norm": 0.1806640625, "learning_rate": 6.846222068070255e-05, "loss": 0.047, "step": 5910 }, { "epoch": 0.7601268807064169, "grad_norm": 0.1650390625, "learning_rate": 6.845282774281475e-05, "loss": 0.0427, "step": 5911 }, { "epoch": 0.7602554760169746, "grad_norm": 0.1953125, "learning_rate": 6.844343405096597e-05, "loss": 0.0514, "step": 5912 }, { "epoch": 0.7603840713275323, "grad_norm": 0.162109375, "learning_rate": 6.843403960554004e-05, "loss": 0.0345, "step": 5913 }, { "epoch": 0.7605126666380899, "grad_norm": 0.1787109375, "learning_rate": 6.842464440692082e-05, "loss": 0.042, "step": 5914 }, { "epoch": 0.7606412619486476, "grad_norm": 0.1640625, "learning_rate": 6.841524845549217e-05, "loss": 0.0402, "step": 5915 }, { "epoch": 0.7607698572592053, "grad_norm": 0.177734375, "learning_rate": 6.840585175163799e-05, "loss": 0.0424, "step": 5916 }, { "epoch": 0.760898452569763, "grad_norm": 0.140625, "learning_rate": 6.839645429574222e-05, "loss": 0.0343, "step": 5917 }, { "epoch": 0.7610270478803206, "grad_norm": 0.17578125, "learning_rate": 6.838705608818886e-05, "loss": 0.0473, "step": 5918 }, { "epoch": 0.7611556431908783, "grad_norm": 0.1787109375, "learning_rate": 6.837765712936186e-05, "loss": 0.0458, "step": 5919 }, { "epoch": 0.761284238501436, "grad_norm": 0.1640625, "learning_rate": 6.836825741964529e-05, "loss": 0.041, "step": 5920 }, { "epoch": 0.7614128338119937, "grad_norm": 0.181640625, "learning_rate": 6.83588569594232e-05, "loss": 0.0394, "step": 5921 }, { "epoch": 0.7615414291225513, "grad_norm": 0.154296875, "learning_rate": 6.834945574907968e-05, "loss": 0.0397, "step": 5922 }, { "epoch": 0.761670024433109, "grad_norm": 0.1865234375, "learning_rate": 6.834005378899886e-05, "loss": 0.0376, "step": 5923 }, { "epoch": 0.7617986197436667, "grad_norm": 0.1884765625, "learning_rate": 6.833065107956487e-05, "loss": 0.0511, "step": 5924 }, { "epoch": 0.7619272150542243, "grad_norm": 0.1611328125, "learning_rate": 6.832124762116192e-05, "loss": 0.0379, "step": 5925 }, { "epoch": 0.762055810364782, "grad_norm": 0.19140625, "learning_rate": 6.831184341417421e-05, "loss": 0.0438, "step": 5926 }, { "epoch": 0.7621844056753397, "grad_norm": 0.166015625, "learning_rate": 6.830243845898598e-05, "loss": 0.0402, "step": 5927 }, { "epoch": 0.7623130009858974, "grad_norm": 0.162109375, "learning_rate": 6.829303275598154e-05, "loss": 0.0378, "step": 5928 }, { "epoch": 0.762441596296455, "grad_norm": 0.171875, "learning_rate": 6.828362630554514e-05, "loss": 0.0422, "step": 5929 }, { "epoch": 0.7625701916070128, "grad_norm": 0.1787109375, "learning_rate": 6.827421910806117e-05, "loss": 0.0534, "step": 5930 }, { "epoch": 0.7626987869175704, "grad_norm": 0.1484375, "learning_rate": 6.826481116391396e-05, "loss": 0.0354, "step": 5931 }, { "epoch": 0.7628273822281281, "grad_norm": 0.1728515625, "learning_rate": 6.825540247348792e-05, "loss": 0.0399, "step": 5932 }, { "epoch": 0.7629559775386857, "grad_norm": 0.1708984375, "learning_rate": 6.82459930371675e-05, "loss": 0.04, "step": 5933 }, { "epoch": 0.7630845728492435, "grad_norm": 0.171875, "learning_rate": 6.823658285533712e-05, "loss": 0.0399, "step": 5934 }, { "epoch": 0.7632131681598011, "grad_norm": 0.1650390625, "learning_rate": 6.822717192838129e-05, "loss": 0.0448, "step": 5935 }, { "epoch": 0.7633417634703588, "grad_norm": 0.177734375, "learning_rate": 6.821776025668451e-05, "loss": 0.0371, "step": 5936 }, { "epoch": 0.7634703587809164, "grad_norm": 0.1943359375, "learning_rate": 6.820834784063139e-05, "loss": 0.053, "step": 5937 }, { "epoch": 0.7635989540914742, "grad_norm": 0.1728515625, "learning_rate": 6.819893468060643e-05, "loss": 0.0428, "step": 5938 }, { "epoch": 0.7637275494020318, "grad_norm": 0.1708984375, "learning_rate": 6.818952077699429e-05, "loss": 0.0336, "step": 5939 }, { "epoch": 0.7638561447125894, "grad_norm": 0.1689453125, "learning_rate": 6.81801061301796e-05, "loss": 0.042, "step": 5940 }, { "epoch": 0.7639847400231472, "grad_norm": 0.16796875, "learning_rate": 6.817069074054701e-05, "loss": 0.0403, "step": 5941 }, { "epoch": 0.7641133353337048, "grad_norm": 0.1767578125, "learning_rate": 6.816127460848127e-05, "loss": 0.0411, "step": 5942 }, { "epoch": 0.7642419306442625, "grad_norm": 0.18359375, "learning_rate": 6.815185773436707e-05, "loss": 0.0467, "step": 5943 }, { "epoch": 0.7643705259548201, "grad_norm": 0.193359375, "learning_rate": 6.81424401185892e-05, "loss": 0.041, "step": 5944 }, { "epoch": 0.7644991212653779, "grad_norm": 0.1728515625, "learning_rate": 6.813302176153243e-05, "loss": 0.0437, "step": 5945 }, { "epoch": 0.7646277165759355, "grad_norm": 0.1884765625, "learning_rate": 6.812360266358158e-05, "loss": 0.044, "step": 5946 }, { "epoch": 0.7647563118864932, "grad_norm": 0.1630859375, "learning_rate": 6.811418282512152e-05, "loss": 0.0382, "step": 5947 }, { "epoch": 0.7648849071970509, "grad_norm": 0.1904296875, "learning_rate": 6.810476224653714e-05, "loss": 0.0483, "step": 5948 }, { "epoch": 0.7650135025076086, "grad_norm": 0.2001953125, "learning_rate": 6.809534092821333e-05, "loss": 0.0436, "step": 5949 }, { "epoch": 0.7651420978181662, "grad_norm": 0.1689453125, "learning_rate": 6.808591887053508e-05, "loss": 0.0399, "step": 5950 }, { "epoch": 0.7652706931287239, "grad_norm": 0.2216796875, "learning_rate": 6.807649607388728e-05, "loss": 0.0318, "step": 5951 }, { "epoch": 0.7653992884392816, "grad_norm": 0.1826171875, "learning_rate": 6.806707253865503e-05, "loss": 0.0468, "step": 5952 }, { "epoch": 0.7655278837498393, "grad_norm": 0.1748046875, "learning_rate": 6.80576482652233e-05, "loss": 0.0433, "step": 5953 }, { "epoch": 0.7656564790603969, "grad_norm": 0.1689453125, "learning_rate": 6.804822325397718e-05, "loss": 0.0408, "step": 5954 }, { "epoch": 0.7657850743709546, "grad_norm": 0.1669921875, "learning_rate": 6.803879750530175e-05, "loss": 0.042, "step": 5955 }, { "epoch": 0.7659136696815123, "grad_norm": 0.166015625, "learning_rate": 6.802937101958216e-05, "loss": 0.0361, "step": 5956 }, { "epoch": 0.76604226499207, "grad_norm": 0.1669921875, "learning_rate": 6.801994379720354e-05, "loss": 0.0431, "step": 5957 }, { "epoch": 0.7661708603026276, "grad_norm": 0.1708984375, "learning_rate": 6.80105158385511e-05, "loss": 0.0365, "step": 5958 }, { "epoch": 0.7662994556131854, "grad_norm": 0.2001953125, "learning_rate": 6.800108714401003e-05, "loss": 0.0408, "step": 5959 }, { "epoch": 0.766428050923743, "grad_norm": 0.158203125, "learning_rate": 6.799165771396559e-05, "loss": 0.0329, "step": 5960 }, { "epoch": 0.7665566462343006, "grad_norm": 0.1630859375, "learning_rate": 6.798222754880304e-05, "loss": 0.0338, "step": 5961 }, { "epoch": 0.7666852415448583, "grad_norm": 0.1826171875, "learning_rate": 6.797279664890772e-05, "loss": 0.0412, "step": 5962 }, { "epoch": 0.766813836855416, "grad_norm": 0.1552734375, "learning_rate": 6.796336501466491e-05, "loss": 0.0387, "step": 5963 }, { "epoch": 0.7669424321659737, "grad_norm": 0.1767578125, "learning_rate": 6.795393264646004e-05, "loss": 0.0397, "step": 5964 }, { "epoch": 0.7670710274765313, "grad_norm": 0.185546875, "learning_rate": 6.794449954467846e-05, "loss": 0.043, "step": 5965 }, { "epoch": 0.7671996227870891, "grad_norm": 0.1796875, "learning_rate": 6.793506570970563e-05, "loss": 0.0455, "step": 5966 }, { "epoch": 0.7673282180976467, "grad_norm": 0.1767578125, "learning_rate": 6.792563114192695e-05, "loss": 0.0471, "step": 5967 }, { "epoch": 0.7674568134082044, "grad_norm": 0.1689453125, "learning_rate": 6.791619584172796e-05, "loss": 0.0371, "step": 5968 }, { "epoch": 0.767585408718762, "grad_norm": 0.17578125, "learning_rate": 6.790675980949415e-05, "loss": 0.0435, "step": 5969 }, { "epoch": 0.7677140040293198, "grad_norm": 0.234375, "learning_rate": 6.789732304561107e-05, "loss": 0.0416, "step": 5970 }, { "epoch": 0.7678425993398774, "grad_norm": 0.1689453125, "learning_rate": 6.788788555046432e-05, "loss": 0.0402, "step": 5971 }, { "epoch": 0.7679711946504351, "grad_norm": 0.1982421875, "learning_rate": 6.787844732443947e-05, "loss": 0.0429, "step": 5972 }, { "epoch": 0.7680997899609927, "grad_norm": 0.1767578125, "learning_rate": 6.786900836792214e-05, "loss": 0.0394, "step": 5973 }, { "epoch": 0.7682283852715505, "grad_norm": 0.185546875, "learning_rate": 6.785956868129806e-05, "loss": 0.0529, "step": 5974 }, { "epoch": 0.7683569805821081, "grad_norm": 0.158203125, "learning_rate": 6.785012826495285e-05, "loss": 0.0371, "step": 5975 }, { "epoch": 0.7684855758926658, "grad_norm": 0.1474609375, "learning_rate": 6.784068711927229e-05, "loss": 0.0314, "step": 5976 }, { "epoch": 0.7686141712032235, "grad_norm": 0.181640625, "learning_rate": 6.783124524464211e-05, "loss": 0.0447, "step": 5977 }, { "epoch": 0.7687427665137812, "grad_norm": 0.17578125, "learning_rate": 6.782180264144811e-05, "loss": 0.0459, "step": 5978 }, { "epoch": 0.7688713618243388, "grad_norm": 0.1962890625, "learning_rate": 6.781235931007607e-05, "loss": 0.0477, "step": 5979 }, { "epoch": 0.7689999571348964, "grad_norm": 0.2392578125, "learning_rate": 6.780291525091187e-05, "loss": 0.0438, "step": 5980 }, { "epoch": 0.7691285524454542, "grad_norm": 0.1953125, "learning_rate": 6.779347046434135e-05, "loss": 0.0451, "step": 5981 }, { "epoch": 0.7692571477560118, "grad_norm": 0.162109375, "learning_rate": 6.778402495075045e-05, "loss": 0.0343, "step": 5982 }, { "epoch": 0.7693857430665695, "grad_norm": 0.16015625, "learning_rate": 6.777457871052508e-05, "loss": 0.033, "step": 5983 }, { "epoch": 0.7695143383771271, "grad_norm": 0.16796875, "learning_rate": 6.77651317440512e-05, "loss": 0.0461, "step": 5984 }, { "epoch": 0.7696429336876849, "grad_norm": 0.1875, "learning_rate": 6.775568405171481e-05, "loss": 0.0429, "step": 5985 }, { "epoch": 0.7697715289982425, "grad_norm": 0.158203125, "learning_rate": 6.774623563390194e-05, "loss": 0.0387, "step": 5986 }, { "epoch": 0.7699001243088002, "grad_norm": 0.1953125, "learning_rate": 6.773678649099862e-05, "loss": 0.0375, "step": 5987 }, { "epoch": 0.7700287196193579, "grad_norm": 0.1767578125, "learning_rate": 6.772733662339094e-05, "loss": 0.041, "step": 5988 }, { "epoch": 0.7701573149299156, "grad_norm": 0.1708984375, "learning_rate": 6.771788603146502e-05, "loss": 0.0438, "step": 5989 }, { "epoch": 0.7702859102404732, "grad_norm": 0.1650390625, "learning_rate": 6.7708434715607e-05, "loss": 0.0402, "step": 5990 }, { "epoch": 0.7704145055510309, "grad_norm": 0.1650390625, "learning_rate": 6.769898267620302e-05, "loss": 0.0339, "step": 5991 }, { "epoch": 0.7705431008615886, "grad_norm": 0.169921875, "learning_rate": 6.768952991363933e-05, "loss": 0.0409, "step": 5992 }, { "epoch": 0.7706716961721463, "grad_norm": 0.1552734375, "learning_rate": 6.768007642830211e-05, "loss": 0.0352, "step": 5993 }, { "epoch": 0.7708002914827039, "grad_norm": 0.1962890625, "learning_rate": 6.767062222057764e-05, "loss": 0.0453, "step": 5994 }, { "epoch": 0.7709288867932617, "grad_norm": 0.1611328125, "learning_rate": 6.766116729085222e-05, "loss": 0.0364, "step": 5995 }, { "epoch": 0.7710574821038193, "grad_norm": 0.1572265625, "learning_rate": 6.765171163951215e-05, "loss": 0.0365, "step": 5996 }, { "epoch": 0.771186077414377, "grad_norm": 0.1748046875, "learning_rate": 6.764225526694379e-05, "loss": 0.0478, "step": 5997 }, { "epoch": 0.7713146727249346, "grad_norm": 0.18359375, "learning_rate": 6.763279817353348e-05, "loss": 0.0435, "step": 5998 }, { "epoch": 0.7714432680354923, "grad_norm": 0.193359375, "learning_rate": 6.762334035966767e-05, "loss": 0.0413, "step": 5999 }, { "epoch": 0.77157186334605, "grad_norm": 0.1552734375, "learning_rate": 6.761388182573279e-05, "loss": 0.0355, "step": 6000 }, { "epoch": 0.77157186334605, "eval_loss": 0.04111693054437637, "eval_runtime": 1042.5318, "eval_samples_per_second": 94.219, "eval_steps_per_second": 1.178, "step": 6000 }, { "epoch": 0.7717004586566076, "grad_norm": 0.1669921875, "learning_rate": 6.760442257211526e-05, "loss": 0.0392, "step": 6001 }, { "epoch": 0.7718290539671653, "grad_norm": 0.2158203125, "learning_rate": 6.759496259920163e-05, "loss": 0.0498, "step": 6002 }, { "epoch": 0.771957649277723, "grad_norm": 0.17578125, "learning_rate": 6.75855019073784e-05, "loss": 0.0485, "step": 6003 }, { "epoch": 0.7720862445882807, "grad_norm": 0.181640625, "learning_rate": 6.757604049703212e-05, "loss": 0.0403, "step": 6004 }, { "epoch": 0.7722148398988383, "grad_norm": 0.18359375, "learning_rate": 6.756657836854938e-05, "loss": 0.0423, "step": 6005 }, { "epoch": 0.7723434352093961, "grad_norm": 0.1865234375, "learning_rate": 6.755711552231679e-05, "loss": 0.0376, "step": 6006 }, { "epoch": 0.7724720305199537, "grad_norm": 0.171875, "learning_rate": 6.754765195872099e-05, "loss": 0.0388, "step": 6007 }, { "epoch": 0.7726006258305114, "grad_norm": 0.1845703125, "learning_rate": 6.753818767814863e-05, "loss": 0.0412, "step": 6008 }, { "epoch": 0.772729221141069, "grad_norm": 0.169921875, "learning_rate": 6.752872268098645e-05, "loss": 0.0459, "step": 6009 }, { "epoch": 0.7728578164516268, "grad_norm": 0.1728515625, "learning_rate": 6.751925696762113e-05, "loss": 0.042, "step": 6010 }, { "epoch": 0.7729864117621844, "grad_norm": 0.1708984375, "learning_rate": 6.750979053843946e-05, "loss": 0.039, "step": 6011 }, { "epoch": 0.773115007072742, "grad_norm": 0.18359375, "learning_rate": 6.750032339382824e-05, "loss": 0.0408, "step": 6012 }, { "epoch": 0.7732436023832998, "grad_norm": 0.1591796875, "learning_rate": 6.749085553417425e-05, "loss": 0.0327, "step": 6013 }, { "epoch": 0.7733721976938575, "grad_norm": 0.1962890625, "learning_rate": 6.748138695986437e-05, "loss": 0.0477, "step": 6014 }, { "epoch": 0.7735007930044151, "grad_norm": 0.1767578125, "learning_rate": 6.747191767128544e-05, "loss": 0.0397, "step": 6015 }, { "epoch": 0.7736293883149727, "grad_norm": 0.169921875, "learning_rate": 6.74624476688244e-05, "loss": 0.0395, "step": 6016 }, { "epoch": 0.7737579836255305, "grad_norm": 0.1767578125, "learning_rate": 6.745297695286815e-05, "loss": 0.0419, "step": 6017 }, { "epoch": 0.7738865789360881, "grad_norm": 0.203125, "learning_rate": 6.744350552380367e-05, "loss": 0.0534, "step": 6018 }, { "epoch": 0.7740151742466458, "grad_norm": 0.1806640625, "learning_rate": 6.743403338201797e-05, "loss": 0.0444, "step": 6019 }, { "epoch": 0.7741437695572034, "grad_norm": 0.1650390625, "learning_rate": 6.742456052789803e-05, "loss": 0.0349, "step": 6020 }, { "epoch": 0.7742723648677612, "grad_norm": 0.1484375, "learning_rate": 6.741508696183093e-05, "loss": 0.031, "step": 6021 }, { "epoch": 0.7744009601783188, "grad_norm": 0.1787109375, "learning_rate": 6.740561268420373e-05, "loss": 0.0419, "step": 6022 }, { "epoch": 0.7745295554888765, "grad_norm": 0.173828125, "learning_rate": 6.739613769540354e-05, "loss": 0.0452, "step": 6023 }, { "epoch": 0.7746581507994342, "grad_norm": 0.1708984375, "learning_rate": 6.738666199581752e-05, "loss": 0.0443, "step": 6024 }, { "epoch": 0.7747867461099919, "grad_norm": 0.1689453125, "learning_rate": 6.737718558583281e-05, "loss": 0.0379, "step": 6025 }, { "epoch": 0.7749153414205495, "grad_norm": 0.181640625, "learning_rate": 6.736770846583664e-05, "loss": 0.0438, "step": 6026 }, { "epoch": 0.7750439367311072, "grad_norm": 0.17578125, "learning_rate": 6.735823063621617e-05, "loss": 0.0448, "step": 6027 }, { "epoch": 0.7751725320416649, "grad_norm": 0.203125, "learning_rate": 6.734875209735871e-05, "loss": 0.0488, "step": 6028 }, { "epoch": 0.7753011273522226, "grad_norm": 0.1845703125, "learning_rate": 6.733927284965152e-05, "loss": 0.0486, "step": 6029 }, { "epoch": 0.7754297226627802, "grad_norm": 0.1806640625, "learning_rate": 6.732979289348193e-05, "loss": 0.0455, "step": 6030 }, { "epoch": 0.7755583179733379, "grad_norm": 0.1669921875, "learning_rate": 6.732031222923724e-05, "loss": 0.0435, "step": 6031 }, { "epoch": 0.7756869132838956, "grad_norm": 0.1806640625, "learning_rate": 6.731083085730488e-05, "loss": 0.0374, "step": 6032 }, { "epoch": 0.7758155085944533, "grad_norm": 0.1708984375, "learning_rate": 6.730134877807218e-05, "loss": 0.0314, "step": 6033 }, { "epoch": 0.7759441039050109, "grad_norm": 0.1962890625, "learning_rate": 6.729186599192661e-05, "loss": 0.0419, "step": 6034 }, { "epoch": 0.7760726992155687, "grad_norm": 0.1943359375, "learning_rate": 6.72823824992556e-05, "loss": 0.0463, "step": 6035 }, { "epoch": 0.7762012945261263, "grad_norm": 0.2001953125, "learning_rate": 6.727289830044666e-05, "loss": 0.0377, "step": 6036 }, { "epoch": 0.7763298898366839, "grad_norm": 0.16796875, "learning_rate": 6.726341339588729e-05, "loss": 0.0443, "step": 6037 }, { "epoch": 0.7764584851472416, "grad_norm": 0.19140625, "learning_rate": 6.725392778596503e-05, "loss": 0.0447, "step": 6038 }, { "epoch": 0.7765870804577993, "grad_norm": 0.1484375, "learning_rate": 6.724444147106745e-05, "loss": 0.0372, "step": 6039 }, { "epoch": 0.776715675768357, "grad_norm": 0.1689453125, "learning_rate": 6.723495445158217e-05, "loss": 0.044, "step": 6040 }, { "epoch": 0.7768442710789146, "grad_norm": 0.171875, "learning_rate": 6.722546672789678e-05, "loss": 0.0325, "step": 6041 }, { "epoch": 0.7769728663894724, "grad_norm": 0.1748046875, "learning_rate": 6.721597830039898e-05, "loss": 0.0408, "step": 6042 }, { "epoch": 0.77710146170003, "grad_norm": 0.19140625, "learning_rate": 6.720648916947643e-05, "loss": 0.0447, "step": 6043 }, { "epoch": 0.7772300570105877, "grad_norm": 0.1943359375, "learning_rate": 6.719699933551684e-05, "loss": 0.0464, "step": 6044 }, { "epoch": 0.7773586523211453, "grad_norm": 0.1767578125, "learning_rate": 6.718750879890799e-05, "loss": 0.0409, "step": 6045 }, { "epoch": 0.7774872476317031, "grad_norm": 0.162109375, "learning_rate": 6.71780175600376e-05, "loss": 0.0378, "step": 6046 }, { "epoch": 0.7776158429422607, "grad_norm": 0.1513671875, "learning_rate": 6.716852561929351e-05, "loss": 0.0339, "step": 6047 }, { "epoch": 0.7777444382528184, "grad_norm": 0.2041015625, "learning_rate": 6.715903297706353e-05, "loss": 0.0418, "step": 6048 }, { "epoch": 0.777873033563376, "grad_norm": 0.171875, "learning_rate": 6.714953963373553e-05, "loss": 0.0466, "step": 6049 }, { "epoch": 0.7780016288739338, "grad_norm": 0.1533203125, "learning_rate": 6.714004558969739e-05, "loss": 0.0362, "step": 6050 }, { "epoch": 0.7781302241844914, "grad_norm": 0.162109375, "learning_rate": 6.713055084533703e-05, "loss": 0.0349, "step": 6051 }, { "epoch": 0.778258819495049, "grad_norm": 0.169921875, "learning_rate": 6.712105540104239e-05, "loss": 0.037, "step": 6052 }, { "epoch": 0.7783874148056068, "grad_norm": 0.1904296875, "learning_rate": 6.711155925720145e-05, "loss": 0.0427, "step": 6053 }, { "epoch": 0.7785160101161644, "grad_norm": 0.1611328125, "learning_rate": 6.71020624142022e-05, "loss": 0.0346, "step": 6054 }, { "epoch": 0.7786446054267221, "grad_norm": 0.1923828125, "learning_rate": 6.709256487243268e-05, "loss": 0.0461, "step": 6055 }, { "epoch": 0.7787732007372797, "grad_norm": 0.1826171875, "learning_rate": 6.708306663228095e-05, "loss": 0.0425, "step": 6056 }, { "epoch": 0.7789017960478375, "grad_norm": 0.1611328125, "learning_rate": 6.707356769413508e-05, "loss": 0.0343, "step": 6057 }, { "epoch": 0.7790303913583951, "grad_norm": 0.16015625, "learning_rate": 6.70640680583832e-05, "loss": 0.0391, "step": 6058 }, { "epoch": 0.7791589866689528, "grad_norm": 0.1650390625, "learning_rate": 6.705456772541346e-05, "loss": 0.0381, "step": 6059 }, { "epoch": 0.7792875819795105, "grad_norm": 0.1787109375, "learning_rate": 6.7045066695614e-05, "loss": 0.0443, "step": 6060 }, { "epoch": 0.7794161772900682, "grad_norm": 0.189453125, "learning_rate": 6.703556496937305e-05, "loss": 0.0499, "step": 6061 }, { "epoch": 0.7795447726006258, "grad_norm": 0.1591796875, "learning_rate": 6.702606254707885e-05, "loss": 0.0339, "step": 6062 }, { "epoch": 0.7796733679111835, "grad_norm": 0.1826171875, "learning_rate": 6.701655942911963e-05, "loss": 0.0468, "step": 6063 }, { "epoch": 0.7798019632217412, "grad_norm": 0.158203125, "learning_rate": 6.70070556158837e-05, "loss": 0.0319, "step": 6064 }, { "epoch": 0.7799305585322989, "grad_norm": 0.1611328125, "learning_rate": 6.699755110775933e-05, "loss": 0.0356, "step": 6065 }, { "epoch": 0.7800591538428565, "grad_norm": 0.1748046875, "learning_rate": 6.698804590513495e-05, "loss": 0.0385, "step": 6066 }, { "epoch": 0.7801877491534142, "grad_norm": 0.2080078125, "learning_rate": 6.697854000839883e-05, "loss": 0.0518, "step": 6067 }, { "epoch": 0.7803163444639719, "grad_norm": 0.1826171875, "learning_rate": 6.696903341793944e-05, "loss": 0.046, "step": 6068 }, { "epoch": 0.7804449397745296, "grad_norm": 0.189453125, "learning_rate": 6.695952613414518e-05, "loss": 0.0493, "step": 6069 }, { "epoch": 0.7805735350850872, "grad_norm": 0.1728515625, "learning_rate": 6.695001815740451e-05, "loss": 0.0422, "step": 6070 }, { "epoch": 0.780702130395645, "grad_norm": 0.171875, "learning_rate": 6.694050948810591e-05, "loss": 0.0348, "step": 6071 }, { "epoch": 0.7808307257062026, "grad_norm": 0.154296875, "learning_rate": 6.69310001266379e-05, "loss": 0.0352, "step": 6072 }, { "epoch": 0.7809593210167602, "grad_norm": 0.189453125, "learning_rate": 6.692149007338903e-05, "loss": 0.0435, "step": 6073 }, { "epoch": 0.7810879163273179, "grad_norm": 0.16796875, "learning_rate": 6.691197932874787e-05, "loss": 0.037, "step": 6074 }, { "epoch": 0.7812165116378756, "grad_norm": 0.166015625, "learning_rate": 6.690246789310298e-05, "loss": 0.0389, "step": 6075 }, { "epoch": 0.7813451069484333, "grad_norm": 0.173828125, "learning_rate": 6.689295576684305e-05, "loss": 0.0392, "step": 6076 }, { "epoch": 0.7814737022589909, "grad_norm": 0.1669921875, "learning_rate": 6.688344295035667e-05, "loss": 0.0403, "step": 6077 }, { "epoch": 0.7816022975695486, "grad_norm": 0.158203125, "learning_rate": 6.687392944403256e-05, "loss": 0.0372, "step": 6078 }, { "epoch": 0.7817308928801063, "grad_norm": 0.162109375, "learning_rate": 6.686441524825943e-05, "loss": 0.0443, "step": 6079 }, { "epoch": 0.781859488190664, "grad_norm": 0.1572265625, "learning_rate": 6.6854900363426e-05, "loss": 0.0368, "step": 6080 }, { "epoch": 0.7819880835012216, "grad_norm": 0.2119140625, "learning_rate": 6.684538478992106e-05, "loss": 0.0377, "step": 6081 }, { "epoch": 0.7821166788117794, "grad_norm": 0.2060546875, "learning_rate": 6.683586852813337e-05, "loss": 0.0626, "step": 6082 }, { "epoch": 0.782245274122337, "grad_norm": 0.1708984375, "learning_rate": 6.68263515784518e-05, "loss": 0.0467, "step": 6083 }, { "epoch": 0.7823738694328947, "grad_norm": 0.189453125, "learning_rate": 6.681683394126517e-05, "loss": 0.0495, "step": 6084 }, { "epoch": 0.7825024647434523, "grad_norm": 0.146484375, "learning_rate": 6.680731561696237e-05, "loss": 0.0288, "step": 6085 }, { "epoch": 0.7826310600540101, "grad_norm": 0.1689453125, "learning_rate": 6.67977966059323e-05, "loss": 0.0481, "step": 6086 }, { "epoch": 0.7827596553645677, "grad_norm": 0.1962890625, "learning_rate": 6.678827690856391e-05, "loss": 0.0538, "step": 6087 }, { "epoch": 0.7828882506751254, "grad_norm": 0.1767578125, "learning_rate": 6.677875652524614e-05, "loss": 0.0381, "step": 6088 }, { "epoch": 0.7830168459856831, "grad_norm": 0.173828125, "learning_rate": 6.676923545636801e-05, "loss": 0.0507, "step": 6089 }, { "epoch": 0.7831454412962408, "grad_norm": 0.2060546875, "learning_rate": 6.675971370231853e-05, "loss": 0.0524, "step": 6090 }, { "epoch": 0.7832740366067984, "grad_norm": 0.1708984375, "learning_rate": 6.675019126348673e-05, "loss": 0.0361, "step": 6091 }, { "epoch": 0.783402631917356, "grad_norm": 0.1796875, "learning_rate": 6.67406681402617e-05, "loss": 0.0403, "step": 6092 }, { "epoch": 0.7835312272279138, "grad_norm": 0.162109375, "learning_rate": 6.673114433303257e-05, "loss": 0.0392, "step": 6093 }, { "epoch": 0.7836598225384714, "grad_norm": 0.1767578125, "learning_rate": 6.672161984218841e-05, "loss": 0.0388, "step": 6094 }, { "epoch": 0.7837884178490291, "grad_norm": 0.171875, "learning_rate": 6.671209466811844e-05, "loss": 0.0431, "step": 6095 }, { "epoch": 0.7839170131595867, "grad_norm": 0.193359375, "learning_rate": 6.670256881121181e-05, "loss": 0.0527, "step": 6096 }, { "epoch": 0.7840456084701445, "grad_norm": 0.1494140625, "learning_rate": 6.669304227185775e-05, "loss": 0.0319, "step": 6097 }, { "epoch": 0.7841742037807021, "grad_norm": 0.1787109375, "learning_rate": 6.668351505044549e-05, "loss": 0.0482, "step": 6098 }, { "epoch": 0.7843027990912598, "grad_norm": 0.19140625, "learning_rate": 6.667398714736432e-05, "loss": 0.0468, "step": 6099 }, { "epoch": 0.7844313944018175, "grad_norm": 0.1611328125, "learning_rate": 6.666445856300353e-05, "loss": 0.035, "step": 6100 }, { "epoch": 0.7845599897123752, "grad_norm": 0.173828125, "learning_rate": 6.665492929775244e-05, "loss": 0.0423, "step": 6101 }, { "epoch": 0.7846885850229328, "grad_norm": 0.166015625, "learning_rate": 6.664539935200043e-05, "loss": 0.0398, "step": 6102 }, { "epoch": 0.7848171803334905, "grad_norm": 0.1630859375, "learning_rate": 6.663586872613686e-05, "loss": 0.0352, "step": 6103 }, { "epoch": 0.7849457756440482, "grad_norm": 0.154296875, "learning_rate": 6.662633742055113e-05, "loss": 0.0353, "step": 6104 }, { "epoch": 0.7850743709546059, "grad_norm": 0.189453125, "learning_rate": 6.661680543563271e-05, "loss": 0.0447, "step": 6105 }, { "epoch": 0.7852029662651635, "grad_norm": 0.1591796875, "learning_rate": 6.660727277177104e-05, "loss": 0.0402, "step": 6106 }, { "epoch": 0.7853315615757213, "grad_norm": 0.17578125, "learning_rate": 6.659773942935564e-05, "loss": 0.0432, "step": 6107 }, { "epoch": 0.7854601568862789, "grad_norm": 0.185546875, "learning_rate": 6.658820540877599e-05, "loss": 0.0429, "step": 6108 }, { "epoch": 0.7855887521968365, "grad_norm": 0.1689453125, "learning_rate": 6.657867071042168e-05, "loss": 0.0427, "step": 6109 }, { "epoch": 0.7857173475073942, "grad_norm": 0.1865234375, "learning_rate": 6.656913533468224e-05, "loss": 0.0474, "step": 6110 }, { "epoch": 0.785845942817952, "grad_norm": 0.162109375, "learning_rate": 6.655959928194733e-05, "loss": 0.0415, "step": 6111 }, { "epoch": 0.7859745381285096, "grad_norm": 0.1748046875, "learning_rate": 6.655006255260655e-05, "loss": 0.0402, "step": 6112 }, { "epoch": 0.7861031334390672, "grad_norm": 0.1650390625, "learning_rate": 6.654052514704955e-05, "loss": 0.0401, "step": 6113 }, { "epoch": 0.7862317287496249, "grad_norm": 0.171875, "learning_rate": 6.653098706566605e-05, "loss": 0.0388, "step": 6114 }, { "epoch": 0.7863603240601826, "grad_norm": 0.158203125, "learning_rate": 6.652144830884574e-05, "loss": 0.0349, "step": 6115 }, { "epoch": 0.7864889193707403, "grad_norm": 0.1689453125, "learning_rate": 6.651190887697837e-05, "loss": 0.0405, "step": 6116 }, { "epoch": 0.7866175146812979, "grad_norm": 0.19140625, "learning_rate": 6.650236877045372e-05, "loss": 0.0437, "step": 6117 }, { "epoch": 0.7867461099918557, "grad_norm": 0.173828125, "learning_rate": 6.649282798966156e-05, "loss": 0.045, "step": 6118 }, { "epoch": 0.7868747053024133, "grad_norm": 0.1669921875, "learning_rate": 6.648328653499174e-05, "loss": 0.0346, "step": 6119 }, { "epoch": 0.787003300612971, "grad_norm": 0.1533203125, "learning_rate": 6.647374440683409e-05, "loss": 0.0345, "step": 6120 }, { "epoch": 0.7871318959235286, "grad_norm": 0.1533203125, "learning_rate": 6.646420160557852e-05, "loss": 0.0358, "step": 6121 }, { "epoch": 0.7872604912340864, "grad_norm": 0.162109375, "learning_rate": 6.645465813161492e-05, "loss": 0.0417, "step": 6122 }, { "epoch": 0.787389086544644, "grad_norm": 0.1865234375, "learning_rate": 6.644511398533324e-05, "loss": 0.0465, "step": 6123 }, { "epoch": 0.7875176818552017, "grad_norm": 0.1923828125, "learning_rate": 6.643556916712341e-05, "loss": 0.0456, "step": 6124 }, { "epoch": 0.7876462771657593, "grad_norm": 0.1689453125, "learning_rate": 6.642602367737545e-05, "loss": 0.0401, "step": 6125 }, { "epoch": 0.7877748724763171, "grad_norm": 0.1669921875, "learning_rate": 6.641647751647938e-05, "loss": 0.036, "step": 6126 }, { "epoch": 0.7879034677868747, "grad_norm": 0.166015625, "learning_rate": 6.640693068482521e-05, "loss": 0.0367, "step": 6127 }, { "epoch": 0.7880320630974323, "grad_norm": 0.19140625, "learning_rate": 6.639738318280304e-05, "loss": 0.046, "step": 6128 }, { "epoch": 0.7881606584079901, "grad_norm": 0.197265625, "learning_rate": 6.638783501080297e-05, "loss": 0.0511, "step": 6129 }, { "epoch": 0.7882892537185477, "grad_norm": 0.18359375, "learning_rate": 6.637828616921514e-05, "loss": 0.0446, "step": 6130 }, { "epoch": 0.7884178490291054, "grad_norm": 0.177734375, "learning_rate": 6.636873665842966e-05, "loss": 0.0441, "step": 6131 }, { "epoch": 0.788546444339663, "grad_norm": 0.1806640625, "learning_rate": 6.635918647883676e-05, "loss": 0.0473, "step": 6132 }, { "epoch": 0.7886750396502208, "grad_norm": 0.2021484375, "learning_rate": 6.634963563082662e-05, "loss": 0.0516, "step": 6133 }, { "epoch": 0.7888036349607784, "grad_norm": 0.173828125, "learning_rate": 6.634008411478947e-05, "loss": 0.0409, "step": 6134 }, { "epoch": 0.7889322302713361, "grad_norm": 0.177734375, "learning_rate": 6.633053193111561e-05, "loss": 0.0379, "step": 6135 }, { "epoch": 0.7890608255818938, "grad_norm": 0.2255859375, "learning_rate": 6.632097908019528e-05, "loss": 0.0417, "step": 6136 }, { "epoch": 0.7891894208924515, "grad_norm": 0.1787109375, "learning_rate": 6.631142556241885e-05, "loss": 0.039, "step": 6137 }, { "epoch": 0.7893180162030091, "grad_norm": 0.1826171875, "learning_rate": 6.630187137817666e-05, "loss": 0.0447, "step": 6138 }, { "epoch": 0.7894466115135668, "grad_norm": 0.16796875, "learning_rate": 6.629231652785904e-05, "loss": 0.0433, "step": 6139 }, { "epoch": 0.7895752068241245, "grad_norm": 0.181640625, "learning_rate": 6.628276101185644e-05, "loss": 0.0503, "step": 6140 }, { "epoch": 0.7897038021346822, "grad_norm": 0.1533203125, "learning_rate": 6.627320483055923e-05, "loss": 0.0382, "step": 6141 }, { "epoch": 0.7898323974452398, "grad_norm": 0.1640625, "learning_rate": 6.626364798435792e-05, "loss": 0.0394, "step": 6142 }, { "epoch": 0.7899609927557975, "grad_norm": 0.1875, "learning_rate": 6.625409047364296e-05, "loss": 0.0494, "step": 6143 }, { "epoch": 0.7900895880663552, "grad_norm": 0.1826171875, "learning_rate": 6.624453229880486e-05, "loss": 0.0476, "step": 6144 }, { "epoch": 0.7902181833769129, "grad_norm": 0.1884765625, "learning_rate": 6.623497346023418e-05, "loss": 0.0478, "step": 6145 }, { "epoch": 0.7903467786874705, "grad_norm": 0.15625, "learning_rate": 6.622541395832146e-05, "loss": 0.0335, "step": 6146 }, { "epoch": 0.7904753739980283, "grad_norm": 0.1630859375, "learning_rate": 6.62158537934573e-05, "loss": 0.0385, "step": 6147 }, { "epoch": 0.7906039693085859, "grad_norm": 0.171875, "learning_rate": 6.620629296603232e-05, "loss": 0.0459, "step": 6148 }, { "epoch": 0.7907325646191435, "grad_norm": 0.16015625, "learning_rate": 6.619673147643713e-05, "loss": 0.0402, "step": 6149 }, { "epoch": 0.7908611599297012, "grad_norm": 0.1611328125, "learning_rate": 6.618716932506246e-05, "loss": 0.0341, "step": 6150 }, { "epoch": 0.7909897552402589, "grad_norm": 0.162109375, "learning_rate": 6.617760651229896e-05, "loss": 0.0381, "step": 6151 }, { "epoch": 0.7911183505508166, "grad_norm": 0.1552734375, "learning_rate": 6.616804303853738e-05, "loss": 0.0347, "step": 6152 }, { "epoch": 0.7912469458613742, "grad_norm": 0.232421875, "learning_rate": 6.615847890416848e-05, "loss": 0.0448, "step": 6153 }, { "epoch": 0.791375541171932, "grad_norm": 0.177734375, "learning_rate": 6.6148914109583e-05, "loss": 0.0389, "step": 6154 }, { "epoch": 0.7915041364824896, "grad_norm": 0.1611328125, "learning_rate": 6.613934865517179e-05, "loss": 0.0365, "step": 6155 }, { "epoch": 0.7916327317930473, "grad_norm": 0.1630859375, "learning_rate": 6.612978254132565e-05, "loss": 0.0409, "step": 6156 }, { "epoch": 0.7917613271036049, "grad_norm": 0.1640625, "learning_rate": 6.612021576843545e-05, "loss": 0.0377, "step": 6157 }, { "epoch": 0.7918899224141627, "grad_norm": 0.1787109375, "learning_rate": 6.611064833689208e-05, "loss": 0.0436, "step": 6158 }, { "epoch": 0.7920185177247203, "grad_norm": 0.1728515625, "learning_rate": 6.610108024708649e-05, "loss": 0.0497, "step": 6159 }, { "epoch": 0.792147113035278, "grad_norm": 0.162109375, "learning_rate": 6.609151149940955e-05, "loss": 0.0338, "step": 6160 }, { "epoch": 0.7922757083458356, "grad_norm": 0.181640625, "learning_rate": 6.608194209425228e-05, "loss": 0.0488, "step": 6161 }, { "epoch": 0.7924043036563934, "grad_norm": 0.1904296875, "learning_rate": 6.607237203200568e-05, "loss": 0.0544, "step": 6162 }, { "epoch": 0.792532898966951, "grad_norm": 0.185546875, "learning_rate": 6.606280131306072e-05, "loss": 0.0372, "step": 6163 }, { "epoch": 0.7926614942775086, "grad_norm": 0.177734375, "learning_rate": 6.60532299378085e-05, "loss": 0.0461, "step": 6164 }, { "epoch": 0.7927900895880664, "grad_norm": 0.16015625, "learning_rate": 6.604365790664006e-05, "loss": 0.0367, "step": 6165 }, { "epoch": 0.792918684898624, "grad_norm": 0.154296875, "learning_rate": 6.603408521994651e-05, "loss": 0.0315, "step": 6166 }, { "epoch": 0.7930472802091817, "grad_norm": 0.166015625, "learning_rate": 6.6024511878119e-05, "loss": 0.0398, "step": 6167 }, { "epoch": 0.7931758755197393, "grad_norm": 0.18359375, "learning_rate": 6.601493788154866e-05, "loss": 0.0396, "step": 6168 }, { "epoch": 0.7933044708302971, "grad_norm": 0.1787109375, "learning_rate": 6.600536323062669e-05, "loss": 0.0506, "step": 6169 }, { "epoch": 0.7934330661408547, "grad_norm": 0.185546875, "learning_rate": 6.599578792574428e-05, "loss": 0.0493, "step": 6170 }, { "epoch": 0.7935616614514124, "grad_norm": 0.224609375, "learning_rate": 6.59862119672927e-05, "loss": 0.0479, "step": 6171 }, { "epoch": 0.79369025676197, "grad_norm": 0.1904296875, "learning_rate": 6.597663535566319e-05, "loss": 0.0456, "step": 6172 }, { "epoch": 0.7938188520725278, "grad_norm": 0.1640625, "learning_rate": 6.596705809124702e-05, "loss": 0.0395, "step": 6173 }, { "epoch": 0.7939474473830854, "grad_norm": 0.17578125, "learning_rate": 6.595748017443555e-05, "loss": 0.0418, "step": 6174 }, { "epoch": 0.7940760426936431, "grad_norm": 0.1640625, "learning_rate": 6.594790160562008e-05, "loss": 0.0432, "step": 6175 }, { "epoch": 0.7942046380042008, "grad_norm": 0.1806640625, "learning_rate": 6.593832238519199e-05, "loss": 0.0501, "step": 6176 }, { "epoch": 0.7943332333147585, "grad_norm": 0.1708984375, "learning_rate": 6.59287425135427e-05, "loss": 0.0425, "step": 6177 }, { "epoch": 0.7944618286253161, "grad_norm": 0.1708984375, "learning_rate": 6.59191619910636e-05, "loss": 0.0414, "step": 6178 }, { "epoch": 0.7945904239358738, "grad_norm": 0.1689453125, "learning_rate": 6.590958081814616e-05, "loss": 0.0442, "step": 6179 }, { "epoch": 0.7947190192464315, "grad_norm": 0.1787109375, "learning_rate": 6.589999899518184e-05, "loss": 0.0495, "step": 6180 }, { "epoch": 0.7948476145569892, "grad_norm": 0.1630859375, "learning_rate": 6.589041652256218e-05, "loss": 0.0417, "step": 6181 }, { "epoch": 0.7949762098675468, "grad_norm": 0.150390625, "learning_rate": 6.588083340067865e-05, "loss": 0.0376, "step": 6182 }, { "epoch": 0.7951048051781046, "grad_norm": 0.1943359375, "learning_rate": 6.587124962992286e-05, "loss": 0.0476, "step": 6183 }, { "epoch": 0.7952334004886622, "grad_norm": 0.16015625, "learning_rate": 6.586166521068635e-05, "loss": 0.0355, "step": 6184 }, { "epoch": 0.7953619957992198, "grad_norm": 0.1845703125, "learning_rate": 6.585208014336074e-05, "loss": 0.0436, "step": 6185 }, { "epoch": 0.7954905911097775, "grad_norm": 0.1728515625, "learning_rate": 6.584249442833769e-05, "loss": 0.0424, "step": 6186 }, { "epoch": 0.7956191864203352, "grad_norm": 0.16796875, "learning_rate": 6.583290806600884e-05, "loss": 0.0375, "step": 6187 }, { "epoch": 0.7957477817308929, "grad_norm": 0.1884765625, "learning_rate": 6.582332105676587e-05, "loss": 0.051, "step": 6188 }, { "epoch": 0.7958763770414505, "grad_norm": 0.1806640625, "learning_rate": 6.58137334010005e-05, "loss": 0.0528, "step": 6189 }, { "epoch": 0.7960049723520082, "grad_norm": 0.173828125, "learning_rate": 6.580414509910448e-05, "loss": 0.0456, "step": 6190 }, { "epoch": 0.7961335676625659, "grad_norm": 0.177734375, "learning_rate": 6.579455615146956e-05, "loss": 0.0436, "step": 6191 }, { "epoch": 0.7962621629731236, "grad_norm": 0.154296875, "learning_rate": 6.578496655848755e-05, "loss": 0.0349, "step": 6192 }, { "epoch": 0.7963907582836812, "grad_norm": 0.177734375, "learning_rate": 6.577537632055027e-05, "loss": 0.0456, "step": 6193 }, { "epoch": 0.796519353594239, "grad_norm": 0.1865234375, "learning_rate": 6.576578543804954e-05, "loss": 0.0424, "step": 6194 }, { "epoch": 0.7966479489047966, "grad_norm": 0.16015625, "learning_rate": 6.575619391137726e-05, "loss": 0.037, "step": 6195 }, { "epoch": 0.7967765442153543, "grad_norm": 0.17578125, "learning_rate": 6.574660174092532e-05, "loss": 0.0433, "step": 6196 }, { "epoch": 0.7969051395259119, "grad_norm": 0.1796875, "learning_rate": 6.573700892708564e-05, "loss": 0.0487, "step": 6197 }, { "epoch": 0.7970337348364697, "grad_norm": 0.1708984375, "learning_rate": 6.572741547025019e-05, "loss": 0.0375, "step": 6198 }, { "epoch": 0.7971623301470273, "grad_norm": 0.1923828125, "learning_rate": 6.571782137081092e-05, "loss": 0.0474, "step": 6199 }, { "epoch": 0.797290925457585, "grad_norm": 0.146484375, "learning_rate": 6.570822662915986e-05, "loss": 0.0348, "step": 6200 }, { "epoch": 0.7974195207681427, "grad_norm": 0.1630859375, "learning_rate": 6.5698631245689e-05, "loss": 0.0391, "step": 6201 }, { "epoch": 0.7975481160787004, "grad_norm": 0.1748046875, "learning_rate": 6.568903522079044e-05, "loss": 0.0384, "step": 6202 }, { "epoch": 0.797676711389258, "grad_norm": 0.17578125, "learning_rate": 6.567943855485623e-05, "loss": 0.0473, "step": 6203 }, { "epoch": 0.7978053066998156, "grad_norm": 0.1796875, "learning_rate": 6.56698412482785e-05, "loss": 0.0385, "step": 6204 }, { "epoch": 0.7979339020103734, "grad_norm": 0.16796875, "learning_rate": 6.56602433014494e-05, "loss": 0.0376, "step": 6205 }, { "epoch": 0.798062497320931, "grad_norm": 0.1787109375, "learning_rate": 6.565064471476104e-05, "loss": 0.0395, "step": 6206 }, { "epoch": 0.7981910926314887, "grad_norm": 0.1611328125, "learning_rate": 6.564104548860566e-05, "loss": 0.0358, "step": 6207 }, { "epoch": 0.7983196879420463, "grad_norm": 0.201171875, "learning_rate": 6.563144562337542e-05, "loss": 0.0439, "step": 6208 }, { "epoch": 0.7984482832526041, "grad_norm": 0.18359375, "learning_rate": 6.562184511946261e-05, "loss": 0.0421, "step": 6209 }, { "epoch": 0.7985768785631617, "grad_norm": 0.1689453125, "learning_rate": 6.561224397725948e-05, "loss": 0.0401, "step": 6210 }, { "epoch": 0.7987054738737194, "grad_norm": 0.15234375, "learning_rate": 6.56026421971583e-05, "loss": 0.0381, "step": 6211 }, { "epoch": 0.7988340691842771, "grad_norm": 0.1611328125, "learning_rate": 6.559303977955142e-05, "loss": 0.0385, "step": 6212 }, { "epoch": 0.7989626644948348, "grad_norm": 0.197265625, "learning_rate": 6.558343672483114e-05, "loss": 0.0501, "step": 6213 }, { "epoch": 0.7990912598053924, "grad_norm": 0.1640625, "learning_rate": 6.557383303338987e-05, "loss": 0.0369, "step": 6214 }, { "epoch": 0.7992198551159501, "grad_norm": 0.1650390625, "learning_rate": 6.556422870561999e-05, "loss": 0.0408, "step": 6215 }, { "epoch": 0.7993484504265078, "grad_norm": 0.193359375, "learning_rate": 6.555462374191393e-05, "loss": 0.0489, "step": 6216 }, { "epoch": 0.7994770457370655, "grad_norm": 0.1552734375, "learning_rate": 6.554501814266411e-05, "loss": 0.037, "step": 6217 }, { "epoch": 0.7996056410476231, "grad_norm": 0.1943359375, "learning_rate": 6.553541190826305e-05, "loss": 0.0537, "step": 6218 }, { "epoch": 0.7997342363581808, "grad_norm": 0.1484375, "learning_rate": 6.552580503910322e-05, "loss": 0.0301, "step": 6219 }, { "epoch": 0.7998628316687385, "grad_norm": 0.15625, "learning_rate": 6.551619753557713e-05, "loss": 0.0335, "step": 6220 }, { "epoch": 0.7999914269792961, "grad_norm": 0.1630859375, "learning_rate": 6.550658939807736e-05, "loss": 0.0337, "step": 6221 }, { "epoch": 0.8001200222898538, "grad_norm": 0.23828125, "learning_rate": 6.54969806269965e-05, "loss": 0.0469, "step": 6222 }, { "epoch": 0.8002486176004115, "grad_norm": 0.17578125, "learning_rate": 6.54873712227271e-05, "loss": 0.0425, "step": 6223 }, { "epoch": 0.8003772129109692, "grad_norm": 0.18359375, "learning_rate": 6.547776118566183e-05, "loss": 0.0423, "step": 6224 }, { "epoch": 0.8005058082215268, "grad_norm": 0.150390625, "learning_rate": 6.546815051619334e-05, "loss": 0.0332, "step": 6225 }, { "epoch": 0.8006344035320845, "grad_norm": 0.1787109375, "learning_rate": 6.545853921471431e-05, "loss": 0.0455, "step": 6226 }, { "epoch": 0.8007629988426422, "grad_norm": 0.1630859375, "learning_rate": 6.544892728161742e-05, "loss": 0.0388, "step": 6227 }, { "epoch": 0.8008915941531999, "grad_norm": 0.18359375, "learning_rate": 6.543931471729546e-05, "loss": 0.0448, "step": 6228 }, { "epoch": 0.8010201894637575, "grad_norm": 0.228515625, "learning_rate": 6.542970152214112e-05, "loss": 0.0463, "step": 6229 }, { "epoch": 0.8011487847743153, "grad_norm": 0.1611328125, "learning_rate": 6.542008769654722e-05, "loss": 0.0365, "step": 6230 }, { "epoch": 0.8012773800848729, "grad_norm": 0.1728515625, "learning_rate": 6.541047324090657e-05, "loss": 0.0402, "step": 6231 }, { "epoch": 0.8014059753954306, "grad_norm": 0.171875, "learning_rate": 6.5400858155612e-05, "loss": 0.0385, "step": 6232 }, { "epoch": 0.8015345707059882, "grad_norm": 0.1591796875, "learning_rate": 6.539124244105639e-05, "loss": 0.0333, "step": 6233 }, { "epoch": 0.801663166016546, "grad_norm": 0.1708984375, "learning_rate": 6.53816260976326e-05, "loss": 0.0398, "step": 6234 }, { "epoch": 0.8017917613271036, "grad_norm": 0.1669921875, "learning_rate": 6.537200912573355e-05, "loss": 0.04, "step": 6235 }, { "epoch": 0.8019203566376613, "grad_norm": 0.201171875, "learning_rate": 6.536239152575218e-05, "loss": 0.048, "step": 6236 }, { "epoch": 0.8020489519482189, "grad_norm": 0.1650390625, "learning_rate": 6.535277329808145e-05, "loss": 0.0341, "step": 6237 }, { "epoch": 0.8021775472587767, "grad_norm": 0.1650390625, "learning_rate": 6.534315444311436e-05, "loss": 0.037, "step": 6238 }, { "epoch": 0.8023061425693343, "grad_norm": 0.1884765625, "learning_rate": 6.533353496124392e-05, "loss": 0.0405, "step": 6239 }, { "epoch": 0.802434737879892, "grad_norm": 0.1572265625, "learning_rate": 6.532391485286317e-05, "loss": 0.0383, "step": 6240 }, { "epoch": 0.8025633331904497, "grad_norm": 0.1826171875, "learning_rate": 6.531429411836517e-05, "loss": 0.0436, "step": 6241 }, { "epoch": 0.8026919285010073, "grad_norm": 0.1630859375, "learning_rate": 6.530467275814301e-05, "loss": 0.033, "step": 6242 }, { "epoch": 0.802820523811565, "grad_norm": 0.166015625, "learning_rate": 6.529505077258984e-05, "loss": 0.0406, "step": 6243 }, { "epoch": 0.8029491191221226, "grad_norm": 0.1689453125, "learning_rate": 6.528542816209875e-05, "loss": 0.044, "step": 6244 }, { "epoch": 0.8030777144326804, "grad_norm": 0.173828125, "learning_rate": 6.527580492706298e-05, "loss": 0.0462, "step": 6245 }, { "epoch": 0.803206309743238, "grad_norm": 0.1767578125, "learning_rate": 6.526618106787564e-05, "loss": 0.0375, "step": 6246 }, { "epoch": 0.8033349050537957, "grad_norm": 0.1748046875, "learning_rate": 6.525655658493e-05, "loss": 0.041, "step": 6247 }, { "epoch": 0.8034635003643534, "grad_norm": 0.1689453125, "learning_rate": 6.524693147861931e-05, "loss": 0.0436, "step": 6248 }, { "epoch": 0.8035920956749111, "grad_norm": 0.146484375, "learning_rate": 6.523730574933682e-05, "loss": 0.033, "step": 6249 }, { "epoch": 0.8037206909854687, "grad_norm": 0.1875, "learning_rate": 6.522767939747582e-05, "loss": 0.0355, "step": 6250 }, { "epoch": 0.8038492862960264, "grad_norm": 0.1796875, "learning_rate": 6.521805242342966e-05, "loss": 0.042, "step": 6251 }, { "epoch": 0.8039778816065841, "grad_norm": 0.171875, "learning_rate": 6.520842482759167e-05, "loss": 0.0394, "step": 6252 }, { "epoch": 0.8041064769171418, "grad_norm": 0.177734375, "learning_rate": 6.519879661035522e-05, "loss": 0.0414, "step": 6253 }, { "epoch": 0.8042350722276994, "grad_norm": 0.1689453125, "learning_rate": 6.51891677721137e-05, "loss": 0.0369, "step": 6254 }, { "epoch": 0.804363667538257, "grad_norm": 0.2080078125, "learning_rate": 6.517953831326057e-05, "loss": 0.0559, "step": 6255 }, { "epoch": 0.8044922628488148, "grad_norm": 0.1962890625, "learning_rate": 6.516990823418923e-05, "loss": 0.048, "step": 6256 }, { "epoch": 0.8046208581593725, "grad_norm": 0.1640625, "learning_rate": 6.51602775352932e-05, "loss": 0.0366, "step": 6257 }, { "epoch": 0.8047494534699301, "grad_norm": 0.1513671875, "learning_rate": 6.515064621696594e-05, "loss": 0.0339, "step": 6258 }, { "epoch": 0.8048780487804879, "grad_norm": 0.150390625, "learning_rate": 6.514101427960099e-05, "loss": 0.0322, "step": 6259 }, { "epoch": 0.8050066440910455, "grad_norm": 0.26171875, "learning_rate": 6.51313817235919e-05, "loss": 0.0356, "step": 6260 }, { "epoch": 0.8051352394016031, "grad_norm": 0.1708984375, "learning_rate": 6.512174854933224e-05, "loss": 0.0371, "step": 6261 }, { "epoch": 0.8052638347121608, "grad_norm": 0.1572265625, "learning_rate": 6.511211475721562e-05, "loss": 0.0351, "step": 6262 }, { "epoch": 0.8053924300227185, "grad_norm": 0.1875, "learning_rate": 6.510248034763566e-05, "loss": 0.0432, "step": 6263 }, { "epoch": 0.8055210253332762, "grad_norm": 0.1806640625, "learning_rate": 6.509284532098601e-05, "loss": 0.0401, "step": 6264 }, { "epoch": 0.8056496206438338, "grad_norm": 0.173828125, "learning_rate": 6.508320967766035e-05, "loss": 0.0432, "step": 6265 }, { "epoch": 0.8057782159543915, "grad_norm": 0.1923828125, "learning_rate": 6.507357341805239e-05, "loss": 0.0524, "step": 6266 }, { "epoch": 0.8059068112649492, "grad_norm": 0.181640625, "learning_rate": 6.506393654255583e-05, "loss": 0.0437, "step": 6267 }, { "epoch": 0.8060354065755069, "grad_norm": 0.1630859375, "learning_rate": 6.505429905156443e-05, "loss": 0.0371, "step": 6268 }, { "epoch": 0.8061640018860645, "grad_norm": 0.19140625, "learning_rate": 6.504466094547199e-05, "loss": 0.0439, "step": 6269 }, { "epoch": 0.8062925971966223, "grad_norm": 0.1865234375, "learning_rate": 6.503502222467229e-05, "loss": 0.049, "step": 6270 }, { "epoch": 0.8064211925071799, "grad_norm": 0.158203125, "learning_rate": 6.502538288955914e-05, "loss": 0.0349, "step": 6271 }, { "epoch": 0.8065497878177376, "grad_norm": 0.16015625, "learning_rate": 6.501574294052644e-05, "loss": 0.0345, "step": 6272 }, { "epoch": 0.8066783831282952, "grad_norm": 0.189453125, "learning_rate": 6.500610237796801e-05, "loss": 0.0464, "step": 6273 }, { "epoch": 0.806806978438853, "grad_norm": 0.1630859375, "learning_rate": 6.499646120227781e-05, "loss": 0.0376, "step": 6274 }, { "epoch": 0.8069355737494106, "grad_norm": 0.166015625, "learning_rate": 6.498681941384972e-05, "loss": 0.0391, "step": 6275 }, { "epoch": 0.8070641690599683, "grad_norm": 0.1962890625, "learning_rate": 6.497717701307774e-05, "loss": 0.0506, "step": 6276 }, { "epoch": 0.807192764370526, "grad_norm": 0.169921875, "learning_rate": 6.496753400035579e-05, "loss": 0.0431, "step": 6277 }, { "epoch": 0.8073213596810837, "grad_norm": 0.16015625, "learning_rate": 6.495789037607792e-05, "loss": 0.0372, "step": 6278 }, { "epoch": 0.8074499549916413, "grad_norm": 0.1767578125, "learning_rate": 6.494824614063812e-05, "loss": 0.0358, "step": 6279 }, { "epoch": 0.8075785503021989, "grad_norm": 0.1591796875, "learning_rate": 6.493860129443047e-05, "loss": 0.0404, "step": 6280 }, { "epoch": 0.8077071456127567, "grad_norm": 0.19140625, "learning_rate": 6.492895583784906e-05, "loss": 0.0491, "step": 6281 }, { "epoch": 0.8078357409233143, "grad_norm": 0.185546875, "learning_rate": 6.491930977128795e-05, "loss": 0.0407, "step": 6282 }, { "epoch": 0.807964336233872, "grad_norm": 0.1591796875, "learning_rate": 6.490966309514128e-05, "loss": 0.0328, "step": 6283 }, { "epoch": 0.8080929315444296, "grad_norm": 0.212890625, "learning_rate": 6.49000158098032e-05, "loss": 0.061, "step": 6284 }, { "epoch": 0.8082215268549874, "grad_norm": 0.1591796875, "learning_rate": 6.48903679156679e-05, "loss": 0.0369, "step": 6285 }, { "epoch": 0.808350122165545, "grad_norm": 0.1689453125, "learning_rate": 6.48807194131296e-05, "loss": 0.037, "step": 6286 }, { "epoch": 0.8084787174761027, "grad_norm": 0.205078125, "learning_rate": 6.487107030258248e-05, "loss": 0.0517, "step": 6287 }, { "epoch": 0.8086073127866604, "grad_norm": 0.166015625, "learning_rate": 6.486142058442083e-05, "loss": 0.0353, "step": 6288 }, { "epoch": 0.8087359080972181, "grad_norm": 0.1845703125, "learning_rate": 6.48517702590389e-05, "loss": 0.0507, "step": 6289 }, { "epoch": 0.8088645034077757, "grad_norm": 0.193359375, "learning_rate": 6.484211932683102e-05, "loss": 0.0422, "step": 6290 }, { "epoch": 0.8089930987183334, "grad_norm": 0.1865234375, "learning_rate": 6.483246778819148e-05, "loss": 0.0497, "step": 6291 }, { "epoch": 0.8091216940288911, "grad_norm": 0.185546875, "learning_rate": 6.482281564351464e-05, "loss": 0.0457, "step": 6292 }, { "epoch": 0.8092502893394488, "grad_norm": 0.16015625, "learning_rate": 6.481316289319492e-05, "loss": 0.0325, "step": 6293 }, { "epoch": 0.8093788846500064, "grad_norm": 0.171875, "learning_rate": 6.480350953762666e-05, "loss": 0.0356, "step": 6294 }, { "epoch": 0.8095074799605642, "grad_norm": 0.169921875, "learning_rate": 6.479385557720433e-05, "loss": 0.0422, "step": 6295 }, { "epoch": 0.8096360752711218, "grad_norm": 0.1884765625, "learning_rate": 6.478420101232232e-05, "loss": 0.0438, "step": 6296 }, { "epoch": 0.8097646705816794, "grad_norm": 0.1826171875, "learning_rate": 6.477454584337515e-05, "loss": 0.0511, "step": 6297 }, { "epoch": 0.8098932658922371, "grad_norm": 0.1875, "learning_rate": 6.476489007075734e-05, "loss": 0.0421, "step": 6298 }, { "epoch": 0.8100218612027948, "grad_norm": 0.17578125, "learning_rate": 6.475523369486336e-05, "loss": 0.0422, "step": 6299 }, { "epoch": 0.8101504565133525, "grad_norm": 0.1787109375, "learning_rate": 6.47455767160878e-05, "loss": 0.0446, "step": 6300 }, { "epoch": 0.8102790518239101, "grad_norm": 0.171875, "learning_rate": 6.47359191348252e-05, "loss": 0.0387, "step": 6301 }, { "epoch": 0.8104076471344678, "grad_norm": 0.162109375, "learning_rate": 6.472626095147018e-05, "loss": 0.036, "step": 6302 }, { "epoch": 0.8105362424450255, "grad_norm": 0.177734375, "learning_rate": 6.471660216641736e-05, "loss": 0.0413, "step": 6303 }, { "epoch": 0.8106648377555832, "grad_norm": 0.158203125, "learning_rate": 6.470694278006139e-05, "loss": 0.0366, "step": 6304 }, { "epoch": 0.8107934330661408, "grad_norm": 0.166015625, "learning_rate": 6.469728279279693e-05, "loss": 0.0425, "step": 6305 }, { "epoch": 0.8109220283766986, "grad_norm": 0.1669921875, "learning_rate": 6.468762220501867e-05, "loss": 0.0397, "step": 6306 }, { "epoch": 0.8110506236872562, "grad_norm": 0.1845703125, "learning_rate": 6.467796101712134e-05, "loss": 0.0487, "step": 6307 }, { "epoch": 0.8111792189978139, "grad_norm": 0.177734375, "learning_rate": 6.466829922949968e-05, "loss": 0.0438, "step": 6308 }, { "epoch": 0.8113078143083715, "grad_norm": 0.1904296875, "learning_rate": 6.465863684254847e-05, "loss": 0.0488, "step": 6309 }, { "epoch": 0.8114364096189293, "grad_norm": 0.1552734375, "learning_rate": 6.464897385666252e-05, "loss": 0.0369, "step": 6310 }, { "epoch": 0.8115650049294869, "grad_norm": 0.1845703125, "learning_rate": 6.46393102722366e-05, "loss": 0.0455, "step": 6311 }, { "epoch": 0.8116936002400446, "grad_norm": 0.1728515625, "learning_rate": 6.462964608966558e-05, "loss": 0.0468, "step": 6312 }, { "epoch": 0.8118221955506023, "grad_norm": 0.1748046875, "learning_rate": 6.461998130934433e-05, "loss": 0.0451, "step": 6313 }, { "epoch": 0.81195079086116, "grad_norm": 0.1689453125, "learning_rate": 6.461031593166774e-05, "loss": 0.043, "step": 6314 }, { "epoch": 0.8120793861717176, "grad_norm": 0.1748046875, "learning_rate": 6.460064995703073e-05, "loss": 0.045, "step": 6315 }, { "epoch": 0.8122079814822752, "grad_norm": 0.1962890625, "learning_rate": 6.459098338582822e-05, "loss": 0.0442, "step": 6316 }, { "epoch": 0.812336576792833, "grad_norm": 0.1630859375, "learning_rate": 6.45813162184552e-05, "loss": 0.0357, "step": 6317 }, { "epoch": 0.8124651721033906, "grad_norm": 0.1728515625, "learning_rate": 6.457164845530664e-05, "loss": 0.043, "step": 6318 }, { "epoch": 0.8125937674139483, "grad_norm": 0.1875, "learning_rate": 6.456198009677756e-05, "loss": 0.0408, "step": 6319 }, { "epoch": 0.8127223627245059, "grad_norm": 0.1962890625, "learning_rate": 6.455231114326301e-05, "loss": 0.0504, "step": 6320 }, { "epoch": 0.8128509580350637, "grad_norm": 0.158203125, "learning_rate": 6.454264159515804e-05, "loss": 0.0307, "step": 6321 }, { "epoch": 0.8129795533456213, "grad_norm": 0.185546875, "learning_rate": 6.453297145285773e-05, "loss": 0.0384, "step": 6322 }, { "epoch": 0.813108148656179, "grad_norm": 0.16796875, "learning_rate": 6.452330071675719e-05, "loss": 0.039, "step": 6323 }, { "epoch": 0.8132367439667367, "grad_norm": 0.162109375, "learning_rate": 6.451362938725158e-05, "loss": 0.0403, "step": 6324 }, { "epoch": 0.8133653392772944, "grad_norm": 0.138671875, "learning_rate": 6.450395746473604e-05, "loss": 0.0304, "step": 6325 }, { "epoch": 0.813493934587852, "grad_norm": 0.1533203125, "learning_rate": 6.449428494960576e-05, "loss": 0.0348, "step": 6326 }, { "epoch": 0.8136225298984097, "grad_norm": 0.1865234375, "learning_rate": 6.448461184225594e-05, "loss": 0.0463, "step": 6327 }, { "epoch": 0.8137511252089674, "grad_norm": 0.1748046875, "learning_rate": 6.447493814308181e-05, "loss": 0.0422, "step": 6328 }, { "epoch": 0.8138797205195251, "grad_norm": 0.1630859375, "learning_rate": 6.446526385247864e-05, "loss": 0.0372, "step": 6329 }, { "epoch": 0.8140083158300827, "grad_norm": 0.1826171875, "learning_rate": 6.44555889708417e-05, "loss": 0.0413, "step": 6330 }, { "epoch": 0.8141369111406404, "grad_norm": 0.1845703125, "learning_rate": 6.44459134985663e-05, "loss": 0.0449, "step": 6331 }, { "epoch": 0.8142655064511981, "grad_norm": 0.1689453125, "learning_rate": 6.443623743604776e-05, "loss": 0.0382, "step": 6332 }, { "epoch": 0.8143941017617558, "grad_norm": 0.189453125, "learning_rate": 6.442656078368144e-05, "loss": 0.0414, "step": 6333 }, { "epoch": 0.8145226970723134, "grad_norm": 0.21875, "learning_rate": 6.441688354186271e-05, "loss": 0.0562, "step": 6334 }, { "epoch": 0.8146512923828712, "grad_norm": 0.18359375, "learning_rate": 6.440720571098698e-05, "loss": 0.0415, "step": 6335 }, { "epoch": 0.8147798876934288, "grad_norm": 0.171875, "learning_rate": 6.439752729144965e-05, "loss": 0.0373, "step": 6336 }, { "epoch": 0.8149084830039864, "grad_norm": 0.1484375, "learning_rate": 6.438784828364621e-05, "loss": 0.0331, "step": 6337 }, { "epoch": 0.8150370783145441, "grad_norm": 0.169921875, "learning_rate": 6.43781686879721e-05, "loss": 0.0415, "step": 6338 }, { "epoch": 0.8151656736251018, "grad_norm": 0.1796875, "learning_rate": 6.436848850482285e-05, "loss": 0.0361, "step": 6339 }, { "epoch": 0.8152942689356595, "grad_norm": 0.1640625, "learning_rate": 6.435880773459396e-05, "loss": 0.0402, "step": 6340 }, { "epoch": 0.8154228642462171, "grad_norm": 0.1611328125, "learning_rate": 6.434912637768096e-05, "loss": 0.0387, "step": 6341 }, { "epoch": 0.8155514595567749, "grad_norm": 0.1943359375, "learning_rate": 6.433944443447942e-05, "loss": 0.0497, "step": 6342 }, { "epoch": 0.8156800548673325, "grad_norm": 0.177734375, "learning_rate": 6.432976190538496e-05, "loss": 0.0405, "step": 6343 }, { "epoch": 0.8158086501778902, "grad_norm": 0.1650390625, "learning_rate": 6.432007879079318e-05, "loss": 0.0411, "step": 6344 }, { "epoch": 0.8159372454884478, "grad_norm": 0.1806640625, "learning_rate": 6.431039509109973e-05, "loss": 0.0437, "step": 6345 }, { "epoch": 0.8160658407990056, "grad_norm": 0.1650390625, "learning_rate": 6.430071080670026e-05, "loss": 0.041, "step": 6346 }, { "epoch": 0.8161944361095632, "grad_norm": 0.1669921875, "learning_rate": 6.429102593799048e-05, "loss": 0.0388, "step": 6347 }, { "epoch": 0.8163230314201209, "grad_norm": 0.17578125, "learning_rate": 6.428134048536609e-05, "loss": 0.0409, "step": 6348 }, { "epoch": 0.8164516267306785, "grad_norm": 0.162109375, "learning_rate": 6.427165444922281e-05, "loss": 0.039, "step": 6349 }, { "epoch": 0.8165802220412363, "grad_norm": 0.1650390625, "learning_rate": 6.426196782995642e-05, "loss": 0.0392, "step": 6350 }, { "epoch": 0.8167088173517939, "grad_norm": 0.1689453125, "learning_rate": 6.42522806279627e-05, "loss": 0.0383, "step": 6351 }, { "epoch": 0.8168374126623515, "grad_norm": 0.173828125, "learning_rate": 6.424259284363748e-05, "loss": 0.0406, "step": 6352 }, { "epoch": 0.8169660079729093, "grad_norm": 0.171875, "learning_rate": 6.423290447737654e-05, "loss": 0.0416, "step": 6353 }, { "epoch": 0.817094603283467, "grad_norm": 0.1728515625, "learning_rate": 6.422321552957578e-05, "loss": 0.0405, "step": 6354 }, { "epoch": 0.8172231985940246, "grad_norm": 0.1845703125, "learning_rate": 6.421352600063106e-05, "loss": 0.0399, "step": 6355 }, { "epoch": 0.8173517939045822, "grad_norm": 0.1513671875, "learning_rate": 6.420383589093828e-05, "loss": 0.0302, "step": 6356 }, { "epoch": 0.81748038921514, "grad_norm": 0.154296875, "learning_rate": 6.419414520089338e-05, "loss": 0.0353, "step": 6357 }, { "epoch": 0.8176089845256976, "grad_norm": 0.185546875, "learning_rate": 6.41844539308923e-05, "loss": 0.0411, "step": 6358 }, { "epoch": 0.8177375798362553, "grad_norm": 0.1884765625, "learning_rate": 6.417476208133101e-05, "loss": 0.049, "step": 6359 }, { "epoch": 0.817866175146813, "grad_norm": 0.1767578125, "learning_rate": 6.416506965260553e-05, "loss": 0.0431, "step": 6360 }, { "epoch": 0.8179947704573707, "grad_norm": 0.1953125, "learning_rate": 6.415537664511185e-05, "loss": 0.052, "step": 6361 }, { "epoch": 0.8181233657679283, "grad_norm": 0.173828125, "learning_rate": 6.414568305924605e-05, "loss": 0.0373, "step": 6362 }, { "epoch": 0.818251961078486, "grad_norm": 0.150390625, "learning_rate": 6.413598889540417e-05, "loss": 0.0327, "step": 6363 }, { "epoch": 0.8183805563890437, "grad_norm": 0.16796875, "learning_rate": 6.412629415398232e-05, "loss": 0.0399, "step": 6364 }, { "epoch": 0.8185091516996014, "grad_norm": 0.15625, "learning_rate": 6.41165988353766e-05, "loss": 0.0358, "step": 6365 }, { "epoch": 0.818637747010159, "grad_norm": 0.18359375, "learning_rate": 6.410690293998314e-05, "loss": 0.0424, "step": 6366 }, { "epoch": 0.8187663423207167, "grad_norm": 0.1708984375, "learning_rate": 6.409720646819815e-05, "loss": 0.042, "step": 6367 }, { "epoch": 0.8188949376312744, "grad_norm": 0.166015625, "learning_rate": 6.408750942041777e-05, "loss": 0.0405, "step": 6368 }, { "epoch": 0.819023532941832, "grad_norm": 0.193359375, "learning_rate": 6.407781179703823e-05, "loss": 0.05, "step": 6369 }, { "epoch": 0.8191521282523897, "grad_norm": 0.17578125, "learning_rate": 6.406811359845575e-05, "loss": 0.0406, "step": 6370 }, { "epoch": 0.8192807235629475, "grad_norm": 0.1875, "learning_rate": 6.405841482506662e-05, "loss": 0.0443, "step": 6371 }, { "epoch": 0.8194093188735051, "grad_norm": 0.171875, "learning_rate": 6.404871547726708e-05, "loss": 0.0393, "step": 6372 }, { "epoch": 0.8195379141840627, "grad_norm": 0.16015625, "learning_rate": 6.403901555545344e-05, "loss": 0.0362, "step": 6373 }, { "epoch": 0.8196665094946204, "grad_norm": 0.1689453125, "learning_rate": 6.402931506002206e-05, "loss": 0.0448, "step": 6374 }, { "epoch": 0.8197951048051781, "grad_norm": 0.173828125, "learning_rate": 6.401961399136926e-05, "loss": 0.0363, "step": 6375 }, { "epoch": 0.8199237001157358, "grad_norm": 0.177734375, "learning_rate": 6.400991234989144e-05, "loss": 0.0452, "step": 6376 }, { "epoch": 0.8200522954262934, "grad_norm": 0.169921875, "learning_rate": 6.400021013598497e-05, "loss": 0.0422, "step": 6377 }, { "epoch": 0.8201808907368511, "grad_norm": 0.158203125, "learning_rate": 6.399050735004628e-05, "loss": 0.0367, "step": 6378 }, { "epoch": 0.8203094860474088, "grad_norm": 0.197265625, "learning_rate": 6.398080399247182e-05, "loss": 0.0436, "step": 6379 }, { "epoch": 0.8204380813579665, "grad_norm": 0.1533203125, "learning_rate": 6.397110006365806e-05, "loss": 0.0349, "step": 6380 }, { "epoch": 0.8205666766685241, "grad_norm": 0.1923828125, "learning_rate": 6.39613955640015e-05, "loss": 0.0508, "step": 6381 }, { "epoch": 0.8206952719790819, "grad_norm": 0.1689453125, "learning_rate": 6.395169049389862e-05, "loss": 0.0368, "step": 6382 }, { "epoch": 0.8208238672896395, "grad_norm": 0.19921875, "learning_rate": 6.394198485374599e-05, "loss": 0.053, "step": 6383 }, { "epoch": 0.8209524626001972, "grad_norm": 0.16796875, "learning_rate": 6.393227864394014e-05, "loss": 0.0326, "step": 6384 }, { "epoch": 0.8210810579107548, "grad_norm": 0.1640625, "learning_rate": 6.392257186487769e-05, "loss": 0.0379, "step": 6385 }, { "epoch": 0.8212096532213126, "grad_norm": 0.177734375, "learning_rate": 6.391286451695522e-05, "loss": 0.0449, "step": 6386 }, { "epoch": 0.8213382485318702, "grad_norm": 0.1552734375, "learning_rate": 6.39031566005694e-05, "loss": 0.0379, "step": 6387 }, { "epoch": 0.8214668438424279, "grad_norm": 0.1904296875, "learning_rate": 6.389344811611685e-05, "loss": 0.0461, "step": 6388 }, { "epoch": 0.8215954391529856, "grad_norm": 0.1640625, "learning_rate": 6.388373906399425e-05, "loss": 0.0375, "step": 6389 }, { "epoch": 0.8217240344635433, "grad_norm": 0.1904296875, "learning_rate": 6.387402944459828e-05, "loss": 0.0509, "step": 6390 }, { "epoch": 0.8218526297741009, "grad_norm": 0.1796875, "learning_rate": 6.386431925832572e-05, "loss": 0.039, "step": 6391 }, { "epoch": 0.8219812250846585, "grad_norm": 0.1416015625, "learning_rate": 6.385460850557327e-05, "loss": 0.0288, "step": 6392 }, { "epoch": 0.8221098203952163, "grad_norm": 0.1689453125, "learning_rate": 6.384489718673771e-05, "loss": 0.0409, "step": 6393 }, { "epoch": 0.8222384157057739, "grad_norm": 0.1708984375, "learning_rate": 6.383518530221586e-05, "loss": 0.0394, "step": 6394 }, { "epoch": 0.8223670110163316, "grad_norm": 0.166015625, "learning_rate": 6.382547285240451e-05, "loss": 0.0377, "step": 6395 }, { "epoch": 0.8224956063268892, "grad_norm": 0.1650390625, "learning_rate": 6.381575983770049e-05, "loss": 0.035, "step": 6396 }, { "epoch": 0.822624201637447, "grad_norm": 0.1455078125, "learning_rate": 6.380604625850068e-05, "loss": 0.0323, "step": 6397 }, { "epoch": 0.8227527969480046, "grad_norm": 0.1796875, "learning_rate": 6.379633211520197e-05, "loss": 0.0489, "step": 6398 }, { "epoch": 0.8228813922585623, "grad_norm": 0.1787109375, "learning_rate": 6.378661740820127e-05, "loss": 0.0412, "step": 6399 }, { "epoch": 0.82300998756912, "grad_norm": 0.166015625, "learning_rate": 6.37769021378955e-05, "loss": 0.0381, "step": 6400 }, { "epoch": 0.8231385828796777, "grad_norm": 0.1689453125, "learning_rate": 6.376718630468159e-05, "loss": 0.0471, "step": 6401 }, { "epoch": 0.8232671781902353, "grad_norm": 0.181640625, "learning_rate": 6.375746990895656e-05, "loss": 0.0348, "step": 6402 }, { "epoch": 0.823395773500793, "grad_norm": 0.166015625, "learning_rate": 6.37477529511174e-05, "loss": 0.0413, "step": 6403 }, { "epoch": 0.8235243688113507, "grad_norm": 0.154296875, "learning_rate": 6.373803543156112e-05, "loss": 0.037, "step": 6404 }, { "epoch": 0.8236529641219084, "grad_norm": 0.185546875, "learning_rate": 6.372831735068479e-05, "loss": 0.0417, "step": 6405 }, { "epoch": 0.823781559432466, "grad_norm": 0.1865234375, "learning_rate": 6.371859870888545e-05, "loss": 0.0456, "step": 6406 }, { "epoch": 0.8239101547430238, "grad_norm": 0.154296875, "learning_rate": 6.370887950656021e-05, "loss": 0.0377, "step": 6407 }, { "epoch": 0.8240387500535814, "grad_norm": 0.2119140625, "learning_rate": 6.369915974410618e-05, "loss": 0.0426, "step": 6408 }, { "epoch": 0.824167345364139, "grad_norm": 0.181640625, "learning_rate": 6.368943942192051e-05, "loss": 0.0483, "step": 6409 }, { "epoch": 0.8242959406746967, "grad_norm": 0.1953125, "learning_rate": 6.367971854040034e-05, "loss": 0.048, "step": 6410 }, { "epoch": 0.8244245359852544, "grad_norm": 0.18359375, "learning_rate": 6.366999709994288e-05, "loss": 0.0546, "step": 6411 }, { "epoch": 0.8245531312958121, "grad_norm": 0.1728515625, "learning_rate": 6.366027510094532e-05, "loss": 0.0349, "step": 6412 }, { "epoch": 0.8246817266063697, "grad_norm": 0.1982421875, "learning_rate": 6.365055254380487e-05, "loss": 0.0494, "step": 6413 }, { "epoch": 0.8248103219169274, "grad_norm": 0.16796875, "learning_rate": 6.364082942891881e-05, "loss": 0.0406, "step": 6414 }, { "epoch": 0.8249389172274851, "grad_norm": 0.1748046875, "learning_rate": 6.363110575668443e-05, "loss": 0.042, "step": 6415 }, { "epoch": 0.8250675125380428, "grad_norm": 0.177734375, "learning_rate": 6.362138152749899e-05, "loss": 0.0405, "step": 6416 }, { "epoch": 0.8251961078486004, "grad_norm": 0.1650390625, "learning_rate": 6.361165674175982e-05, "loss": 0.04, "step": 6417 }, { "epoch": 0.8253247031591582, "grad_norm": 0.177734375, "learning_rate": 6.360193139986427e-05, "loss": 0.0424, "step": 6418 }, { "epoch": 0.8254532984697158, "grad_norm": 0.1689453125, "learning_rate": 6.359220550220973e-05, "loss": 0.0362, "step": 6419 }, { "epoch": 0.8255818937802735, "grad_norm": 0.201171875, "learning_rate": 6.358247904919354e-05, "loss": 0.0471, "step": 6420 }, { "epoch": 0.8257104890908311, "grad_norm": 0.162109375, "learning_rate": 6.357275204121315e-05, "loss": 0.0354, "step": 6421 }, { "epoch": 0.8258390844013889, "grad_norm": 0.181640625, "learning_rate": 6.356302447866599e-05, "loss": 0.0469, "step": 6422 }, { "epoch": 0.8259676797119465, "grad_norm": 0.17578125, "learning_rate": 6.35532963619495e-05, "loss": 0.0437, "step": 6423 }, { "epoch": 0.8260962750225042, "grad_norm": 0.166015625, "learning_rate": 6.354356769146117e-05, "loss": 0.0413, "step": 6424 }, { "epoch": 0.8262248703330618, "grad_norm": 0.1591796875, "learning_rate": 6.35338384675985e-05, "loss": 0.0399, "step": 6425 }, { "epoch": 0.8263534656436196, "grad_norm": 0.15234375, "learning_rate": 6.3524108690759e-05, "loss": 0.0366, "step": 6426 }, { "epoch": 0.8264820609541772, "grad_norm": 0.185546875, "learning_rate": 6.351437836134024e-05, "loss": 0.049, "step": 6427 }, { "epoch": 0.8266106562647348, "grad_norm": 0.1611328125, "learning_rate": 6.350464747973978e-05, "loss": 0.0393, "step": 6428 }, { "epoch": 0.8267392515752926, "grad_norm": 0.177734375, "learning_rate": 6.349491604635522e-05, "loss": 0.0439, "step": 6429 }, { "epoch": 0.8268678468858502, "grad_norm": 0.1845703125, "learning_rate": 6.348518406158416e-05, "loss": 0.0448, "step": 6430 }, { "epoch": 0.8269964421964079, "grad_norm": 0.177734375, "learning_rate": 6.347545152582426e-05, "loss": 0.0402, "step": 6431 }, { "epoch": 0.8271250375069655, "grad_norm": 0.1640625, "learning_rate": 6.346571843947315e-05, "loss": 0.0393, "step": 6432 }, { "epoch": 0.8272536328175233, "grad_norm": 0.181640625, "learning_rate": 6.345598480292853e-05, "loss": 0.05, "step": 6433 }, { "epoch": 0.8273822281280809, "grad_norm": 0.16796875, "learning_rate": 6.344625061658811e-05, "loss": 0.0443, "step": 6434 }, { "epoch": 0.8275108234386386, "grad_norm": 0.1611328125, "learning_rate": 6.343651588084963e-05, "loss": 0.033, "step": 6435 }, { "epoch": 0.8276394187491963, "grad_norm": 0.1650390625, "learning_rate": 6.34267805961108e-05, "loss": 0.0378, "step": 6436 }, { "epoch": 0.827768014059754, "grad_norm": 0.1865234375, "learning_rate": 6.341704476276941e-05, "loss": 0.0457, "step": 6437 }, { "epoch": 0.8278966093703116, "grad_norm": 0.205078125, "learning_rate": 6.340730838122328e-05, "loss": 0.0458, "step": 6438 }, { "epoch": 0.8280252046808693, "grad_norm": 0.1953125, "learning_rate": 6.339757145187017e-05, "loss": 0.0398, "step": 6439 }, { "epoch": 0.828153799991427, "grad_norm": 0.154296875, "learning_rate": 6.338783397510797e-05, "loss": 0.0371, "step": 6440 }, { "epoch": 0.8282823953019847, "grad_norm": 0.146484375, "learning_rate": 6.337809595133455e-05, "loss": 0.0347, "step": 6441 }, { "epoch": 0.8284109906125423, "grad_norm": 0.1767578125, "learning_rate": 6.336835738094775e-05, "loss": 0.0387, "step": 6442 }, { "epoch": 0.8285395859231, "grad_norm": 0.1865234375, "learning_rate": 6.335861826434551e-05, "loss": 0.0493, "step": 6443 }, { "epoch": 0.8286681812336577, "grad_norm": 0.166015625, "learning_rate": 6.334887860192573e-05, "loss": 0.0415, "step": 6444 }, { "epoch": 0.8287967765442154, "grad_norm": 0.1787109375, "learning_rate": 6.333913839408639e-05, "loss": 0.0446, "step": 6445 }, { "epoch": 0.828925371854773, "grad_norm": 0.16015625, "learning_rate": 6.332939764122545e-05, "loss": 0.0368, "step": 6446 }, { "epoch": 0.8290539671653308, "grad_norm": 0.150390625, "learning_rate": 6.331965634374091e-05, "loss": 0.0345, "step": 6447 }, { "epoch": 0.8291825624758884, "grad_norm": 0.1865234375, "learning_rate": 6.330991450203078e-05, "loss": 0.0431, "step": 6448 }, { "epoch": 0.829311157786446, "grad_norm": 0.173828125, "learning_rate": 6.33001721164931e-05, "loss": 0.04, "step": 6449 }, { "epoch": 0.8294397530970037, "grad_norm": 0.1865234375, "learning_rate": 6.329042918752596e-05, "loss": 0.0362, "step": 6450 }, { "epoch": 0.8295683484075614, "grad_norm": 0.162109375, "learning_rate": 6.328068571552741e-05, "loss": 0.0387, "step": 6451 }, { "epoch": 0.8296969437181191, "grad_norm": 0.185546875, "learning_rate": 6.327094170089556e-05, "loss": 0.0456, "step": 6452 }, { "epoch": 0.8298255390286767, "grad_norm": 0.1533203125, "learning_rate": 6.326119714402856e-05, "loss": 0.0366, "step": 6453 }, { "epoch": 0.8299541343392345, "grad_norm": 0.19140625, "learning_rate": 6.325145204532455e-05, "loss": 0.0468, "step": 6454 }, { "epoch": 0.8300827296497921, "grad_norm": 0.162109375, "learning_rate": 6.32417064051817e-05, "loss": 0.0354, "step": 6455 }, { "epoch": 0.8302113249603498, "grad_norm": 0.1826171875, "learning_rate": 6.323196022399819e-05, "loss": 0.0446, "step": 6456 }, { "epoch": 0.8303399202709074, "grad_norm": 0.1728515625, "learning_rate": 6.32222135021723e-05, "loss": 0.0425, "step": 6457 }, { "epoch": 0.8304685155814652, "grad_norm": 0.19140625, "learning_rate": 6.321246624010218e-05, "loss": 0.0442, "step": 6458 }, { "epoch": 0.8305971108920228, "grad_norm": 0.1865234375, "learning_rate": 6.320271843818618e-05, "loss": 0.046, "step": 6459 }, { "epoch": 0.8307257062025805, "grad_norm": 0.1689453125, "learning_rate": 6.319297009682252e-05, "loss": 0.0355, "step": 6460 }, { "epoch": 0.8308543015131381, "grad_norm": 0.1845703125, "learning_rate": 6.318322121640951e-05, "loss": 0.0451, "step": 6461 }, { "epoch": 0.8309828968236959, "grad_norm": 0.1875, "learning_rate": 6.317347179734554e-05, "loss": 0.0463, "step": 6462 }, { "epoch": 0.8311114921342535, "grad_norm": 0.1923828125, "learning_rate": 6.316372184002888e-05, "loss": 0.0442, "step": 6463 }, { "epoch": 0.8312400874448111, "grad_norm": 0.1689453125, "learning_rate": 6.315397134485794e-05, "loss": 0.0369, "step": 6464 }, { "epoch": 0.8313686827553689, "grad_norm": 0.1806640625, "learning_rate": 6.314422031223112e-05, "loss": 0.036, "step": 6465 }, { "epoch": 0.8314972780659265, "grad_norm": 0.1953125, "learning_rate": 6.313446874254682e-05, "loss": 0.0436, "step": 6466 }, { "epoch": 0.8316258733764842, "grad_norm": 0.1669921875, "learning_rate": 6.312471663620348e-05, "loss": 0.0364, "step": 6467 }, { "epoch": 0.8317544686870418, "grad_norm": 0.1611328125, "learning_rate": 6.311496399359958e-05, "loss": 0.0383, "step": 6468 }, { "epoch": 0.8318830639975996, "grad_norm": 0.169921875, "learning_rate": 6.310521081513358e-05, "loss": 0.0427, "step": 6469 }, { "epoch": 0.8320116593081572, "grad_norm": 0.1826171875, "learning_rate": 6.309545710120397e-05, "loss": 0.0466, "step": 6470 }, { "epoch": 0.8321402546187149, "grad_norm": 0.1923828125, "learning_rate": 6.308570285220932e-05, "loss": 0.0535, "step": 6471 }, { "epoch": 0.8322688499292725, "grad_norm": 0.1611328125, "learning_rate": 6.307594806854814e-05, "loss": 0.0355, "step": 6472 }, { "epoch": 0.8323974452398303, "grad_norm": 0.1611328125, "learning_rate": 6.306619275061901e-05, "loss": 0.0391, "step": 6473 }, { "epoch": 0.8325260405503879, "grad_norm": 0.1630859375, "learning_rate": 6.305643689882052e-05, "loss": 0.0386, "step": 6474 }, { "epoch": 0.8326546358609456, "grad_norm": 0.158203125, "learning_rate": 6.304668051355129e-05, "loss": 0.0372, "step": 6475 }, { "epoch": 0.8327832311715033, "grad_norm": 0.1572265625, "learning_rate": 6.303692359520995e-05, "loss": 0.0334, "step": 6476 }, { "epoch": 0.832911826482061, "grad_norm": 0.166015625, "learning_rate": 6.302716614419515e-05, "loss": 0.0391, "step": 6477 }, { "epoch": 0.8330404217926186, "grad_norm": 0.18359375, "learning_rate": 6.301740816090557e-05, "loss": 0.0449, "step": 6478 }, { "epoch": 0.8331690171031763, "grad_norm": 0.1572265625, "learning_rate": 6.300764964573991e-05, "loss": 0.0342, "step": 6479 }, { "epoch": 0.833297612413734, "grad_norm": 0.1787109375, "learning_rate": 6.29978905990969e-05, "loss": 0.0416, "step": 6480 }, { "epoch": 0.8334262077242917, "grad_norm": 0.166015625, "learning_rate": 6.298813102137528e-05, "loss": 0.0386, "step": 6481 }, { "epoch": 0.8335548030348493, "grad_norm": 0.1689453125, "learning_rate": 6.297837091297383e-05, "loss": 0.0378, "step": 6482 }, { "epoch": 0.8336833983454071, "grad_norm": 0.193359375, "learning_rate": 6.296861027429129e-05, "loss": 0.0472, "step": 6483 }, { "epoch": 0.8338119936559647, "grad_norm": 0.1630859375, "learning_rate": 6.295884910572652e-05, "loss": 0.0372, "step": 6484 }, { "epoch": 0.8339405889665223, "grad_norm": 0.1845703125, "learning_rate": 6.29490874076783e-05, "loss": 0.0413, "step": 6485 }, { "epoch": 0.83406918427708, "grad_norm": 0.1552734375, "learning_rate": 6.293932518054552e-05, "loss": 0.0341, "step": 6486 }, { "epoch": 0.8341977795876377, "grad_norm": 0.1787109375, "learning_rate": 6.292956242472705e-05, "loss": 0.0432, "step": 6487 }, { "epoch": 0.8343263748981954, "grad_norm": 0.1865234375, "learning_rate": 6.291979914062178e-05, "loss": 0.0458, "step": 6488 }, { "epoch": 0.834454970208753, "grad_norm": 0.1875, "learning_rate": 6.29100353286286e-05, "loss": 0.0451, "step": 6489 }, { "epoch": 0.8345835655193107, "grad_norm": 0.1650390625, "learning_rate": 6.290027098914648e-05, "loss": 0.0397, "step": 6490 }, { "epoch": 0.8347121608298684, "grad_norm": 0.1826171875, "learning_rate": 6.289050612257438e-05, "loss": 0.0417, "step": 6491 }, { "epoch": 0.8348407561404261, "grad_norm": 0.1884765625, "learning_rate": 6.288074072931127e-05, "loss": 0.0383, "step": 6492 }, { "epoch": 0.8349693514509837, "grad_norm": 0.1591796875, "learning_rate": 6.287097480975616e-05, "loss": 0.0368, "step": 6493 }, { "epoch": 0.8350979467615415, "grad_norm": 0.1611328125, "learning_rate": 6.286120836430806e-05, "loss": 0.0381, "step": 6494 }, { "epoch": 0.8352265420720991, "grad_norm": 0.1484375, "learning_rate": 6.285144139336603e-05, "loss": 0.0317, "step": 6495 }, { "epoch": 0.8353551373826568, "grad_norm": 0.171875, "learning_rate": 6.284167389732914e-05, "loss": 0.0435, "step": 6496 }, { "epoch": 0.8354837326932144, "grad_norm": 0.166015625, "learning_rate": 6.283190587659644e-05, "loss": 0.041, "step": 6497 }, { "epoch": 0.8356123280037722, "grad_norm": 0.1767578125, "learning_rate": 6.282213733156709e-05, "loss": 0.0419, "step": 6498 }, { "epoch": 0.8357409233143298, "grad_norm": 0.1689453125, "learning_rate": 6.281236826264018e-05, "loss": 0.0337, "step": 6499 }, { "epoch": 0.8358695186248875, "grad_norm": 0.171875, "learning_rate": 6.280259867021492e-05, "loss": 0.0342, "step": 6500 }, { "epoch": 0.8358695186248875, "eval_loss": 0.04029998928308487, "eval_runtime": 1042.6689, "eval_samples_per_second": 94.206, "eval_steps_per_second": 1.178, "step": 6500 }, { "epoch": 0.8359981139354452, "grad_norm": 0.171875, "learning_rate": 6.279282855469042e-05, "loss": 0.0415, "step": 6501 }, { "epoch": 0.8361267092460029, "grad_norm": 0.1689453125, "learning_rate": 6.27830579164659e-05, "loss": 0.0342, "step": 6502 }, { "epoch": 0.8362553045565605, "grad_norm": 0.17578125, "learning_rate": 6.277328675594058e-05, "loss": 0.0405, "step": 6503 }, { "epoch": 0.8363838998671181, "grad_norm": 0.1796875, "learning_rate": 6.276351507351372e-05, "loss": 0.0431, "step": 6504 }, { "epoch": 0.8365124951776759, "grad_norm": 0.1708984375, "learning_rate": 6.275374286958455e-05, "loss": 0.0441, "step": 6505 }, { "epoch": 0.8366410904882335, "grad_norm": 0.1962890625, "learning_rate": 6.274397014455236e-05, "loss": 0.0401, "step": 6506 }, { "epoch": 0.8367696857987912, "grad_norm": 0.1962890625, "learning_rate": 6.273419689881644e-05, "loss": 0.0461, "step": 6507 }, { "epoch": 0.8368982811093488, "grad_norm": 0.16015625, "learning_rate": 6.272442313277612e-05, "loss": 0.0381, "step": 6508 }, { "epoch": 0.8370268764199066, "grad_norm": 0.1865234375, "learning_rate": 6.271464884683075e-05, "loss": 0.0457, "step": 6509 }, { "epoch": 0.8371554717304642, "grad_norm": 0.169921875, "learning_rate": 6.27048740413797e-05, "loss": 0.0411, "step": 6510 }, { "epoch": 0.8372840670410219, "grad_norm": 0.1884765625, "learning_rate": 6.269509871682235e-05, "loss": 0.0506, "step": 6511 }, { "epoch": 0.8374126623515796, "grad_norm": 0.1572265625, "learning_rate": 6.268532287355812e-05, "loss": 0.033, "step": 6512 }, { "epoch": 0.8375412576621373, "grad_norm": 0.1708984375, "learning_rate": 6.26755465119864e-05, "loss": 0.0456, "step": 6513 }, { "epoch": 0.8376698529726949, "grad_norm": 0.1953125, "learning_rate": 6.26657696325067e-05, "loss": 0.0493, "step": 6514 }, { "epoch": 0.8377984482832526, "grad_norm": 0.16015625, "learning_rate": 6.265599223551846e-05, "loss": 0.037, "step": 6515 }, { "epoch": 0.8379270435938103, "grad_norm": 0.1669921875, "learning_rate": 6.264621432142115e-05, "loss": 0.0347, "step": 6516 }, { "epoch": 0.838055638904368, "grad_norm": 0.1767578125, "learning_rate": 6.263643589061434e-05, "loss": 0.0412, "step": 6517 }, { "epoch": 0.8381842342149256, "grad_norm": 0.1787109375, "learning_rate": 6.262665694349752e-05, "loss": 0.0448, "step": 6518 }, { "epoch": 0.8383128295254832, "grad_norm": 0.16015625, "learning_rate": 6.261687748047025e-05, "loss": 0.0351, "step": 6519 }, { "epoch": 0.838441424836041, "grad_norm": 0.16015625, "learning_rate": 6.260709750193214e-05, "loss": 0.0357, "step": 6520 }, { "epoch": 0.8385700201465986, "grad_norm": 0.171875, "learning_rate": 6.259731700828275e-05, "loss": 0.0443, "step": 6521 }, { "epoch": 0.8386986154571563, "grad_norm": 0.1728515625, "learning_rate": 6.258753599992172e-05, "loss": 0.0403, "step": 6522 }, { "epoch": 0.838827210767714, "grad_norm": 0.16015625, "learning_rate": 6.257775447724868e-05, "loss": 0.0338, "step": 6523 }, { "epoch": 0.8389558060782717, "grad_norm": 0.1708984375, "learning_rate": 6.256797244066331e-05, "loss": 0.0357, "step": 6524 }, { "epoch": 0.8390844013888293, "grad_norm": 0.1826171875, "learning_rate": 6.255818989056527e-05, "loss": 0.0414, "step": 6525 }, { "epoch": 0.839212996699387, "grad_norm": 0.1943359375, "learning_rate": 6.25484068273543e-05, "loss": 0.053, "step": 6526 }, { "epoch": 0.8393415920099447, "grad_norm": 0.15234375, "learning_rate": 6.253862325143007e-05, "loss": 0.0345, "step": 6527 }, { "epoch": 0.8394701873205024, "grad_norm": 0.1845703125, "learning_rate": 6.252883916319237e-05, "loss": 0.0449, "step": 6528 }, { "epoch": 0.83959878263106, "grad_norm": 0.1904296875, "learning_rate": 6.251905456304095e-05, "loss": 0.0445, "step": 6529 }, { "epoch": 0.8397273779416178, "grad_norm": 0.1572265625, "learning_rate": 6.250926945137558e-05, "loss": 0.0379, "step": 6530 }, { "epoch": 0.8398559732521754, "grad_norm": 0.15625, "learning_rate": 6.249948382859612e-05, "loss": 0.037, "step": 6531 }, { "epoch": 0.8399845685627331, "grad_norm": 0.1689453125, "learning_rate": 6.248969769510234e-05, "loss": 0.0339, "step": 6532 }, { "epoch": 0.8401131638732907, "grad_norm": 0.166015625, "learning_rate": 6.247991105129411e-05, "loss": 0.0406, "step": 6533 }, { "epoch": 0.8402417591838485, "grad_norm": 0.1826171875, "learning_rate": 6.247012389757132e-05, "loss": 0.0423, "step": 6534 }, { "epoch": 0.8403703544944061, "grad_norm": 0.17578125, "learning_rate": 6.246033623433384e-05, "loss": 0.0358, "step": 6535 }, { "epoch": 0.8404989498049638, "grad_norm": 0.150390625, "learning_rate": 6.24505480619816e-05, "loss": 0.0324, "step": 6536 }, { "epoch": 0.8406275451155214, "grad_norm": 0.1669921875, "learning_rate": 6.244075938091452e-05, "loss": 0.0374, "step": 6537 }, { "epoch": 0.8407561404260792, "grad_norm": 0.171875, "learning_rate": 6.243097019153255e-05, "loss": 0.0365, "step": 6538 }, { "epoch": 0.8408847357366368, "grad_norm": 0.166015625, "learning_rate": 6.242118049423568e-05, "loss": 0.0341, "step": 6539 }, { "epoch": 0.8410133310471944, "grad_norm": 0.1796875, "learning_rate": 6.241139028942389e-05, "loss": 0.0444, "step": 6540 }, { "epoch": 0.8411419263577522, "grad_norm": 0.1611328125, "learning_rate": 6.240159957749722e-05, "loss": 0.0353, "step": 6541 }, { "epoch": 0.8412705216683098, "grad_norm": 0.1728515625, "learning_rate": 6.239180835885569e-05, "loss": 0.0392, "step": 6542 }, { "epoch": 0.8413991169788675, "grad_norm": 0.1806640625, "learning_rate": 6.238201663389935e-05, "loss": 0.0423, "step": 6543 }, { "epoch": 0.8415277122894251, "grad_norm": 0.16796875, "learning_rate": 6.237222440302828e-05, "loss": 0.0409, "step": 6544 }, { "epoch": 0.8416563075999829, "grad_norm": 0.1796875, "learning_rate": 6.23624316666426e-05, "loss": 0.0416, "step": 6545 }, { "epoch": 0.8417849029105405, "grad_norm": 0.1796875, "learning_rate": 6.235263842514241e-05, "loss": 0.0446, "step": 6546 }, { "epoch": 0.8419134982210982, "grad_norm": 0.181640625, "learning_rate": 6.234284467892786e-05, "loss": 0.0398, "step": 6547 }, { "epoch": 0.8420420935316559, "grad_norm": 0.171875, "learning_rate": 6.233305042839911e-05, "loss": 0.0427, "step": 6548 }, { "epoch": 0.8421706888422136, "grad_norm": 0.2060546875, "learning_rate": 6.232325567395634e-05, "loss": 0.0384, "step": 6549 }, { "epoch": 0.8422992841527712, "grad_norm": 0.1962890625, "learning_rate": 6.231346041599974e-05, "loss": 0.0411, "step": 6550 }, { "epoch": 0.8424278794633289, "grad_norm": 0.1630859375, "learning_rate": 6.230366465492957e-05, "loss": 0.0385, "step": 6551 }, { "epoch": 0.8425564747738866, "grad_norm": 0.1552734375, "learning_rate": 6.229386839114603e-05, "loss": 0.0306, "step": 6552 }, { "epoch": 0.8426850700844443, "grad_norm": 0.1884765625, "learning_rate": 6.228407162504939e-05, "loss": 0.0448, "step": 6553 }, { "epoch": 0.8428136653950019, "grad_norm": 0.181640625, "learning_rate": 6.227427435703997e-05, "loss": 0.0453, "step": 6554 }, { "epoch": 0.8429422607055596, "grad_norm": 0.1630859375, "learning_rate": 6.226447658751804e-05, "loss": 0.0361, "step": 6555 }, { "epoch": 0.8430708560161173, "grad_norm": 0.171875, "learning_rate": 6.225467831688393e-05, "loss": 0.0403, "step": 6556 }, { "epoch": 0.843199451326675, "grad_norm": 0.1591796875, "learning_rate": 6.224487954553799e-05, "loss": 0.0311, "step": 6557 }, { "epoch": 0.8433280466372326, "grad_norm": 0.2021484375, "learning_rate": 6.22350802738806e-05, "loss": 0.0516, "step": 6558 }, { "epoch": 0.8434566419477904, "grad_norm": 0.1796875, "learning_rate": 6.222528050231213e-05, "loss": 0.0406, "step": 6559 }, { "epoch": 0.843585237258348, "grad_norm": 0.185546875, "learning_rate": 6.221548023123301e-05, "loss": 0.0459, "step": 6560 }, { "epoch": 0.8437138325689056, "grad_norm": 0.1708984375, "learning_rate": 6.220567946104363e-05, "loss": 0.0362, "step": 6561 }, { "epoch": 0.8438424278794633, "grad_norm": 0.1826171875, "learning_rate": 6.219587819214448e-05, "loss": 0.051, "step": 6562 }, { "epoch": 0.843971023190021, "grad_norm": 0.1767578125, "learning_rate": 6.218607642493598e-05, "loss": 0.0454, "step": 6563 }, { "epoch": 0.8440996185005787, "grad_norm": 0.1630859375, "learning_rate": 6.217627415981866e-05, "loss": 0.0401, "step": 6564 }, { "epoch": 0.8442282138111363, "grad_norm": 0.1416015625, "learning_rate": 6.216647139719301e-05, "loss": 0.0301, "step": 6565 }, { "epoch": 0.844356809121694, "grad_norm": 0.1708984375, "learning_rate": 6.215666813745958e-05, "loss": 0.0381, "step": 6566 }, { "epoch": 0.8444854044322517, "grad_norm": 0.1611328125, "learning_rate": 6.21468643810189e-05, "loss": 0.0343, "step": 6567 }, { "epoch": 0.8446139997428094, "grad_norm": 0.15625, "learning_rate": 6.213706012827151e-05, "loss": 0.0347, "step": 6568 }, { "epoch": 0.844742595053367, "grad_norm": 0.1904296875, "learning_rate": 6.212725537961807e-05, "loss": 0.057, "step": 6569 }, { "epoch": 0.8448711903639248, "grad_norm": 0.1904296875, "learning_rate": 6.211745013545913e-05, "loss": 0.0446, "step": 6570 }, { "epoch": 0.8449997856744824, "grad_norm": 0.1435546875, "learning_rate": 6.210764439619535e-05, "loss": 0.0318, "step": 6571 }, { "epoch": 0.8451283809850401, "grad_norm": 0.166015625, "learning_rate": 6.20978381622274e-05, "loss": 0.0372, "step": 6572 }, { "epoch": 0.8452569762955977, "grad_norm": 0.181640625, "learning_rate": 6.20880314339559e-05, "loss": 0.041, "step": 6573 }, { "epoch": 0.8453855716061555, "grad_norm": 0.1611328125, "learning_rate": 6.207822421178157e-05, "loss": 0.0335, "step": 6574 }, { "epoch": 0.8455141669167131, "grad_norm": 0.1787109375, "learning_rate": 6.206841649610514e-05, "loss": 0.0417, "step": 6575 }, { "epoch": 0.8456427622272708, "grad_norm": 0.1630859375, "learning_rate": 6.205860828732732e-05, "loss": 0.0314, "step": 6576 }, { "epoch": 0.8457713575378285, "grad_norm": 0.1689453125, "learning_rate": 6.204879958584887e-05, "loss": 0.0362, "step": 6577 }, { "epoch": 0.8458999528483861, "grad_norm": 0.1669921875, "learning_rate": 6.203899039207052e-05, "loss": 0.0427, "step": 6578 }, { "epoch": 0.8460285481589438, "grad_norm": 0.1962890625, "learning_rate": 6.202918070639314e-05, "loss": 0.0513, "step": 6579 }, { "epoch": 0.8461571434695014, "grad_norm": 0.2119140625, "learning_rate": 6.201937052921747e-05, "loss": 0.0363, "step": 6580 }, { "epoch": 0.8462857387800592, "grad_norm": 0.1728515625, "learning_rate": 6.200955986094441e-05, "loss": 0.0431, "step": 6581 }, { "epoch": 0.8464143340906168, "grad_norm": 0.1748046875, "learning_rate": 6.199974870197475e-05, "loss": 0.0363, "step": 6582 }, { "epoch": 0.8465429294011745, "grad_norm": 0.173828125, "learning_rate": 6.198993705270939e-05, "loss": 0.035, "step": 6583 }, { "epoch": 0.8466715247117321, "grad_norm": 0.16796875, "learning_rate": 6.198012491354922e-05, "loss": 0.0419, "step": 6584 }, { "epoch": 0.8468001200222899, "grad_norm": 0.1708984375, "learning_rate": 6.197031228489516e-05, "loss": 0.0362, "step": 6585 }, { "epoch": 0.8469287153328475, "grad_norm": 0.171875, "learning_rate": 6.196049916714814e-05, "loss": 0.0458, "step": 6586 }, { "epoch": 0.8470573106434052, "grad_norm": 0.3046875, "learning_rate": 6.195068556070909e-05, "loss": 0.0355, "step": 6587 }, { "epoch": 0.8471859059539629, "grad_norm": 0.1611328125, "learning_rate": 6.194087146597902e-05, "loss": 0.0297, "step": 6588 }, { "epoch": 0.8473145012645206, "grad_norm": 0.162109375, "learning_rate": 6.193105688335889e-05, "loss": 0.0416, "step": 6589 }, { "epoch": 0.8474430965750782, "grad_norm": 0.224609375, "learning_rate": 6.192124181324973e-05, "loss": 0.0472, "step": 6590 }, { "epoch": 0.8475716918856359, "grad_norm": 0.173828125, "learning_rate": 6.191142625605259e-05, "loss": 0.0422, "step": 6591 }, { "epoch": 0.8477002871961936, "grad_norm": 0.1748046875, "learning_rate": 6.190161021216848e-05, "loss": 0.0389, "step": 6592 }, { "epoch": 0.8478288825067513, "grad_norm": 0.1796875, "learning_rate": 6.18917936819985e-05, "loss": 0.0391, "step": 6593 }, { "epoch": 0.8479574778173089, "grad_norm": 0.1904296875, "learning_rate": 6.18819766659437e-05, "loss": 0.0473, "step": 6594 }, { "epoch": 0.8480860731278667, "grad_norm": 0.1728515625, "learning_rate": 6.187215916440525e-05, "loss": 0.0389, "step": 6595 }, { "epoch": 0.8482146684384243, "grad_norm": 0.169921875, "learning_rate": 6.186234117778426e-05, "loss": 0.0363, "step": 6596 }, { "epoch": 0.848343263748982, "grad_norm": 0.19140625, "learning_rate": 6.185252270648189e-05, "loss": 0.0479, "step": 6597 }, { "epoch": 0.8484718590595396, "grad_norm": 0.1650390625, "learning_rate": 6.18427037508993e-05, "loss": 0.0363, "step": 6598 }, { "epoch": 0.8486004543700973, "grad_norm": 0.1552734375, "learning_rate": 6.183288431143766e-05, "loss": 0.0362, "step": 6599 }, { "epoch": 0.848729049680655, "grad_norm": 0.1953125, "learning_rate": 6.18230643884982e-05, "loss": 0.047, "step": 6600 }, { "epoch": 0.8488576449912126, "grad_norm": 0.1640625, "learning_rate": 6.181324398248218e-05, "loss": 0.0406, "step": 6601 }, { "epoch": 0.8489862403017703, "grad_norm": 0.173828125, "learning_rate": 6.180342309379081e-05, "loss": 0.039, "step": 6602 }, { "epoch": 0.849114835612328, "grad_norm": 0.1708984375, "learning_rate": 6.179360172282538e-05, "loss": 0.0403, "step": 6603 }, { "epoch": 0.8492434309228857, "grad_norm": 0.1806640625, "learning_rate": 6.178377986998715e-05, "loss": 0.0447, "step": 6604 }, { "epoch": 0.8493720262334433, "grad_norm": 0.1640625, "learning_rate": 6.177395753567749e-05, "loss": 0.038, "step": 6605 }, { "epoch": 0.8495006215440011, "grad_norm": 0.177734375, "learning_rate": 6.176413472029766e-05, "loss": 0.0433, "step": 6606 }, { "epoch": 0.8496292168545587, "grad_norm": 0.17578125, "learning_rate": 6.175431142424905e-05, "loss": 0.0357, "step": 6607 }, { "epoch": 0.8497578121651164, "grad_norm": 0.197265625, "learning_rate": 6.174448764793302e-05, "loss": 0.054, "step": 6608 }, { "epoch": 0.849886407475674, "grad_norm": 0.1630859375, "learning_rate": 6.173466339175094e-05, "loss": 0.0335, "step": 6609 }, { "epoch": 0.8500150027862318, "grad_norm": 0.18359375, "learning_rate": 6.172483865610427e-05, "loss": 0.0478, "step": 6610 }, { "epoch": 0.8501435980967894, "grad_norm": 0.1611328125, "learning_rate": 6.171501344139436e-05, "loss": 0.0365, "step": 6611 }, { "epoch": 0.850272193407347, "grad_norm": 0.16796875, "learning_rate": 6.170518774802273e-05, "loss": 0.039, "step": 6612 }, { "epoch": 0.8504007887179047, "grad_norm": 0.1630859375, "learning_rate": 6.169536157639082e-05, "loss": 0.0444, "step": 6613 }, { "epoch": 0.8505293840284625, "grad_norm": 0.181640625, "learning_rate": 6.168553492690007e-05, "loss": 0.0477, "step": 6614 }, { "epoch": 0.8506579793390201, "grad_norm": 0.15234375, "learning_rate": 6.167570779995207e-05, "loss": 0.0325, "step": 6615 }, { "epoch": 0.8507865746495777, "grad_norm": 0.169921875, "learning_rate": 6.166588019594827e-05, "loss": 0.0409, "step": 6616 }, { "epoch": 0.8509151699601355, "grad_norm": 0.16796875, "learning_rate": 6.165605211529027e-05, "loss": 0.0463, "step": 6617 }, { "epoch": 0.8510437652706931, "grad_norm": 0.17578125, "learning_rate": 6.164622355837958e-05, "loss": 0.0439, "step": 6618 }, { "epoch": 0.8511723605812508, "grad_norm": 0.1884765625, "learning_rate": 6.163639452561784e-05, "loss": 0.0568, "step": 6619 }, { "epoch": 0.8513009558918084, "grad_norm": 0.1708984375, "learning_rate": 6.16265650174066e-05, "loss": 0.0337, "step": 6620 }, { "epoch": 0.8514295512023662, "grad_norm": 0.173828125, "learning_rate": 6.161673503414752e-05, "loss": 0.0395, "step": 6621 }, { "epoch": 0.8515581465129238, "grad_norm": 0.1826171875, "learning_rate": 6.160690457624223e-05, "loss": 0.041, "step": 6622 }, { "epoch": 0.8516867418234815, "grad_norm": 0.1796875, "learning_rate": 6.159707364409238e-05, "loss": 0.0434, "step": 6623 }, { "epoch": 0.8518153371340392, "grad_norm": 0.1640625, "learning_rate": 6.158724223809969e-05, "loss": 0.0389, "step": 6624 }, { "epoch": 0.8519439324445969, "grad_norm": 0.2021484375, "learning_rate": 6.157741035866581e-05, "loss": 0.0476, "step": 6625 }, { "epoch": 0.8520725277551545, "grad_norm": 0.1650390625, "learning_rate": 6.156757800619247e-05, "loss": 0.0368, "step": 6626 }, { "epoch": 0.8522011230657122, "grad_norm": 0.1591796875, "learning_rate": 6.155774518108143e-05, "loss": 0.0377, "step": 6627 }, { "epoch": 0.8523297183762699, "grad_norm": 0.1748046875, "learning_rate": 6.154791188373443e-05, "loss": 0.0446, "step": 6628 }, { "epoch": 0.8524583136868276, "grad_norm": 0.185546875, "learning_rate": 6.153807811455326e-05, "loss": 0.045, "step": 6629 }, { "epoch": 0.8525869089973852, "grad_norm": 0.1376953125, "learning_rate": 6.152824387393969e-05, "loss": 0.0255, "step": 6630 }, { "epoch": 0.8527155043079429, "grad_norm": 0.15625, "learning_rate": 6.151840916229557e-05, "loss": 0.0367, "step": 6631 }, { "epoch": 0.8528440996185006, "grad_norm": 0.1796875, "learning_rate": 6.150857398002273e-05, "loss": 0.0497, "step": 6632 }, { "epoch": 0.8529726949290583, "grad_norm": 0.171875, "learning_rate": 6.149873832752298e-05, "loss": 0.0431, "step": 6633 }, { "epoch": 0.8531012902396159, "grad_norm": 0.150390625, "learning_rate": 6.148890220519826e-05, "loss": 0.0339, "step": 6634 }, { "epoch": 0.8532298855501737, "grad_norm": 0.1611328125, "learning_rate": 6.147906561345041e-05, "loss": 0.0338, "step": 6635 }, { "epoch": 0.8533584808607313, "grad_norm": 0.1884765625, "learning_rate": 6.146922855268137e-05, "loss": 0.045, "step": 6636 }, { "epoch": 0.8534870761712889, "grad_norm": 0.16015625, "learning_rate": 6.145939102329307e-05, "loss": 0.038, "step": 6637 }, { "epoch": 0.8536156714818466, "grad_norm": 0.166015625, "learning_rate": 6.144955302568742e-05, "loss": 0.0371, "step": 6638 }, { "epoch": 0.8537442667924043, "grad_norm": 0.1767578125, "learning_rate": 6.143971456026645e-05, "loss": 0.0411, "step": 6639 }, { "epoch": 0.853872862102962, "grad_norm": 0.171875, "learning_rate": 6.14298756274321e-05, "loss": 0.04, "step": 6640 }, { "epoch": 0.8540014574135196, "grad_norm": 0.15625, "learning_rate": 6.14200362275864e-05, "loss": 0.0342, "step": 6641 }, { "epoch": 0.8541300527240774, "grad_norm": 0.171875, "learning_rate": 6.141019636113139e-05, "loss": 0.0445, "step": 6642 }, { "epoch": 0.854258648034635, "grad_norm": 0.158203125, "learning_rate": 6.140035602846909e-05, "loss": 0.0312, "step": 6643 }, { "epoch": 0.8543872433451927, "grad_norm": 0.162109375, "learning_rate": 6.139051523000156e-05, "loss": 0.0417, "step": 6644 }, { "epoch": 0.8545158386557503, "grad_norm": 0.1552734375, "learning_rate": 6.13806739661309e-05, "loss": 0.0414, "step": 6645 }, { "epoch": 0.8546444339663081, "grad_norm": 0.2021484375, "learning_rate": 6.137083223725921e-05, "loss": 0.0461, "step": 6646 }, { "epoch": 0.8547730292768657, "grad_norm": 0.1748046875, "learning_rate": 6.13609900437886e-05, "loss": 0.0461, "step": 6647 }, { "epoch": 0.8549016245874234, "grad_norm": 0.162109375, "learning_rate": 6.135114738612124e-05, "loss": 0.0412, "step": 6648 }, { "epoch": 0.855030219897981, "grad_norm": 0.1806640625, "learning_rate": 6.134130426465927e-05, "loss": 0.0384, "step": 6649 }, { "epoch": 0.8551588152085388, "grad_norm": 0.1982421875, "learning_rate": 6.133146067980484e-05, "loss": 0.0498, "step": 6650 }, { "epoch": 0.8552874105190964, "grad_norm": 0.162109375, "learning_rate": 6.13216166319602e-05, "loss": 0.0369, "step": 6651 }, { "epoch": 0.855416005829654, "grad_norm": 0.19140625, "learning_rate": 6.131177212152753e-05, "loss": 0.0506, "step": 6652 }, { "epoch": 0.8555446011402118, "grad_norm": 0.1611328125, "learning_rate": 6.13019271489091e-05, "loss": 0.0379, "step": 6653 }, { "epoch": 0.8556731964507694, "grad_norm": 0.1875, "learning_rate": 6.12920817145071e-05, "loss": 0.0373, "step": 6654 }, { "epoch": 0.8558017917613271, "grad_norm": 0.16796875, "learning_rate": 6.128223581872388e-05, "loss": 0.0415, "step": 6655 }, { "epoch": 0.8559303870718847, "grad_norm": 0.17578125, "learning_rate": 6.127238946196168e-05, "loss": 0.0385, "step": 6656 }, { "epoch": 0.8560589823824425, "grad_norm": 0.1796875, "learning_rate": 6.126254264462285e-05, "loss": 0.0445, "step": 6657 }, { "epoch": 0.8561875776930001, "grad_norm": 0.1943359375, "learning_rate": 6.125269536710965e-05, "loss": 0.0449, "step": 6658 }, { "epoch": 0.8563161730035578, "grad_norm": 0.173828125, "learning_rate": 6.12428476298245e-05, "loss": 0.0397, "step": 6659 }, { "epoch": 0.8564447683141154, "grad_norm": 0.1416015625, "learning_rate": 6.123299943316975e-05, "loss": 0.0314, "step": 6660 }, { "epoch": 0.8565733636246732, "grad_norm": 0.16796875, "learning_rate": 6.122315077754778e-05, "loss": 0.0462, "step": 6661 }, { "epoch": 0.8567019589352308, "grad_norm": 0.1669921875, "learning_rate": 6.121330166336098e-05, "loss": 0.0368, "step": 6662 }, { "epoch": 0.8568305542457885, "grad_norm": 0.1767578125, "learning_rate": 6.120345209101178e-05, "loss": 0.0368, "step": 6663 }, { "epoch": 0.8569591495563462, "grad_norm": 0.154296875, "learning_rate": 6.119360206090263e-05, "loss": 0.0337, "step": 6664 }, { "epoch": 0.8570877448669039, "grad_norm": 0.166015625, "learning_rate": 6.1183751573436e-05, "loss": 0.0366, "step": 6665 }, { "epoch": 0.8572163401774615, "grad_norm": 0.1806640625, "learning_rate": 6.117390062901434e-05, "loss": 0.0469, "step": 6666 }, { "epoch": 0.8573449354880192, "grad_norm": 0.181640625, "learning_rate": 6.116404922804018e-05, "loss": 0.0474, "step": 6667 }, { "epoch": 0.8574735307985769, "grad_norm": 0.162109375, "learning_rate": 6.1154197370916e-05, "loss": 0.0396, "step": 6668 }, { "epoch": 0.8576021261091346, "grad_norm": 0.16015625, "learning_rate": 6.114434505804438e-05, "loss": 0.038, "step": 6669 }, { "epoch": 0.8577307214196922, "grad_norm": 0.169921875, "learning_rate": 6.113449228982784e-05, "loss": 0.0415, "step": 6670 }, { "epoch": 0.85785931673025, "grad_norm": 0.1708984375, "learning_rate": 6.112463906666895e-05, "loss": 0.0377, "step": 6671 }, { "epoch": 0.8579879120408076, "grad_norm": 0.166015625, "learning_rate": 6.111478538897036e-05, "loss": 0.0396, "step": 6672 }, { "epoch": 0.8581165073513652, "grad_norm": 0.1826171875, "learning_rate": 6.110493125713459e-05, "loss": 0.0424, "step": 6673 }, { "epoch": 0.8582451026619229, "grad_norm": 0.16796875, "learning_rate": 6.109507667156433e-05, "loss": 0.0373, "step": 6674 }, { "epoch": 0.8583736979724806, "grad_norm": 0.1708984375, "learning_rate": 6.10852216326622e-05, "loss": 0.0394, "step": 6675 }, { "epoch": 0.8585022932830383, "grad_norm": 0.15625, "learning_rate": 6.107536614083088e-05, "loss": 0.0356, "step": 6676 }, { "epoch": 0.8586308885935959, "grad_norm": 0.1748046875, "learning_rate": 6.106551019647307e-05, "loss": 0.0356, "step": 6677 }, { "epoch": 0.8587594839041536, "grad_norm": 0.154296875, "learning_rate": 6.105565379999142e-05, "loss": 0.0365, "step": 6678 }, { "epoch": 0.8588880792147113, "grad_norm": 0.1533203125, "learning_rate": 6.10457969517887e-05, "loss": 0.037, "step": 6679 }, { "epoch": 0.859016674525269, "grad_norm": 0.1689453125, "learning_rate": 6.103593965226762e-05, "loss": 0.0401, "step": 6680 }, { "epoch": 0.8591452698358266, "grad_norm": 0.1982421875, "learning_rate": 6.1026081901830954e-05, "loss": 0.0434, "step": 6681 }, { "epoch": 0.8592738651463844, "grad_norm": 0.173828125, "learning_rate": 6.101622370088148e-05, "loss": 0.0379, "step": 6682 }, { "epoch": 0.859402460456942, "grad_norm": 0.1875, "learning_rate": 6.100636504982198e-05, "loss": 0.0438, "step": 6683 }, { "epoch": 0.8595310557674997, "grad_norm": 0.189453125, "learning_rate": 6.0996505949055286e-05, "loss": 0.0461, "step": 6684 }, { "epoch": 0.8596596510780573, "grad_norm": 0.173828125, "learning_rate": 6.098664639898419e-05, "loss": 0.0391, "step": 6685 }, { "epoch": 0.8597882463886151, "grad_norm": 0.1904296875, "learning_rate": 6.09767864000116e-05, "loss": 0.0465, "step": 6686 }, { "epoch": 0.8599168416991727, "grad_norm": 0.158203125, "learning_rate": 6.096692595254032e-05, "loss": 0.0415, "step": 6687 }, { "epoch": 0.8600454370097304, "grad_norm": 0.171875, "learning_rate": 6.095706505697328e-05, "loss": 0.0423, "step": 6688 }, { "epoch": 0.8601740323202881, "grad_norm": 0.16796875, "learning_rate": 6.094720371371338e-05, "loss": 0.0375, "step": 6689 }, { "epoch": 0.8603026276308458, "grad_norm": 0.177734375, "learning_rate": 6.0937341923163526e-05, "loss": 0.0449, "step": 6690 }, { "epoch": 0.8604312229414034, "grad_norm": 0.2109375, "learning_rate": 6.092747968572669e-05, "loss": 0.0503, "step": 6691 }, { "epoch": 0.860559818251961, "grad_norm": 0.1474609375, "learning_rate": 6.091761700180578e-05, "loss": 0.0314, "step": 6692 }, { "epoch": 0.8606884135625188, "grad_norm": 0.20703125, "learning_rate": 6.090775387180383e-05, "loss": 0.0393, "step": 6693 }, { "epoch": 0.8608170088730764, "grad_norm": 0.1572265625, "learning_rate": 6.0897890296123794e-05, "loss": 0.0372, "step": 6694 }, { "epoch": 0.8609456041836341, "grad_norm": 0.1767578125, "learning_rate": 6.088802627516872e-05, "loss": 0.0397, "step": 6695 }, { "epoch": 0.8610741994941917, "grad_norm": 0.18359375, "learning_rate": 6.087816180934162e-05, "loss": 0.0478, "step": 6696 }, { "epoch": 0.8612027948047495, "grad_norm": 0.162109375, "learning_rate": 6.086829689904553e-05, "loss": 0.037, "step": 6697 }, { "epoch": 0.8613313901153071, "grad_norm": 0.1708984375, "learning_rate": 6.0858431544683545e-05, "loss": 0.0412, "step": 6698 }, { "epoch": 0.8614599854258648, "grad_norm": 0.1962890625, "learning_rate": 6.084856574665874e-05, "loss": 0.0559, "step": 6699 }, { "epoch": 0.8615885807364225, "grad_norm": 0.16015625, "learning_rate": 6.083869950537422e-05, "loss": 0.0344, "step": 6700 }, { "epoch": 0.8617171760469802, "grad_norm": 0.1552734375, "learning_rate": 6.082883282123313e-05, "loss": 0.0395, "step": 6701 }, { "epoch": 0.8618457713575378, "grad_norm": 0.1572265625, "learning_rate": 6.081896569463857e-05, "loss": 0.0422, "step": 6702 }, { "epoch": 0.8619743666680955, "grad_norm": 0.1669921875, "learning_rate": 6.080909812599373e-05, "loss": 0.0436, "step": 6703 }, { "epoch": 0.8621029619786532, "grad_norm": 0.1552734375, "learning_rate": 6.0799230115701774e-05, "loss": 0.0367, "step": 6704 }, { "epoch": 0.8622315572892109, "grad_norm": 0.16015625, "learning_rate": 6.0789361664165925e-05, "loss": 0.0333, "step": 6705 }, { "epoch": 0.8623601525997685, "grad_norm": 0.150390625, "learning_rate": 6.0779492771789345e-05, "loss": 0.0398, "step": 6706 }, { "epoch": 0.8624887479103263, "grad_norm": 0.1796875, "learning_rate": 6.076962343897531e-05, "loss": 0.0448, "step": 6707 }, { "epoch": 0.8626173432208839, "grad_norm": 0.1640625, "learning_rate": 6.0759753666127064e-05, "loss": 0.0379, "step": 6708 }, { "epoch": 0.8627459385314415, "grad_norm": 0.1611328125, "learning_rate": 6.074988345364784e-05, "loss": 0.0412, "step": 6709 }, { "epoch": 0.8628745338419992, "grad_norm": 0.158203125, "learning_rate": 6.074001280194096e-05, "loss": 0.0368, "step": 6710 }, { "epoch": 0.863003129152557, "grad_norm": 0.185546875, "learning_rate": 6.0730141711409706e-05, "loss": 0.0403, "step": 6711 }, { "epoch": 0.8631317244631146, "grad_norm": 0.1630859375, "learning_rate": 6.0720270182457416e-05, "loss": 0.0384, "step": 6712 }, { "epoch": 0.8632603197736722, "grad_norm": 0.1669921875, "learning_rate": 6.0710398215487406e-05, "loss": 0.0386, "step": 6713 }, { "epoch": 0.8633889150842299, "grad_norm": 0.158203125, "learning_rate": 6.0700525810903056e-05, "loss": 0.0374, "step": 6714 }, { "epoch": 0.8635175103947876, "grad_norm": 0.154296875, "learning_rate": 6.069065296910774e-05, "loss": 0.0379, "step": 6715 }, { "epoch": 0.8636461057053453, "grad_norm": 0.150390625, "learning_rate": 6.068077969050483e-05, "loss": 0.0296, "step": 6716 }, { "epoch": 0.8637747010159029, "grad_norm": 0.1806640625, "learning_rate": 6.0670905975497775e-05, "loss": 0.0426, "step": 6717 }, { "epoch": 0.8639032963264607, "grad_norm": 0.162109375, "learning_rate": 6.0661031824489944e-05, "loss": 0.0398, "step": 6718 }, { "epoch": 0.8640318916370183, "grad_norm": 0.1845703125, "learning_rate": 6.065115723788485e-05, "loss": 0.052, "step": 6719 }, { "epoch": 0.864160486947576, "grad_norm": 0.177734375, "learning_rate": 6.0641282216085914e-05, "loss": 0.0377, "step": 6720 }, { "epoch": 0.8642890822581336, "grad_norm": 0.1787109375, "learning_rate": 6.063140675949661e-05, "loss": 0.0441, "step": 6721 }, { "epoch": 0.8644176775686914, "grad_norm": 0.205078125, "learning_rate": 6.0621530868520483e-05, "loss": 0.0469, "step": 6722 }, { "epoch": 0.864546272879249, "grad_norm": 0.1826171875, "learning_rate": 6.0611654543561e-05, "loss": 0.0427, "step": 6723 }, { "epoch": 0.8646748681898067, "grad_norm": 0.193359375, "learning_rate": 6.060177778502174e-05, "loss": 0.0476, "step": 6724 }, { "epoch": 0.8648034635003643, "grad_norm": 0.1689453125, "learning_rate": 6.0591900593306225e-05, "loss": 0.0359, "step": 6725 }, { "epoch": 0.864932058810922, "grad_norm": 0.16015625, "learning_rate": 6.0582022968818034e-05, "loss": 0.0327, "step": 6726 }, { "epoch": 0.8650606541214797, "grad_norm": 0.185546875, "learning_rate": 6.057214491196077e-05, "loss": 0.0395, "step": 6727 }, { "epoch": 0.8651892494320373, "grad_norm": 0.1904296875, "learning_rate": 6.0562266423138e-05, "loss": 0.0473, "step": 6728 }, { "epoch": 0.8653178447425951, "grad_norm": 0.1640625, "learning_rate": 6.05523875027534e-05, "loss": 0.034, "step": 6729 }, { "epoch": 0.8654464400531527, "grad_norm": 0.173828125, "learning_rate": 6.0542508151210574e-05, "loss": 0.0427, "step": 6730 }, { "epoch": 0.8655750353637104, "grad_norm": 0.185546875, "learning_rate": 6.05326283689132e-05, "loss": 0.039, "step": 6731 }, { "epoch": 0.865703630674268, "grad_norm": 0.203125, "learning_rate": 6.052274815626494e-05, "loss": 0.0373, "step": 6732 }, { "epoch": 0.8658322259848258, "grad_norm": 0.185546875, "learning_rate": 6.051286751366949e-05, "loss": 0.037, "step": 6733 }, { "epoch": 0.8659608212953834, "grad_norm": 0.18359375, "learning_rate": 6.0502986441530576e-05, "loss": 0.0458, "step": 6734 }, { "epoch": 0.8660894166059411, "grad_norm": 0.19140625, "learning_rate": 6.04931049402519e-05, "loss": 0.0444, "step": 6735 }, { "epoch": 0.8662180119164988, "grad_norm": 0.158203125, "learning_rate": 6.0483223010237234e-05, "loss": 0.0423, "step": 6736 }, { "epoch": 0.8663466072270565, "grad_norm": 0.1650390625, "learning_rate": 6.047334065189033e-05, "loss": 0.0363, "step": 6737 }, { "epoch": 0.8664752025376141, "grad_norm": 0.171875, "learning_rate": 6.046345786561497e-05, "loss": 0.0424, "step": 6738 }, { "epoch": 0.8666037978481718, "grad_norm": 0.1533203125, "learning_rate": 6.045357465181497e-05, "loss": 0.0336, "step": 6739 }, { "epoch": 0.8667323931587295, "grad_norm": 0.171875, "learning_rate": 6.044369101089411e-05, "loss": 0.0381, "step": 6740 }, { "epoch": 0.8668609884692872, "grad_norm": 0.17578125, "learning_rate": 6.043380694325626e-05, "loss": 0.0415, "step": 6741 }, { "epoch": 0.8669895837798448, "grad_norm": 0.2099609375, "learning_rate": 6.042392244930526e-05, "loss": 0.0492, "step": 6742 }, { "epoch": 0.8671181790904025, "grad_norm": 0.1708984375, "learning_rate": 6.041403752944498e-05, "loss": 0.0424, "step": 6743 }, { "epoch": 0.8672467744009602, "grad_norm": 0.1767578125, "learning_rate": 6.0404152184079296e-05, "loss": 0.0433, "step": 6744 }, { "epoch": 0.8673753697115179, "grad_norm": 0.169921875, "learning_rate": 6.039426641361212e-05, "loss": 0.0438, "step": 6745 }, { "epoch": 0.8675039650220755, "grad_norm": 0.154296875, "learning_rate": 6.038438021844737e-05, "loss": 0.0369, "step": 6746 }, { "epoch": 0.8676325603326333, "grad_norm": 0.1767578125, "learning_rate": 6.037449359898898e-05, "loss": 0.0392, "step": 6747 }, { "epoch": 0.8677611556431909, "grad_norm": 0.1748046875, "learning_rate": 6.0364606555640926e-05, "loss": 0.0423, "step": 6748 }, { "epoch": 0.8678897509537485, "grad_norm": 0.1484375, "learning_rate": 6.0354719088807154e-05, "loss": 0.0336, "step": 6749 }, { "epoch": 0.8680183462643062, "grad_norm": 0.18359375, "learning_rate": 6.0344831198891674e-05, "loss": 0.0469, "step": 6750 }, { "epoch": 0.8681469415748639, "grad_norm": 0.1728515625, "learning_rate": 6.0334942886298487e-05, "loss": 0.046, "step": 6751 }, { "epoch": 0.8682755368854216, "grad_norm": 0.142578125, "learning_rate": 6.0325054151431605e-05, "loss": 0.0325, "step": 6752 }, { "epoch": 0.8684041321959792, "grad_norm": 0.2001953125, "learning_rate": 6.031516499469509e-05, "loss": 0.0433, "step": 6753 }, { "epoch": 0.868532727506537, "grad_norm": 0.177734375, "learning_rate": 6.030527541649299e-05, "loss": 0.0377, "step": 6754 }, { "epoch": 0.8686613228170946, "grad_norm": 0.1962890625, "learning_rate": 6.02953854172294e-05, "loss": 0.0412, "step": 6755 }, { "epoch": 0.8687899181276523, "grad_norm": 0.1640625, "learning_rate": 6.02854949973084e-05, "loss": 0.0322, "step": 6756 }, { "epoch": 0.8689185134382099, "grad_norm": 0.162109375, "learning_rate": 6.027560415713408e-05, "loss": 0.035, "step": 6757 }, { "epoch": 0.8690471087487677, "grad_norm": 0.205078125, "learning_rate": 6.0265712897110607e-05, "loss": 0.0529, "step": 6758 }, { "epoch": 0.8691757040593253, "grad_norm": 0.22265625, "learning_rate": 6.02558212176421e-05, "loss": 0.0453, "step": 6759 }, { "epoch": 0.869304299369883, "grad_norm": 0.1689453125, "learning_rate": 6.0245929119132736e-05, "loss": 0.0381, "step": 6760 }, { "epoch": 0.8694328946804406, "grad_norm": 0.1748046875, "learning_rate": 6.023603660198668e-05, "loss": 0.0398, "step": 6761 }, { "epoch": 0.8695614899909984, "grad_norm": 0.1630859375, "learning_rate": 6.022614366660814e-05, "loss": 0.0385, "step": 6762 }, { "epoch": 0.869690085301556, "grad_norm": 0.1923828125, "learning_rate": 6.021625031340132e-05, "loss": 0.04, "step": 6763 }, { "epoch": 0.8698186806121136, "grad_norm": 0.1796875, "learning_rate": 6.020635654277046e-05, "loss": 0.0422, "step": 6764 }, { "epoch": 0.8699472759226714, "grad_norm": 0.173828125, "learning_rate": 6.019646235511981e-05, "loss": 0.0425, "step": 6765 }, { "epoch": 0.870075871233229, "grad_norm": 0.2001953125, "learning_rate": 6.018656775085362e-05, "loss": 0.043, "step": 6766 }, { "epoch": 0.8702044665437867, "grad_norm": 0.2080078125, "learning_rate": 6.01766727303762e-05, "loss": 0.0419, "step": 6767 }, { "epoch": 0.8703330618543443, "grad_norm": 0.1767578125, "learning_rate": 6.016677729409182e-05, "loss": 0.0387, "step": 6768 }, { "epoch": 0.8704616571649021, "grad_norm": 0.1962890625, "learning_rate": 6.01568814424048e-05, "loss": 0.053, "step": 6769 }, { "epoch": 0.8705902524754597, "grad_norm": 0.177734375, "learning_rate": 6.0146985175719496e-05, "loss": 0.0372, "step": 6770 }, { "epoch": 0.8707188477860174, "grad_norm": 0.17578125, "learning_rate": 6.013708849444023e-05, "loss": 0.0408, "step": 6771 }, { "epoch": 0.870847443096575, "grad_norm": 0.173828125, "learning_rate": 6.012719139897138e-05, "loss": 0.0394, "step": 6772 }, { "epoch": 0.8709760384071328, "grad_norm": 0.1611328125, "learning_rate": 6.011729388971734e-05, "loss": 0.0369, "step": 6773 }, { "epoch": 0.8711046337176904, "grad_norm": 0.1728515625, "learning_rate": 6.01073959670825e-05, "loss": 0.043, "step": 6774 }, { "epoch": 0.8712332290282481, "grad_norm": 0.189453125, "learning_rate": 6.0097497631471265e-05, "loss": 0.0424, "step": 6775 }, { "epoch": 0.8713618243388058, "grad_norm": 0.1865234375, "learning_rate": 6.008759888328809e-05, "loss": 0.043, "step": 6776 }, { "epoch": 0.8714904196493635, "grad_norm": 0.1728515625, "learning_rate": 6.007769972293742e-05, "loss": 0.0353, "step": 6777 }, { "epoch": 0.8716190149599211, "grad_norm": 0.208984375, "learning_rate": 6.006780015082372e-05, "loss": 0.0445, "step": 6778 }, { "epoch": 0.8717476102704788, "grad_norm": 0.177734375, "learning_rate": 6.00579001673515e-05, "loss": 0.0366, "step": 6779 }, { "epoch": 0.8718762055810365, "grad_norm": 0.17578125, "learning_rate": 6.004799977292521e-05, "loss": 0.0419, "step": 6780 }, { "epoch": 0.8720048008915942, "grad_norm": 0.1650390625, "learning_rate": 6.0038098967949406e-05, "loss": 0.0419, "step": 6781 }, { "epoch": 0.8721333962021518, "grad_norm": 0.181640625, "learning_rate": 6.002819775282862e-05, "loss": 0.0462, "step": 6782 }, { "epoch": 0.8722619915127096, "grad_norm": 0.169921875, "learning_rate": 6.001829612796739e-05, "loss": 0.0365, "step": 6783 }, { "epoch": 0.8723905868232672, "grad_norm": 0.173828125, "learning_rate": 6.0008394093770305e-05, "loss": 0.0453, "step": 6784 }, { "epoch": 0.8725191821338248, "grad_norm": 0.1708984375, "learning_rate": 5.999849165064193e-05, "loss": 0.0419, "step": 6785 }, { "epoch": 0.8726477774443825, "grad_norm": 0.18359375, "learning_rate": 5.998858879898689e-05, "loss": 0.0466, "step": 6786 }, { "epoch": 0.8727763727549402, "grad_norm": 0.1611328125, "learning_rate": 5.9978685539209764e-05, "loss": 0.0364, "step": 6787 }, { "epoch": 0.8729049680654979, "grad_norm": 0.1904296875, "learning_rate": 5.996878187171523e-05, "loss": 0.0469, "step": 6788 }, { "epoch": 0.8730335633760555, "grad_norm": 0.162109375, "learning_rate": 5.995887779690793e-05, "loss": 0.042, "step": 6789 }, { "epoch": 0.8731621586866132, "grad_norm": 0.203125, "learning_rate": 5.9948973315192524e-05, "loss": 0.0495, "step": 6790 }, { "epoch": 0.8732907539971709, "grad_norm": 0.18359375, "learning_rate": 5.993906842697371e-05, "loss": 0.0374, "step": 6791 }, { "epoch": 0.8734193493077286, "grad_norm": 0.1884765625, "learning_rate": 5.992916313265616e-05, "loss": 0.0466, "step": 6792 }, { "epoch": 0.8735479446182862, "grad_norm": 0.1630859375, "learning_rate": 5.9919257432644626e-05, "loss": 0.0377, "step": 6793 }, { "epoch": 0.873676539928844, "grad_norm": 0.1552734375, "learning_rate": 5.990935132734383e-05, "loss": 0.0329, "step": 6794 }, { "epoch": 0.8738051352394016, "grad_norm": 0.17578125, "learning_rate": 5.989944481715852e-05, "loss": 0.0444, "step": 6795 }, { "epoch": 0.8739337305499593, "grad_norm": 0.1767578125, "learning_rate": 5.9889537902493484e-05, "loss": 0.039, "step": 6796 }, { "epoch": 0.8740623258605169, "grad_norm": 0.203125, "learning_rate": 5.9879630583753475e-05, "loss": 0.0496, "step": 6797 }, { "epoch": 0.8741909211710747, "grad_norm": 0.177734375, "learning_rate": 5.9869722861343326e-05, "loss": 0.0439, "step": 6798 }, { "epoch": 0.8743195164816323, "grad_norm": 0.1640625, "learning_rate": 5.985981473566783e-05, "loss": 0.0367, "step": 6799 }, { "epoch": 0.87444811179219, "grad_norm": 0.1669921875, "learning_rate": 5.984990620713183e-05, "loss": 0.0354, "step": 6800 }, { "epoch": 0.8745767071027477, "grad_norm": 0.177734375, "learning_rate": 5.983999727614019e-05, "loss": 0.0361, "step": 6801 }, { "epoch": 0.8747053024133054, "grad_norm": 0.1943359375, "learning_rate": 5.983008794309776e-05, "loss": 0.0479, "step": 6802 }, { "epoch": 0.874833897723863, "grad_norm": 0.1552734375, "learning_rate": 5.982017820840944e-05, "loss": 0.037, "step": 6803 }, { "epoch": 0.8749624930344206, "grad_norm": 0.1953125, "learning_rate": 5.981026807248011e-05, "loss": 0.0411, "step": 6804 }, { "epoch": 0.8750910883449784, "grad_norm": 0.2060546875, "learning_rate": 5.98003575357147e-05, "loss": 0.0381, "step": 6805 }, { "epoch": 0.875219683655536, "grad_norm": 0.193359375, "learning_rate": 5.979044659851814e-05, "loss": 0.036, "step": 6806 }, { "epoch": 0.8753482789660937, "grad_norm": 0.1728515625, "learning_rate": 5.978053526129537e-05, "loss": 0.0415, "step": 6807 }, { "epoch": 0.8754768742766513, "grad_norm": 0.1669921875, "learning_rate": 5.9770623524451376e-05, "loss": 0.0385, "step": 6808 }, { "epoch": 0.8756054695872091, "grad_norm": 0.1767578125, "learning_rate": 5.9760711388391124e-05, "loss": 0.042, "step": 6809 }, { "epoch": 0.8757340648977667, "grad_norm": 0.1904296875, "learning_rate": 5.975079885351962e-05, "loss": 0.0485, "step": 6810 }, { "epoch": 0.8758626602083244, "grad_norm": 0.1787109375, "learning_rate": 5.974088592024187e-05, "loss": 0.042, "step": 6811 }, { "epoch": 0.8759912555188821, "grad_norm": 0.181640625, "learning_rate": 5.973097258896292e-05, "loss": 0.039, "step": 6812 }, { "epoch": 0.8761198508294398, "grad_norm": 0.1689453125, "learning_rate": 5.97210588600878e-05, "loss": 0.04, "step": 6813 }, { "epoch": 0.8762484461399974, "grad_norm": 0.1826171875, "learning_rate": 5.971114473402157e-05, "loss": 0.0388, "step": 6814 }, { "epoch": 0.8763770414505551, "grad_norm": 0.177734375, "learning_rate": 5.970123021116936e-05, "loss": 0.0455, "step": 6815 }, { "epoch": 0.8765056367611128, "grad_norm": 0.16015625, "learning_rate": 5.969131529193619e-05, "loss": 0.0321, "step": 6816 }, { "epoch": 0.8766342320716705, "grad_norm": 0.1669921875, "learning_rate": 5.968139997672723e-05, "loss": 0.041, "step": 6817 }, { "epoch": 0.8767628273822281, "grad_norm": 0.1865234375, "learning_rate": 5.9671484265947565e-05, "loss": 0.0492, "step": 6818 }, { "epoch": 0.8768914226927857, "grad_norm": 0.150390625, "learning_rate": 5.9661568160002366e-05, "loss": 0.0368, "step": 6819 }, { "epoch": 0.8770200180033435, "grad_norm": 0.1396484375, "learning_rate": 5.9651651659296805e-05, "loss": 0.0276, "step": 6820 }, { "epoch": 0.8771486133139011, "grad_norm": 0.18359375, "learning_rate": 5.964173476423602e-05, "loss": 0.0499, "step": 6821 }, { "epoch": 0.8772772086244588, "grad_norm": 0.177734375, "learning_rate": 5.963181747522525e-05, "loss": 0.0392, "step": 6822 }, { "epoch": 0.8774058039350165, "grad_norm": 0.197265625, "learning_rate": 5.962189979266966e-05, "loss": 0.0526, "step": 6823 }, { "epoch": 0.8775343992455742, "grad_norm": 0.1630859375, "learning_rate": 5.961198171697451e-05, "loss": 0.0368, "step": 6824 }, { "epoch": 0.8776629945561318, "grad_norm": 0.18359375, "learning_rate": 5.9602063248545e-05, "loss": 0.0451, "step": 6825 }, { "epoch": 0.8777915898666895, "grad_norm": 0.1650390625, "learning_rate": 5.9592144387786444e-05, "loss": 0.0383, "step": 6826 }, { "epoch": 0.8779201851772472, "grad_norm": 0.1630859375, "learning_rate": 5.9582225135104075e-05, "loss": 0.042, "step": 6827 }, { "epoch": 0.8780487804878049, "grad_norm": 0.171875, "learning_rate": 5.957230549090318e-05, "loss": 0.0376, "step": 6828 }, { "epoch": 0.8781773757983625, "grad_norm": 0.1787109375, "learning_rate": 5.9562385455589074e-05, "loss": 0.0429, "step": 6829 }, { "epoch": 0.8783059711089203, "grad_norm": 0.17578125, "learning_rate": 5.955246502956708e-05, "loss": 0.0446, "step": 6830 }, { "epoch": 0.8784345664194779, "grad_norm": 0.177734375, "learning_rate": 5.954254421324253e-05, "loss": 0.0366, "step": 6831 }, { "epoch": 0.8785631617300356, "grad_norm": 0.1708984375, "learning_rate": 5.95326230070208e-05, "loss": 0.0373, "step": 6832 }, { "epoch": 0.8786917570405932, "grad_norm": 0.1552734375, "learning_rate": 5.952270141130723e-05, "loss": 0.0368, "step": 6833 }, { "epoch": 0.878820352351151, "grad_norm": 0.1826171875, "learning_rate": 5.951277942650722e-05, "loss": 0.0475, "step": 6834 }, { "epoch": 0.8789489476617086, "grad_norm": 0.1728515625, "learning_rate": 5.950285705302615e-05, "loss": 0.0425, "step": 6835 }, { "epoch": 0.8790775429722663, "grad_norm": 0.15625, "learning_rate": 5.9492934291269464e-05, "loss": 0.0332, "step": 6836 }, { "epoch": 0.8792061382828239, "grad_norm": 0.1728515625, "learning_rate": 5.9483011141642574e-05, "loss": 0.0317, "step": 6837 }, { "epoch": 0.8793347335933817, "grad_norm": 0.171875, "learning_rate": 5.947308760455095e-05, "loss": 0.0421, "step": 6838 }, { "epoch": 0.8794633289039393, "grad_norm": 0.169921875, "learning_rate": 5.9463163680400034e-05, "loss": 0.0415, "step": 6839 }, { "epoch": 0.879591924214497, "grad_norm": 0.16015625, "learning_rate": 5.945323936959531e-05, "loss": 0.0393, "step": 6840 }, { "epoch": 0.8797205195250547, "grad_norm": 0.1708984375, "learning_rate": 5.944331467254228e-05, "loss": 0.0375, "step": 6841 }, { "epoch": 0.8798491148356123, "grad_norm": 0.1787109375, "learning_rate": 5.943338958964646e-05, "loss": 0.0421, "step": 6842 }, { "epoch": 0.87997771014617, "grad_norm": 0.166015625, "learning_rate": 5.942346412131335e-05, "loss": 0.0373, "step": 6843 }, { "epoch": 0.8801063054567276, "grad_norm": 0.1591796875, "learning_rate": 5.9413538267948546e-05, "loss": 0.0357, "step": 6844 }, { "epoch": 0.8802349007672854, "grad_norm": 0.181640625, "learning_rate": 5.940361202995756e-05, "loss": 0.046, "step": 6845 }, { "epoch": 0.880363496077843, "grad_norm": 0.1640625, "learning_rate": 5.939368540774598e-05, "loss": 0.0401, "step": 6846 }, { "epoch": 0.8804920913884007, "grad_norm": 0.2060546875, "learning_rate": 5.938375840171939e-05, "loss": 0.0469, "step": 6847 }, { "epoch": 0.8806206866989584, "grad_norm": 0.1796875, "learning_rate": 5.937383101228342e-05, "loss": 0.0487, "step": 6848 }, { "epoch": 0.8807492820095161, "grad_norm": 0.158203125, "learning_rate": 5.9363903239843665e-05, "loss": 0.0322, "step": 6849 }, { "epoch": 0.8808778773200737, "grad_norm": 0.169921875, "learning_rate": 5.935397508480578e-05, "loss": 0.0394, "step": 6850 }, { "epoch": 0.8810064726306314, "grad_norm": 0.171875, "learning_rate": 5.9344046547575405e-05, "loss": 0.0425, "step": 6851 }, { "epoch": 0.8811350679411891, "grad_norm": 0.173828125, "learning_rate": 5.9334117628558205e-05, "loss": 0.0369, "step": 6852 }, { "epoch": 0.8812636632517468, "grad_norm": 0.162109375, "learning_rate": 5.9324188328159894e-05, "loss": 0.0369, "step": 6853 }, { "epoch": 0.8813922585623044, "grad_norm": 0.169921875, "learning_rate": 5.9314258646786126e-05, "loss": 0.0409, "step": 6854 }, { "epoch": 0.881520853872862, "grad_norm": 0.185546875, "learning_rate": 5.9304328584842664e-05, "loss": 0.0407, "step": 6855 }, { "epoch": 0.8816494491834198, "grad_norm": 0.17578125, "learning_rate": 5.929439814273518e-05, "loss": 0.0395, "step": 6856 }, { "epoch": 0.8817780444939775, "grad_norm": 0.181640625, "learning_rate": 5.928446732086947e-05, "loss": 0.0455, "step": 6857 }, { "epoch": 0.8819066398045351, "grad_norm": 0.19140625, "learning_rate": 5.92745361196513e-05, "loss": 0.0537, "step": 6858 }, { "epoch": 0.8820352351150929, "grad_norm": 0.1796875, "learning_rate": 5.926460453948641e-05, "loss": 0.0435, "step": 6859 }, { "epoch": 0.8821638304256505, "grad_norm": 0.1640625, "learning_rate": 5.9254672580780626e-05, "loss": 0.0446, "step": 6860 }, { "epoch": 0.8822924257362081, "grad_norm": 0.302734375, "learning_rate": 5.9244740243939724e-05, "loss": 0.0589, "step": 6861 }, { "epoch": 0.8824210210467658, "grad_norm": 0.18359375, "learning_rate": 5.9234807529369554e-05, "loss": 0.037, "step": 6862 }, { "epoch": 0.8825496163573235, "grad_norm": 0.158203125, "learning_rate": 5.9224874437475954e-05, "loss": 0.0332, "step": 6863 }, { "epoch": 0.8826782116678812, "grad_norm": 0.1923828125, "learning_rate": 5.921494096866475e-05, "loss": 0.0453, "step": 6864 }, { "epoch": 0.8828068069784388, "grad_norm": 0.177734375, "learning_rate": 5.920500712334185e-05, "loss": 0.0414, "step": 6865 }, { "epoch": 0.8829354022889965, "grad_norm": 0.158203125, "learning_rate": 5.919507290191311e-05, "loss": 0.0403, "step": 6866 }, { "epoch": 0.8830639975995542, "grad_norm": 0.1533203125, "learning_rate": 5.918513830478445e-05, "loss": 0.0378, "step": 6867 }, { "epoch": 0.8831925929101119, "grad_norm": 0.166015625, "learning_rate": 5.917520333236176e-05, "loss": 0.0449, "step": 6868 }, { "epoch": 0.8833211882206695, "grad_norm": 0.1572265625, "learning_rate": 5.916526798505101e-05, "loss": 0.034, "step": 6869 }, { "epoch": 0.8834497835312273, "grad_norm": 0.1611328125, "learning_rate": 5.9155332263258125e-05, "loss": 0.0372, "step": 6870 }, { "epoch": 0.8835783788417849, "grad_norm": 0.19140625, "learning_rate": 5.914539616738905e-05, "loss": 0.0412, "step": 6871 }, { "epoch": 0.8837069741523426, "grad_norm": 0.16796875, "learning_rate": 5.91354596978498e-05, "loss": 0.0405, "step": 6872 }, { "epoch": 0.8838355694629002, "grad_norm": 0.169921875, "learning_rate": 5.912552285504634e-05, "loss": 0.0397, "step": 6873 }, { "epoch": 0.883964164773458, "grad_norm": 0.1611328125, "learning_rate": 5.9115585639384697e-05, "loss": 0.0393, "step": 6874 }, { "epoch": 0.8840927600840156, "grad_norm": 0.1689453125, "learning_rate": 5.910564805127088e-05, "loss": 0.0402, "step": 6875 }, { "epoch": 0.8842213553945732, "grad_norm": 0.17578125, "learning_rate": 5.9095710091110914e-05, "loss": 0.0407, "step": 6876 }, { "epoch": 0.884349950705131, "grad_norm": 0.1865234375, "learning_rate": 5.908577175931089e-05, "loss": 0.0469, "step": 6877 }, { "epoch": 0.8844785460156886, "grad_norm": 0.1640625, "learning_rate": 5.907583305627684e-05, "loss": 0.0401, "step": 6878 }, { "epoch": 0.8846071413262463, "grad_norm": 0.1552734375, "learning_rate": 5.906589398241487e-05, "loss": 0.0388, "step": 6879 }, { "epoch": 0.8847357366368039, "grad_norm": 0.208984375, "learning_rate": 5.905595453813106e-05, "loss": 0.0516, "step": 6880 }, { "epoch": 0.8848643319473617, "grad_norm": 0.16796875, "learning_rate": 5.9046014723831546e-05, "loss": 0.047, "step": 6881 }, { "epoch": 0.8849929272579193, "grad_norm": 0.1689453125, "learning_rate": 5.903607453992245e-05, "loss": 0.0414, "step": 6882 }, { "epoch": 0.885121522568477, "grad_norm": 0.1650390625, "learning_rate": 5.9026133986809895e-05, "loss": 0.0372, "step": 6883 }, { "epoch": 0.8852501178790346, "grad_norm": 0.189453125, "learning_rate": 5.9016193064900085e-05, "loss": 0.0428, "step": 6884 }, { "epoch": 0.8853787131895924, "grad_norm": 0.1650390625, "learning_rate": 5.900625177459915e-05, "loss": 0.0434, "step": 6885 }, { "epoch": 0.88550730850015, "grad_norm": 0.154296875, "learning_rate": 5.8996310116313314e-05, "loss": 0.0324, "step": 6886 }, { "epoch": 0.8856359038107077, "grad_norm": 0.142578125, "learning_rate": 5.898636809044876e-05, "loss": 0.0326, "step": 6887 }, { "epoch": 0.8857644991212654, "grad_norm": 0.1689453125, "learning_rate": 5.897642569741171e-05, "loss": 0.0372, "step": 6888 }, { "epoch": 0.8858930944318231, "grad_norm": 0.1640625, "learning_rate": 5.8966482937608406e-05, "loss": 0.044, "step": 6889 }, { "epoch": 0.8860216897423807, "grad_norm": 0.1572265625, "learning_rate": 5.895653981144508e-05, "loss": 0.0352, "step": 6890 }, { "epoch": 0.8861502850529384, "grad_norm": 0.1767578125, "learning_rate": 5.894659631932802e-05, "loss": 0.0438, "step": 6891 }, { "epoch": 0.8862788803634961, "grad_norm": 0.1767578125, "learning_rate": 5.893665246166348e-05, "loss": 0.0396, "step": 6892 }, { "epoch": 0.8864074756740538, "grad_norm": 0.16015625, "learning_rate": 5.892670823885778e-05, "loss": 0.035, "step": 6893 }, { "epoch": 0.8865360709846114, "grad_norm": 0.1982421875, "learning_rate": 5.891676365131722e-05, "loss": 0.0471, "step": 6894 }, { "epoch": 0.8866646662951692, "grad_norm": 0.1455078125, "learning_rate": 5.890681869944812e-05, "loss": 0.0293, "step": 6895 }, { "epoch": 0.8867932616057268, "grad_norm": 0.1689453125, "learning_rate": 5.8896873383656826e-05, "loss": 0.0339, "step": 6896 }, { "epoch": 0.8869218569162844, "grad_norm": 0.1982421875, "learning_rate": 5.888692770434968e-05, "loss": 0.0438, "step": 6897 }, { "epoch": 0.8870504522268421, "grad_norm": 0.1572265625, "learning_rate": 5.8876981661933074e-05, "loss": 0.0378, "step": 6898 }, { "epoch": 0.8871790475373998, "grad_norm": 0.2353515625, "learning_rate": 5.8867035256813376e-05, "loss": 0.0471, "step": 6899 }, { "epoch": 0.8873076428479575, "grad_norm": 0.177734375, "learning_rate": 5.885708848939697e-05, "loss": 0.045, "step": 6900 }, { "epoch": 0.8874362381585151, "grad_norm": 0.1806640625, "learning_rate": 5.8847141360090294e-05, "loss": 0.0422, "step": 6901 }, { "epoch": 0.8875648334690728, "grad_norm": 0.1875, "learning_rate": 5.883719386929976e-05, "loss": 0.0445, "step": 6902 }, { "epoch": 0.8876934287796305, "grad_norm": 0.19921875, "learning_rate": 5.882724601743183e-05, "loss": 0.0499, "step": 6903 }, { "epoch": 0.8878220240901882, "grad_norm": 0.1787109375, "learning_rate": 5.8817297804892925e-05, "loss": 0.0423, "step": 6904 }, { "epoch": 0.8879506194007458, "grad_norm": 0.1904296875, "learning_rate": 5.880734923208956e-05, "loss": 0.0408, "step": 6905 }, { "epoch": 0.8880792147113036, "grad_norm": 0.1552734375, "learning_rate": 5.8797400299428194e-05, "loss": 0.0396, "step": 6906 }, { "epoch": 0.8882078100218612, "grad_norm": 0.158203125, "learning_rate": 5.878745100731533e-05, "loss": 0.0307, "step": 6907 }, { "epoch": 0.8883364053324189, "grad_norm": 0.1708984375, "learning_rate": 5.8777501356157514e-05, "loss": 0.0404, "step": 6908 }, { "epoch": 0.8884650006429765, "grad_norm": 0.1572265625, "learning_rate": 5.876755134636125e-05, "loss": 0.036, "step": 6909 }, { "epoch": 0.8885935959535343, "grad_norm": 0.1630859375, "learning_rate": 5.875760097833309e-05, "loss": 0.0365, "step": 6910 }, { "epoch": 0.8887221912640919, "grad_norm": 0.14453125, "learning_rate": 5.874765025247959e-05, "loss": 0.0294, "step": 6911 }, { "epoch": 0.8888507865746496, "grad_norm": 0.1796875, "learning_rate": 5.8737699169207315e-05, "loss": 0.0462, "step": 6912 }, { "epoch": 0.8889793818852072, "grad_norm": 0.1640625, "learning_rate": 5.872774772892289e-05, "loss": 0.0362, "step": 6913 }, { "epoch": 0.889107977195765, "grad_norm": 0.19921875, "learning_rate": 5.871779593203288e-05, "loss": 0.0474, "step": 6914 }, { "epoch": 0.8892365725063226, "grad_norm": 0.16015625, "learning_rate": 5.870784377894394e-05, "loss": 0.0395, "step": 6915 }, { "epoch": 0.8893651678168802, "grad_norm": 0.1708984375, "learning_rate": 5.869789127006267e-05, "loss": 0.0443, "step": 6916 }, { "epoch": 0.889493763127438, "grad_norm": 0.1875, "learning_rate": 5.868793840579574e-05, "loss": 0.046, "step": 6917 }, { "epoch": 0.8896223584379956, "grad_norm": 0.154296875, "learning_rate": 5.8677985186549786e-05, "loss": 0.0356, "step": 6918 }, { "epoch": 0.8897509537485533, "grad_norm": 0.1591796875, "learning_rate": 5.866803161273151e-05, "loss": 0.0355, "step": 6919 }, { "epoch": 0.8898795490591109, "grad_norm": 0.1767578125, "learning_rate": 5.865807768474761e-05, "loss": 0.0422, "step": 6920 }, { "epoch": 0.8900081443696687, "grad_norm": 0.177734375, "learning_rate": 5.8648123403004786e-05, "loss": 0.0362, "step": 6921 }, { "epoch": 0.8901367396802263, "grad_norm": 0.1533203125, "learning_rate": 5.863816876790975e-05, "loss": 0.0347, "step": 6922 }, { "epoch": 0.890265334990784, "grad_norm": 0.1787109375, "learning_rate": 5.862821377986922e-05, "loss": 0.0473, "step": 6923 }, { "epoch": 0.8903939303013417, "grad_norm": 0.1552734375, "learning_rate": 5.861825843928996e-05, "loss": 0.0335, "step": 6924 }, { "epoch": 0.8905225256118994, "grad_norm": 0.162109375, "learning_rate": 5.860830274657876e-05, "loss": 0.0379, "step": 6925 }, { "epoch": 0.890651120922457, "grad_norm": 0.181640625, "learning_rate": 5.859834670214236e-05, "loss": 0.0464, "step": 6926 }, { "epoch": 0.8907797162330147, "grad_norm": 0.173828125, "learning_rate": 5.8588390306387574e-05, "loss": 0.0435, "step": 6927 }, { "epoch": 0.8909083115435724, "grad_norm": 0.1533203125, "learning_rate": 5.857843355972119e-05, "loss": 0.0368, "step": 6928 }, { "epoch": 0.8910369068541301, "grad_norm": 0.1767578125, "learning_rate": 5.856847646255006e-05, "loss": 0.0419, "step": 6929 }, { "epoch": 0.8911655021646877, "grad_norm": 0.1572265625, "learning_rate": 5.855851901528098e-05, "loss": 0.0334, "step": 6930 }, { "epoch": 0.8912940974752454, "grad_norm": 0.189453125, "learning_rate": 5.854856121832082e-05, "loss": 0.0466, "step": 6931 }, { "epoch": 0.8914226927858031, "grad_norm": 0.1962890625, "learning_rate": 5.8538603072076456e-05, "loss": 0.0548, "step": 6932 }, { "epoch": 0.8915512880963608, "grad_norm": 0.1669921875, "learning_rate": 5.852864457695476e-05, "loss": 0.0363, "step": 6933 }, { "epoch": 0.8916798834069184, "grad_norm": 0.15625, "learning_rate": 5.851868573336261e-05, "loss": 0.0397, "step": 6934 }, { "epoch": 0.8918084787174761, "grad_norm": 0.20703125, "learning_rate": 5.850872654170691e-05, "loss": 0.0511, "step": 6935 }, { "epoch": 0.8919370740280338, "grad_norm": 0.1513671875, "learning_rate": 5.84987670023946e-05, "loss": 0.0343, "step": 6936 }, { "epoch": 0.8920656693385914, "grad_norm": 0.1767578125, "learning_rate": 5.8488807115832624e-05, "loss": 0.0452, "step": 6937 }, { "epoch": 0.8921942646491491, "grad_norm": 0.1640625, "learning_rate": 5.8478846882427885e-05, "loss": 0.0378, "step": 6938 }, { "epoch": 0.8923228599597068, "grad_norm": 0.1728515625, "learning_rate": 5.84688863025874e-05, "loss": 0.0397, "step": 6939 }, { "epoch": 0.8924514552702645, "grad_norm": 0.17578125, "learning_rate": 5.845892537671811e-05, "loss": 0.0442, "step": 6940 }, { "epoch": 0.8925800505808221, "grad_norm": 0.15625, "learning_rate": 5.844896410522702e-05, "loss": 0.0305, "step": 6941 }, { "epoch": 0.8927086458913799, "grad_norm": 0.1630859375, "learning_rate": 5.843900248852114e-05, "loss": 0.0339, "step": 6942 }, { "epoch": 0.8928372412019375, "grad_norm": 0.16796875, "learning_rate": 5.842904052700748e-05, "loss": 0.0416, "step": 6943 }, { "epoch": 0.8929658365124952, "grad_norm": 0.1455078125, "learning_rate": 5.841907822109308e-05, "loss": 0.0346, "step": 6944 }, { "epoch": 0.8930944318230528, "grad_norm": 0.1591796875, "learning_rate": 5.8409115571185e-05, "loss": 0.0377, "step": 6945 }, { "epoch": 0.8932230271336106, "grad_norm": 0.18359375, "learning_rate": 5.8399152577690284e-05, "loss": 0.0438, "step": 6946 }, { "epoch": 0.8933516224441682, "grad_norm": 0.166015625, "learning_rate": 5.8389189241016005e-05, "loss": 0.0401, "step": 6947 }, { "epoch": 0.8934802177547259, "grad_norm": 0.162109375, "learning_rate": 5.837922556156927e-05, "loss": 0.0341, "step": 6948 }, { "epoch": 0.8936088130652835, "grad_norm": 0.177734375, "learning_rate": 5.836926153975718e-05, "loss": 0.044, "step": 6949 }, { "epoch": 0.8937374083758413, "grad_norm": 0.171875, "learning_rate": 5.835929717598684e-05, "loss": 0.0432, "step": 6950 }, { "epoch": 0.8938660036863989, "grad_norm": 0.166015625, "learning_rate": 5.83493324706654e-05, "loss": 0.0399, "step": 6951 }, { "epoch": 0.8939945989969565, "grad_norm": 0.1455078125, "learning_rate": 5.8339367424199984e-05, "loss": 0.0346, "step": 6952 }, { "epoch": 0.8941231943075143, "grad_norm": 0.18359375, "learning_rate": 5.832940203699777e-05, "loss": 0.0457, "step": 6953 }, { "epoch": 0.894251789618072, "grad_norm": 0.1708984375, "learning_rate": 5.831943630946593e-05, "loss": 0.0446, "step": 6954 }, { "epoch": 0.8943803849286296, "grad_norm": 0.1611328125, "learning_rate": 5.830947024201166e-05, "loss": 0.0343, "step": 6955 }, { "epoch": 0.8945089802391872, "grad_norm": 0.17578125, "learning_rate": 5.829950383504213e-05, "loss": 0.0487, "step": 6956 }, { "epoch": 0.894637575549745, "grad_norm": 0.1669921875, "learning_rate": 5.8289537088964595e-05, "loss": 0.0366, "step": 6957 }, { "epoch": 0.8947661708603026, "grad_norm": 0.15625, "learning_rate": 5.827957000418627e-05, "loss": 0.0324, "step": 6958 }, { "epoch": 0.8948947661708603, "grad_norm": 0.16796875, "learning_rate": 5.826960258111438e-05, "loss": 0.0405, "step": 6959 }, { "epoch": 0.8950233614814179, "grad_norm": 0.1689453125, "learning_rate": 5.8259634820156206e-05, "loss": 0.0359, "step": 6960 }, { "epoch": 0.8951519567919757, "grad_norm": 0.17578125, "learning_rate": 5.824966672171901e-05, "loss": 0.0395, "step": 6961 }, { "epoch": 0.8952805521025333, "grad_norm": 0.1748046875, "learning_rate": 5.8239698286210067e-05, "loss": 0.0405, "step": 6962 }, { "epoch": 0.895409147413091, "grad_norm": 0.169921875, "learning_rate": 5.8229729514036705e-05, "loss": 0.0422, "step": 6963 }, { "epoch": 0.8955377427236487, "grad_norm": 0.16796875, "learning_rate": 5.821976040560619e-05, "loss": 0.0363, "step": 6964 }, { "epoch": 0.8956663380342064, "grad_norm": 0.185546875, "learning_rate": 5.820979096132591e-05, "loss": 0.0432, "step": 6965 }, { "epoch": 0.895794933344764, "grad_norm": 0.169921875, "learning_rate": 5.819982118160314e-05, "loss": 0.0392, "step": 6966 }, { "epoch": 0.8959235286553217, "grad_norm": 0.181640625, "learning_rate": 5.818985106684529e-05, "loss": 0.0466, "step": 6967 }, { "epoch": 0.8960521239658794, "grad_norm": 0.1767578125, "learning_rate": 5.8179880617459683e-05, "loss": 0.0454, "step": 6968 }, { "epoch": 0.896180719276437, "grad_norm": 0.19140625, "learning_rate": 5.816990983385373e-05, "loss": 0.0468, "step": 6969 }, { "epoch": 0.8963093145869947, "grad_norm": 0.2255859375, "learning_rate": 5.8159938716434815e-05, "loss": 0.0356, "step": 6970 }, { "epoch": 0.8964379098975525, "grad_norm": 0.1953125, "learning_rate": 5.814996726561033e-05, "loss": 0.0479, "step": 6971 }, { "epoch": 0.8965665052081101, "grad_norm": 0.1669921875, "learning_rate": 5.813999548178772e-05, "loss": 0.04, "step": 6972 }, { "epoch": 0.8966951005186677, "grad_norm": 0.185546875, "learning_rate": 5.813002336537441e-05, "loss": 0.0423, "step": 6973 }, { "epoch": 0.8968236958292254, "grad_norm": 0.1904296875, "learning_rate": 5.812005091677783e-05, "loss": 0.0411, "step": 6974 }, { "epoch": 0.8969522911397831, "grad_norm": 0.158203125, "learning_rate": 5.81100781364055e-05, "loss": 0.0398, "step": 6975 }, { "epoch": 0.8970808864503408, "grad_norm": 0.171875, "learning_rate": 5.8100105024664844e-05, "loss": 0.0377, "step": 6976 }, { "epoch": 0.8972094817608984, "grad_norm": 0.1552734375, "learning_rate": 5.809013158196337e-05, "loss": 0.0326, "step": 6977 }, { "epoch": 0.8973380770714561, "grad_norm": 0.1845703125, "learning_rate": 5.808015780870857e-05, "loss": 0.0486, "step": 6978 }, { "epoch": 0.8974666723820138, "grad_norm": 0.16796875, "learning_rate": 5.807018370530799e-05, "loss": 0.044, "step": 6979 }, { "epoch": 0.8975952676925715, "grad_norm": 0.1513671875, "learning_rate": 5.8060209272169116e-05, "loss": 0.0314, "step": 6980 }, { "epoch": 0.8977238630031291, "grad_norm": 0.1669921875, "learning_rate": 5.805023450969953e-05, "loss": 0.0375, "step": 6981 }, { "epoch": 0.8978524583136869, "grad_norm": 0.169921875, "learning_rate": 5.8040259418306776e-05, "loss": 0.0395, "step": 6982 }, { "epoch": 0.8979810536242445, "grad_norm": 0.16796875, "learning_rate": 5.803028399839842e-05, "loss": 0.0389, "step": 6983 }, { "epoch": 0.8981096489348022, "grad_norm": 0.1953125, "learning_rate": 5.802030825038205e-05, "loss": 0.0479, "step": 6984 }, { "epoch": 0.8982382442453598, "grad_norm": 0.1640625, "learning_rate": 5.801033217466525e-05, "loss": 0.0374, "step": 6985 }, { "epoch": 0.8983668395559176, "grad_norm": 0.1728515625, "learning_rate": 5.8000355771655646e-05, "loss": 0.0415, "step": 6986 }, { "epoch": 0.8984954348664752, "grad_norm": 0.1669921875, "learning_rate": 5.799037904176087e-05, "loss": 0.0385, "step": 6987 }, { "epoch": 0.8986240301770329, "grad_norm": 0.1787109375, "learning_rate": 5.798040198538853e-05, "loss": 0.0505, "step": 6988 }, { "epoch": 0.8987526254875906, "grad_norm": 0.171875, "learning_rate": 5.7970424602946325e-05, "loss": 0.0439, "step": 6989 }, { "epoch": 0.8988812207981483, "grad_norm": 0.16796875, "learning_rate": 5.7960446894841866e-05, "loss": 0.0449, "step": 6990 }, { "epoch": 0.8990098161087059, "grad_norm": 0.1884765625, "learning_rate": 5.795046886148287e-05, "loss": 0.0418, "step": 6991 }, { "epoch": 0.8991384114192635, "grad_norm": 0.1865234375, "learning_rate": 5.7940490503277004e-05, "loss": 0.0452, "step": 6992 }, { "epoch": 0.8992670067298213, "grad_norm": 0.16796875, "learning_rate": 5.793051182063198e-05, "loss": 0.0354, "step": 6993 }, { "epoch": 0.8993956020403789, "grad_norm": 0.162109375, "learning_rate": 5.792053281395552e-05, "loss": 0.0359, "step": 6994 }, { "epoch": 0.8995241973509366, "grad_norm": 0.154296875, "learning_rate": 5.791055348365535e-05, "loss": 0.0343, "step": 6995 }, { "epoch": 0.8996527926614942, "grad_norm": 0.1748046875, "learning_rate": 5.790057383013921e-05, "loss": 0.0423, "step": 6996 }, { "epoch": 0.899781387972052, "grad_norm": 0.1826171875, "learning_rate": 5.7890593853814854e-05, "loss": 0.0417, "step": 6997 }, { "epoch": 0.8999099832826096, "grad_norm": 0.205078125, "learning_rate": 5.788061355509007e-05, "loss": 0.0601, "step": 6998 }, { "epoch": 0.9000385785931673, "grad_norm": 0.166015625, "learning_rate": 5.7870632934372624e-05, "loss": 0.0405, "step": 6999 }, { "epoch": 0.900167173903725, "grad_norm": 0.1591796875, "learning_rate": 5.7860651992070317e-05, "loss": 0.0399, "step": 7000 }, { "epoch": 0.900167173903725, "eval_loss": 0.03945862874388695, "eval_runtime": 1042.2698, "eval_samples_per_second": 94.242, "eval_steps_per_second": 1.178, "step": 7000 }, { "epoch": 0.9002957692142827, "grad_norm": 0.197265625, "learning_rate": 5.785067072859097e-05, "loss": 0.0511, "step": 7001 }, { "epoch": 0.9004243645248403, "grad_norm": 0.1767578125, "learning_rate": 5.784068914434238e-05, "loss": 0.0437, "step": 7002 }, { "epoch": 0.900552959835398, "grad_norm": 0.158203125, "learning_rate": 5.783070723973243e-05, "loss": 0.0299, "step": 7003 }, { "epoch": 0.9006815551459557, "grad_norm": 0.171875, "learning_rate": 5.782072501516892e-05, "loss": 0.0368, "step": 7004 }, { "epoch": 0.9008101504565134, "grad_norm": 0.181640625, "learning_rate": 5.781074247105972e-05, "loss": 0.0483, "step": 7005 }, { "epoch": 0.900938745767071, "grad_norm": 0.1669921875, "learning_rate": 5.780075960781273e-05, "loss": 0.0408, "step": 7006 }, { "epoch": 0.9010673410776286, "grad_norm": 0.189453125, "learning_rate": 5.7790776425835816e-05, "loss": 0.0477, "step": 7007 }, { "epoch": 0.9011959363881864, "grad_norm": 0.1767578125, "learning_rate": 5.778079292553689e-05, "loss": 0.0377, "step": 7008 }, { "epoch": 0.901324531698744, "grad_norm": 0.1669921875, "learning_rate": 5.7770809107323856e-05, "loss": 0.0382, "step": 7009 }, { "epoch": 0.9014531270093017, "grad_norm": 0.1591796875, "learning_rate": 5.7760824971604656e-05, "loss": 0.0385, "step": 7010 }, { "epoch": 0.9015817223198594, "grad_norm": 0.1787109375, "learning_rate": 5.775084051878721e-05, "loss": 0.044, "step": 7011 }, { "epoch": 0.9017103176304171, "grad_norm": 0.1787109375, "learning_rate": 5.7740855749279475e-05, "loss": 0.0395, "step": 7012 }, { "epoch": 0.9018389129409747, "grad_norm": 0.1875, "learning_rate": 5.773087066348946e-05, "loss": 0.0505, "step": 7013 }, { "epoch": 0.9019675082515324, "grad_norm": 0.1669921875, "learning_rate": 5.772088526182508e-05, "loss": 0.0367, "step": 7014 }, { "epoch": 0.9020961035620901, "grad_norm": 0.15625, "learning_rate": 5.771089954469436e-05, "loss": 0.0331, "step": 7015 }, { "epoch": 0.9022246988726478, "grad_norm": 0.1640625, "learning_rate": 5.770091351250532e-05, "loss": 0.038, "step": 7016 }, { "epoch": 0.9023532941832054, "grad_norm": 0.1953125, "learning_rate": 5.769092716566593e-05, "loss": 0.0461, "step": 7017 }, { "epoch": 0.9024818894937632, "grad_norm": 0.166015625, "learning_rate": 5.768094050458427e-05, "loss": 0.0436, "step": 7018 }, { "epoch": 0.9026104848043208, "grad_norm": 0.189453125, "learning_rate": 5.7670953529668345e-05, "loss": 0.0367, "step": 7019 }, { "epoch": 0.9027390801148785, "grad_norm": 0.193359375, "learning_rate": 5.766096624132624e-05, "loss": 0.0456, "step": 7020 }, { "epoch": 0.9028676754254361, "grad_norm": 0.17578125, "learning_rate": 5.7650978639966e-05, "loss": 0.0394, "step": 7021 }, { "epoch": 0.9029962707359939, "grad_norm": 0.1611328125, "learning_rate": 5.764099072599574e-05, "loss": 0.0401, "step": 7022 }, { "epoch": 0.9031248660465515, "grad_norm": 0.1611328125, "learning_rate": 5.7631002499823507e-05, "loss": 0.0382, "step": 7023 }, { "epoch": 0.9032534613571092, "grad_norm": 0.1611328125, "learning_rate": 5.7621013961857437e-05, "loss": 0.0388, "step": 7024 }, { "epoch": 0.9033820566676668, "grad_norm": 0.181640625, "learning_rate": 5.761102511250567e-05, "loss": 0.0386, "step": 7025 }, { "epoch": 0.9035106519782246, "grad_norm": 0.1767578125, "learning_rate": 5.760103595217629e-05, "loss": 0.0342, "step": 7026 }, { "epoch": 0.9036392472887822, "grad_norm": 0.201171875, "learning_rate": 5.759104648127749e-05, "loss": 0.0362, "step": 7027 }, { "epoch": 0.9037678425993398, "grad_norm": 0.1728515625, "learning_rate": 5.7581056700217394e-05, "loss": 0.0439, "step": 7028 }, { "epoch": 0.9038964379098976, "grad_norm": 0.173828125, "learning_rate": 5.757106660940419e-05, "loss": 0.0387, "step": 7029 }, { "epoch": 0.9040250332204552, "grad_norm": 0.1884765625, "learning_rate": 5.7561076209246054e-05, "loss": 0.0385, "step": 7030 }, { "epoch": 0.9041536285310129, "grad_norm": 0.1796875, "learning_rate": 5.755108550015118e-05, "loss": 0.0394, "step": 7031 }, { "epoch": 0.9042822238415705, "grad_norm": 0.232421875, "learning_rate": 5.754109448252779e-05, "loss": 0.0337, "step": 7032 }, { "epoch": 0.9044108191521283, "grad_norm": 0.189453125, "learning_rate": 5.7531103156784095e-05, "loss": 0.0383, "step": 7033 }, { "epoch": 0.9045394144626859, "grad_norm": 0.1650390625, "learning_rate": 5.752111152332833e-05, "loss": 0.0335, "step": 7034 }, { "epoch": 0.9046680097732436, "grad_norm": 0.171875, "learning_rate": 5.751111958256873e-05, "loss": 0.0384, "step": 7035 }, { "epoch": 0.9047966050838013, "grad_norm": 0.1953125, "learning_rate": 5.7501127334913586e-05, "loss": 0.0499, "step": 7036 }, { "epoch": 0.904925200394359, "grad_norm": 0.17578125, "learning_rate": 5.749113478077113e-05, "loss": 0.0387, "step": 7037 }, { "epoch": 0.9050537957049166, "grad_norm": 0.1748046875, "learning_rate": 5.7481141920549674e-05, "loss": 0.0397, "step": 7038 }, { "epoch": 0.9051823910154743, "grad_norm": 0.1728515625, "learning_rate": 5.747114875465751e-05, "loss": 0.034, "step": 7039 }, { "epoch": 0.905310986326032, "grad_norm": 0.203125, "learning_rate": 5.746115528350295e-05, "loss": 0.0566, "step": 7040 }, { "epoch": 0.9054395816365897, "grad_norm": 0.1728515625, "learning_rate": 5.745116150749429e-05, "loss": 0.0466, "step": 7041 }, { "epoch": 0.9055681769471473, "grad_norm": 0.1640625, "learning_rate": 5.74411674270399e-05, "loss": 0.0348, "step": 7042 }, { "epoch": 0.905696772257705, "grad_norm": 0.1669921875, "learning_rate": 5.7431173042548104e-05, "loss": 0.0306, "step": 7043 }, { "epoch": 0.9058253675682627, "grad_norm": 0.2275390625, "learning_rate": 5.742117835442726e-05, "loss": 0.0487, "step": 7044 }, { "epoch": 0.9059539628788204, "grad_norm": 0.1865234375, "learning_rate": 5.741118336308575e-05, "loss": 0.0426, "step": 7045 }, { "epoch": 0.906082558189378, "grad_norm": 0.1748046875, "learning_rate": 5.740118806893196e-05, "loss": 0.0383, "step": 7046 }, { "epoch": 0.9062111534999358, "grad_norm": 0.1484375, "learning_rate": 5.7391192472374254e-05, "loss": 0.0322, "step": 7047 }, { "epoch": 0.9063397488104934, "grad_norm": 0.177734375, "learning_rate": 5.7381196573821096e-05, "loss": 0.04, "step": 7048 }, { "epoch": 0.906468344121051, "grad_norm": 0.1923828125, "learning_rate": 5.7371200373680854e-05, "loss": 0.041, "step": 7049 }, { "epoch": 0.9065969394316087, "grad_norm": 0.14453125, "learning_rate": 5.736120387236199e-05, "loss": 0.0285, "step": 7050 }, { "epoch": 0.9067255347421664, "grad_norm": 0.158203125, "learning_rate": 5.735120707027295e-05, "loss": 0.0334, "step": 7051 }, { "epoch": 0.9068541300527241, "grad_norm": 0.171875, "learning_rate": 5.734120996782219e-05, "loss": 0.0406, "step": 7052 }, { "epoch": 0.9069827253632817, "grad_norm": 0.1787109375, "learning_rate": 5.7331212565418166e-05, "loss": 0.0468, "step": 7053 }, { "epoch": 0.9071113206738394, "grad_norm": 0.1923828125, "learning_rate": 5.7321214863469365e-05, "loss": 0.0519, "step": 7054 }, { "epoch": 0.9072399159843971, "grad_norm": 0.1875, "learning_rate": 5.731121686238428e-05, "loss": 0.0398, "step": 7055 }, { "epoch": 0.9073685112949548, "grad_norm": 0.1689453125, "learning_rate": 5.7301218562571435e-05, "loss": 0.0378, "step": 7056 }, { "epoch": 0.9074971066055124, "grad_norm": 0.1669921875, "learning_rate": 5.729121996443933e-05, "loss": 0.0398, "step": 7057 }, { "epoch": 0.9076257019160702, "grad_norm": 0.1689453125, "learning_rate": 5.7281221068396495e-05, "loss": 0.0357, "step": 7058 }, { "epoch": 0.9077542972266278, "grad_norm": 0.181640625, "learning_rate": 5.727122187485149e-05, "loss": 0.0421, "step": 7059 }, { "epoch": 0.9078828925371855, "grad_norm": 0.162109375, "learning_rate": 5.7261222384212864e-05, "loss": 0.0348, "step": 7060 }, { "epoch": 0.9080114878477431, "grad_norm": 0.1806640625, "learning_rate": 5.725122259688917e-05, "loss": 0.0471, "step": 7061 }, { "epoch": 0.9081400831583009, "grad_norm": 0.1787109375, "learning_rate": 5.724122251328899e-05, "loss": 0.0432, "step": 7062 }, { "epoch": 0.9082686784688585, "grad_norm": 0.1787109375, "learning_rate": 5.723122213382095e-05, "loss": 0.0412, "step": 7063 }, { "epoch": 0.9083972737794161, "grad_norm": 0.16015625, "learning_rate": 5.722122145889363e-05, "loss": 0.0318, "step": 7064 }, { "epoch": 0.9085258690899739, "grad_norm": 0.1806640625, "learning_rate": 5.721122048891563e-05, "loss": 0.0406, "step": 7065 }, { "epoch": 0.9086544644005315, "grad_norm": 0.18359375, "learning_rate": 5.720121922429559e-05, "loss": 0.0484, "step": 7066 }, { "epoch": 0.9087830597110892, "grad_norm": 0.1650390625, "learning_rate": 5.719121766544216e-05, "loss": 0.0382, "step": 7067 }, { "epoch": 0.9089116550216468, "grad_norm": 0.16796875, "learning_rate": 5.7181215812763975e-05, "loss": 0.0335, "step": 7068 }, { "epoch": 0.9090402503322046, "grad_norm": 0.1796875, "learning_rate": 5.717121366666971e-05, "loss": 0.0438, "step": 7069 }, { "epoch": 0.9091688456427622, "grad_norm": 0.1923828125, "learning_rate": 5.7161211227568054e-05, "loss": 0.0423, "step": 7070 }, { "epoch": 0.9092974409533199, "grad_norm": 0.1787109375, "learning_rate": 5.715120849586766e-05, "loss": 0.0421, "step": 7071 }, { "epoch": 0.9094260362638775, "grad_norm": 0.171875, "learning_rate": 5.714120547197726e-05, "loss": 0.044, "step": 7072 }, { "epoch": 0.9095546315744353, "grad_norm": 0.171875, "learning_rate": 5.713120215630555e-05, "loss": 0.0363, "step": 7073 }, { "epoch": 0.9096832268849929, "grad_norm": 0.162109375, "learning_rate": 5.712119854926126e-05, "loss": 0.0322, "step": 7074 }, { "epoch": 0.9098118221955506, "grad_norm": 0.1748046875, "learning_rate": 5.711119465125313e-05, "loss": 0.0378, "step": 7075 }, { "epoch": 0.9099404175061083, "grad_norm": 0.1689453125, "learning_rate": 5.7101190462689905e-05, "loss": 0.0377, "step": 7076 }, { "epoch": 0.910069012816666, "grad_norm": 0.171875, "learning_rate": 5.709118598398033e-05, "loss": 0.0421, "step": 7077 }, { "epoch": 0.9101976081272236, "grad_norm": 0.17578125, "learning_rate": 5.7081181215533184e-05, "loss": 0.0424, "step": 7078 }, { "epoch": 0.9103262034377813, "grad_norm": 0.1669921875, "learning_rate": 5.707117615775724e-05, "loss": 0.0406, "step": 7079 }, { "epoch": 0.910454798748339, "grad_norm": 0.16015625, "learning_rate": 5.7061170811061335e-05, "loss": 0.0381, "step": 7080 }, { "epoch": 0.9105833940588967, "grad_norm": 0.1650390625, "learning_rate": 5.705116517585423e-05, "loss": 0.0403, "step": 7081 }, { "epoch": 0.9107119893694543, "grad_norm": 0.1630859375, "learning_rate": 5.704115925254477e-05, "loss": 0.0403, "step": 7082 }, { "epoch": 0.910840584680012, "grad_norm": 0.1611328125, "learning_rate": 5.7031153041541764e-05, "loss": 0.0375, "step": 7083 }, { "epoch": 0.9109691799905697, "grad_norm": 0.1416015625, "learning_rate": 5.7021146543254075e-05, "loss": 0.0306, "step": 7084 }, { "epoch": 0.9110977753011273, "grad_norm": 0.177734375, "learning_rate": 5.701113975809054e-05, "loss": 0.0398, "step": 7085 }, { "epoch": 0.911226370611685, "grad_norm": 0.1552734375, "learning_rate": 5.7001132686460044e-05, "loss": 0.0363, "step": 7086 }, { "epoch": 0.9113549659222427, "grad_norm": 0.1650390625, "learning_rate": 5.6991125328771446e-05, "loss": 0.0336, "step": 7087 }, { "epoch": 0.9114835612328004, "grad_norm": 0.154296875, "learning_rate": 5.698111768543364e-05, "loss": 0.0296, "step": 7088 }, { "epoch": 0.911612156543358, "grad_norm": 0.1796875, "learning_rate": 5.6971109756855535e-05, "loss": 0.0409, "step": 7089 }, { "epoch": 0.9117407518539157, "grad_norm": 0.177734375, "learning_rate": 5.6961101543446014e-05, "loss": 0.0402, "step": 7090 }, { "epoch": 0.9118693471644734, "grad_norm": 0.169921875, "learning_rate": 5.6951093045614044e-05, "loss": 0.0431, "step": 7091 }, { "epoch": 0.9119979424750311, "grad_norm": 0.158203125, "learning_rate": 5.694108426376853e-05, "loss": 0.034, "step": 7092 }, { "epoch": 0.9121265377855887, "grad_norm": 0.1826171875, "learning_rate": 5.6931075198318425e-05, "loss": 0.0425, "step": 7093 }, { "epoch": 0.9122551330961465, "grad_norm": 0.1767578125, "learning_rate": 5.6921065849672706e-05, "loss": 0.0382, "step": 7094 }, { "epoch": 0.9123837284067041, "grad_norm": 0.1796875, "learning_rate": 5.691105621824031e-05, "loss": 0.0427, "step": 7095 }, { "epoch": 0.9125123237172618, "grad_norm": 0.1923828125, "learning_rate": 5.6901046304430264e-05, "loss": 0.0466, "step": 7096 }, { "epoch": 0.9126409190278194, "grad_norm": 0.166015625, "learning_rate": 5.6891036108651504e-05, "loss": 0.039, "step": 7097 }, { "epoch": 0.9127695143383772, "grad_norm": 0.1884765625, "learning_rate": 5.688102563131309e-05, "loss": 0.0361, "step": 7098 }, { "epoch": 0.9128981096489348, "grad_norm": 0.1611328125, "learning_rate": 5.687101487282401e-05, "loss": 0.0355, "step": 7099 }, { "epoch": 0.9130267049594925, "grad_norm": 0.1611328125, "learning_rate": 5.6861003833593284e-05, "loss": 0.0374, "step": 7100 }, { "epoch": 0.9131553002700502, "grad_norm": 0.16796875, "learning_rate": 5.6850992514029974e-05, "loss": 0.0423, "step": 7101 }, { "epoch": 0.9132838955806079, "grad_norm": 0.1484375, "learning_rate": 5.684098091454312e-05, "loss": 0.0317, "step": 7102 }, { "epoch": 0.9134124908911655, "grad_norm": 0.1796875, "learning_rate": 5.683096903554178e-05, "loss": 0.0439, "step": 7103 }, { "epoch": 0.9135410862017231, "grad_norm": 0.1796875, "learning_rate": 5.682095687743503e-05, "loss": 0.0401, "step": 7104 }, { "epoch": 0.9136696815122809, "grad_norm": 0.185546875, "learning_rate": 5.6810944440631956e-05, "loss": 0.0483, "step": 7105 }, { "epoch": 0.9137982768228385, "grad_norm": 0.1748046875, "learning_rate": 5.680093172554166e-05, "loss": 0.0409, "step": 7106 }, { "epoch": 0.9139268721333962, "grad_norm": 0.18359375, "learning_rate": 5.679091873257324e-05, "loss": 0.0438, "step": 7107 }, { "epoch": 0.9140554674439538, "grad_norm": 0.169921875, "learning_rate": 5.6780905462135834e-05, "loss": 0.0424, "step": 7108 }, { "epoch": 0.9141840627545116, "grad_norm": 0.16015625, "learning_rate": 5.677089191463855e-05, "loss": 0.0359, "step": 7109 }, { "epoch": 0.9143126580650692, "grad_norm": 0.1904296875, "learning_rate": 5.676087809049054e-05, "loss": 0.0485, "step": 7110 }, { "epoch": 0.9144412533756269, "grad_norm": 0.1796875, "learning_rate": 5.6750863990100964e-05, "loss": 0.0395, "step": 7111 }, { "epoch": 0.9145698486861846, "grad_norm": 0.1669921875, "learning_rate": 5.674084961387897e-05, "loss": 0.036, "step": 7112 }, { "epoch": 0.9146984439967423, "grad_norm": 0.1650390625, "learning_rate": 5.673083496223375e-05, "loss": 0.0358, "step": 7113 }, { "epoch": 0.9148270393072999, "grad_norm": 0.1552734375, "learning_rate": 5.6720820035574464e-05, "loss": 0.0364, "step": 7114 }, { "epoch": 0.9149556346178576, "grad_norm": 0.16015625, "learning_rate": 5.6710804834310347e-05, "loss": 0.0346, "step": 7115 }, { "epoch": 0.9150842299284153, "grad_norm": 0.169921875, "learning_rate": 5.6700789358850584e-05, "loss": 0.0393, "step": 7116 }, { "epoch": 0.915212825238973, "grad_norm": 0.1689453125, "learning_rate": 5.66907736096044e-05, "loss": 0.0384, "step": 7117 }, { "epoch": 0.9153414205495306, "grad_norm": 0.18359375, "learning_rate": 5.6680757586981036e-05, "loss": 0.0324, "step": 7118 }, { "epoch": 0.9154700158600882, "grad_norm": 0.18359375, "learning_rate": 5.667074129138972e-05, "loss": 0.0469, "step": 7119 }, { "epoch": 0.915598611170646, "grad_norm": 0.2021484375, "learning_rate": 5.6660724723239736e-05, "loss": 0.0436, "step": 7120 }, { "epoch": 0.9157272064812036, "grad_norm": 0.15625, "learning_rate": 5.66507078829403e-05, "loss": 0.0331, "step": 7121 }, { "epoch": 0.9158558017917613, "grad_norm": 0.1806640625, "learning_rate": 5.6640690770900725e-05, "loss": 0.0446, "step": 7122 }, { "epoch": 0.915984397102319, "grad_norm": 0.1640625, "learning_rate": 5.663067338753031e-05, "loss": 0.0363, "step": 7123 }, { "epoch": 0.9161129924128767, "grad_norm": 0.162109375, "learning_rate": 5.66206557332383e-05, "loss": 0.033, "step": 7124 }, { "epoch": 0.9162415877234343, "grad_norm": 0.189453125, "learning_rate": 5.661063780843407e-05, "loss": 0.0466, "step": 7125 }, { "epoch": 0.916370183033992, "grad_norm": 0.171875, "learning_rate": 5.660061961352688e-05, "loss": 0.0468, "step": 7126 }, { "epoch": 0.9164987783445497, "grad_norm": 0.1669921875, "learning_rate": 5.6590601148926115e-05, "loss": 0.0407, "step": 7127 }, { "epoch": 0.9166273736551074, "grad_norm": 0.16796875, "learning_rate": 5.658058241504107e-05, "loss": 0.0383, "step": 7128 }, { "epoch": 0.916755968965665, "grad_norm": 0.166015625, "learning_rate": 5.657056341228113e-05, "loss": 0.0409, "step": 7129 }, { "epoch": 0.9168845642762228, "grad_norm": 0.1767578125, "learning_rate": 5.6560544141055647e-05, "loss": 0.0424, "step": 7130 }, { "epoch": 0.9170131595867804, "grad_norm": 0.166015625, "learning_rate": 5.6550524601774014e-05, "loss": 0.0389, "step": 7131 }, { "epoch": 0.9171417548973381, "grad_norm": 0.18359375, "learning_rate": 5.6540504794845604e-05, "loss": 0.0379, "step": 7132 }, { "epoch": 0.9172703502078957, "grad_norm": 0.18359375, "learning_rate": 5.65304847206798e-05, "loss": 0.045, "step": 7133 }, { "epoch": 0.9173989455184535, "grad_norm": 0.177734375, "learning_rate": 5.6520464379686046e-05, "loss": 0.041, "step": 7134 }, { "epoch": 0.9175275408290111, "grad_norm": 0.1669921875, "learning_rate": 5.651044377227372e-05, "loss": 0.0346, "step": 7135 }, { "epoch": 0.9176561361395688, "grad_norm": 0.1552734375, "learning_rate": 5.650042289885229e-05, "loss": 0.0327, "step": 7136 }, { "epoch": 0.9177847314501264, "grad_norm": 0.16015625, "learning_rate": 5.649040175983118e-05, "loss": 0.0337, "step": 7137 }, { "epoch": 0.9179133267606842, "grad_norm": 0.1787109375, "learning_rate": 5.648038035561983e-05, "loss": 0.0484, "step": 7138 }, { "epoch": 0.9180419220712418, "grad_norm": 0.16796875, "learning_rate": 5.6470358686627735e-05, "loss": 0.0353, "step": 7139 }, { "epoch": 0.9181705173817994, "grad_norm": 0.1748046875, "learning_rate": 5.646033675326433e-05, "loss": 0.038, "step": 7140 }, { "epoch": 0.9182991126923572, "grad_norm": 0.1875, "learning_rate": 5.6450314555939134e-05, "loss": 0.0467, "step": 7141 }, { "epoch": 0.9184277080029148, "grad_norm": 0.1748046875, "learning_rate": 5.644029209506161e-05, "loss": 0.0364, "step": 7142 }, { "epoch": 0.9185563033134725, "grad_norm": 0.1591796875, "learning_rate": 5.643026937104129e-05, "loss": 0.0345, "step": 7143 }, { "epoch": 0.9186848986240301, "grad_norm": 0.18359375, "learning_rate": 5.642024638428769e-05, "loss": 0.0473, "step": 7144 }, { "epoch": 0.9188134939345879, "grad_norm": 0.16015625, "learning_rate": 5.641022313521032e-05, "loss": 0.0365, "step": 7145 }, { "epoch": 0.9189420892451455, "grad_norm": 0.1787109375, "learning_rate": 5.640019962421873e-05, "loss": 0.0399, "step": 7146 }, { "epoch": 0.9190706845557032, "grad_norm": 0.19140625, "learning_rate": 5.639017585172247e-05, "loss": 0.0423, "step": 7147 }, { "epoch": 0.9191992798662609, "grad_norm": 0.1708984375, "learning_rate": 5.638015181813109e-05, "loss": 0.037, "step": 7148 }, { "epoch": 0.9193278751768186, "grad_norm": 0.1669921875, "learning_rate": 5.6370127523854165e-05, "loss": 0.0423, "step": 7149 }, { "epoch": 0.9194564704873762, "grad_norm": 0.2001953125, "learning_rate": 5.6360102969301274e-05, "loss": 0.046, "step": 7150 }, { "epoch": 0.9195850657979339, "grad_norm": 0.1494140625, "learning_rate": 5.6350078154882035e-05, "loss": 0.0327, "step": 7151 }, { "epoch": 0.9197136611084916, "grad_norm": 0.16796875, "learning_rate": 5.634005308100601e-05, "loss": 0.0369, "step": 7152 }, { "epoch": 0.9198422564190493, "grad_norm": 0.1728515625, "learning_rate": 5.633002774808283e-05, "loss": 0.036, "step": 7153 }, { "epoch": 0.9199708517296069, "grad_norm": 0.1640625, "learning_rate": 5.632000215652211e-05, "loss": 0.0332, "step": 7154 }, { "epoch": 0.9200994470401646, "grad_norm": 0.1826171875, "learning_rate": 5.630997630673349e-05, "loss": 0.037, "step": 7155 }, { "epoch": 0.9202280423507223, "grad_norm": 0.2060546875, "learning_rate": 5.629995019912664e-05, "loss": 0.0443, "step": 7156 }, { "epoch": 0.92035663766128, "grad_norm": 0.189453125, "learning_rate": 5.628992383411117e-05, "loss": 0.0426, "step": 7157 }, { "epoch": 0.9204852329718376, "grad_norm": 0.1875, "learning_rate": 5.627989721209678e-05, "loss": 0.044, "step": 7158 }, { "epoch": 0.9206138282823954, "grad_norm": 0.185546875, "learning_rate": 5.6269870333493134e-05, "loss": 0.0429, "step": 7159 }, { "epoch": 0.920742423592953, "grad_norm": 0.1689453125, "learning_rate": 5.6259843198709895e-05, "loss": 0.0376, "step": 7160 }, { "epoch": 0.9208710189035106, "grad_norm": 0.1630859375, "learning_rate": 5.6249815808156804e-05, "loss": 0.0359, "step": 7161 }, { "epoch": 0.9209996142140683, "grad_norm": 0.224609375, "learning_rate": 5.6239788162243534e-05, "loss": 0.0383, "step": 7162 }, { "epoch": 0.921128209524626, "grad_norm": 0.201171875, "learning_rate": 5.6229760261379824e-05, "loss": 0.0338, "step": 7163 }, { "epoch": 0.9212568048351837, "grad_norm": 0.1650390625, "learning_rate": 5.621973210597539e-05, "loss": 0.037, "step": 7164 }, { "epoch": 0.9213854001457413, "grad_norm": 0.1767578125, "learning_rate": 5.6209703696439976e-05, "loss": 0.0401, "step": 7165 }, { "epoch": 0.921513995456299, "grad_norm": 0.18359375, "learning_rate": 5.619967503318332e-05, "loss": 0.0459, "step": 7166 }, { "epoch": 0.9216425907668567, "grad_norm": 0.1494140625, "learning_rate": 5.6189646116615194e-05, "loss": 0.0323, "step": 7167 }, { "epoch": 0.9217711860774144, "grad_norm": 0.181640625, "learning_rate": 5.6179616947145374e-05, "loss": 0.0434, "step": 7168 }, { "epoch": 0.921899781387972, "grad_norm": 0.1748046875, "learning_rate": 5.616958752518362e-05, "loss": 0.0437, "step": 7169 }, { "epoch": 0.9220283766985298, "grad_norm": 0.173828125, "learning_rate": 5.6159557851139755e-05, "loss": 0.0364, "step": 7170 }, { "epoch": 0.9221569720090874, "grad_norm": 0.1708984375, "learning_rate": 5.614952792542353e-05, "loss": 0.0372, "step": 7171 }, { "epoch": 0.9222855673196451, "grad_norm": 0.1982421875, "learning_rate": 5.613949774844479e-05, "loss": 0.0432, "step": 7172 }, { "epoch": 0.9224141626302027, "grad_norm": 0.1611328125, "learning_rate": 5.612946732061336e-05, "loss": 0.0361, "step": 7173 }, { "epoch": 0.9225427579407605, "grad_norm": 0.1875, "learning_rate": 5.611943664233905e-05, "loss": 0.042, "step": 7174 }, { "epoch": 0.9226713532513181, "grad_norm": 0.16796875, "learning_rate": 5.6109405714031734e-05, "loss": 0.0336, "step": 7175 }, { "epoch": 0.9227999485618757, "grad_norm": 0.2119140625, "learning_rate": 5.609937453610122e-05, "loss": 0.0577, "step": 7176 }, { "epoch": 0.9229285438724335, "grad_norm": 0.16015625, "learning_rate": 5.6089343108957415e-05, "loss": 0.0371, "step": 7177 }, { "epoch": 0.9230571391829911, "grad_norm": 0.1416015625, "learning_rate": 5.607931143301016e-05, "loss": 0.0311, "step": 7178 }, { "epoch": 0.9231857344935488, "grad_norm": 0.1884765625, "learning_rate": 5.606927950866936e-05, "loss": 0.0459, "step": 7179 }, { "epoch": 0.9233143298041064, "grad_norm": 0.1865234375, "learning_rate": 5.605924733634488e-05, "loss": 0.0434, "step": 7180 }, { "epoch": 0.9234429251146642, "grad_norm": 0.201171875, "learning_rate": 5.604921491644664e-05, "loss": 0.0357, "step": 7181 }, { "epoch": 0.9235715204252218, "grad_norm": 0.1767578125, "learning_rate": 5.603918224938456e-05, "loss": 0.0426, "step": 7182 }, { "epoch": 0.9237001157357795, "grad_norm": 0.162109375, "learning_rate": 5.6029149335568544e-05, "loss": 0.0335, "step": 7183 }, { "epoch": 0.9238287110463371, "grad_norm": 0.16796875, "learning_rate": 5.6019116175408534e-05, "loss": 0.0355, "step": 7184 }, { "epoch": 0.9239573063568949, "grad_norm": 0.1630859375, "learning_rate": 5.600908276931449e-05, "loss": 0.034, "step": 7185 }, { "epoch": 0.9240859016674525, "grad_norm": 0.1572265625, "learning_rate": 5.599904911769634e-05, "loss": 0.0297, "step": 7186 }, { "epoch": 0.9242144969780102, "grad_norm": 0.1630859375, "learning_rate": 5.598901522096406e-05, "loss": 0.0358, "step": 7187 }, { "epoch": 0.9243430922885679, "grad_norm": 0.1572265625, "learning_rate": 5.5978981079527614e-05, "loss": 0.0359, "step": 7188 }, { "epoch": 0.9244716875991256, "grad_norm": 0.1669921875, "learning_rate": 5.5968946693797e-05, "loss": 0.0362, "step": 7189 }, { "epoch": 0.9246002829096832, "grad_norm": 0.1826171875, "learning_rate": 5.5958912064182204e-05, "loss": 0.0432, "step": 7190 }, { "epoch": 0.9247288782202409, "grad_norm": 0.150390625, "learning_rate": 5.594887719109322e-05, "loss": 0.0314, "step": 7191 }, { "epoch": 0.9248574735307986, "grad_norm": 0.2138671875, "learning_rate": 5.593884207494007e-05, "loss": 0.0552, "step": 7192 }, { "epoch": 0.9249860688413563, "grad_norm": 0.1806640625, "learning_rate": 5.5928806716132776e-05, "loss": 0.0377, "step": 7193 }, { "epoch": 0.9251146641519139, "grad_norm": 0.1767578125, "learning_rate": 5.591877111508139e-05, "loss": 0.0392, "step": 7194 }, { "epoch": 0.9252432594624717, "grad_norm": 0.1494140625, "learning_rate": 5.590873527219592e-05, "loss": 0.0335, "step": 7195 }, { "epoch": 0.9253718547730293, "grad_norm": 0.1806640625, "learning_rate": 5.589869918788644e-05, "loss": 0.0435, "step": 7196 }, { "epoch": 0.925500450083587, "grad_norm": 0.166015625, "learning_rate": 5.5888662862563e-05, "loss": 0.0377, "step": 7197 }, { "epoch": 0.9256290453941446, "grad_norm": 0.162109375, "learning_rate": 5.587862629663568e-05, "loss": 0.0346, "step": 7198 }, { "epoch": 0.9257576407047023, "grad_norm": 0.1845703125, "learning_rate": 5.5868589490514576e-05, "loss": 0.0446, "step": 7199 }, { "epoch": 0.92588623601526, "grad_norm": 0.1748046875, "learning_rate": 5.585855244460976e-05, "loss": 0.0374, "step": 7200 }, { "epoch": 0.9260148313258176, "grad_norm": 0.1416015625, "learning_rate": 5.584851515933135e-05, "loss": 0.029, "step": 7201 }, { "epoch": 0.9261434266363753, "grad_norm": 0.1630859375, "learning_rate": 5.583847763508943e-05, "loss": 0.0408, "step": 7202 }, { "epoch": 0.926272021946933, "grad_norm": 0.1591796875, "learning_rate": 5.5828439872294156e-05, "loss": 0.0401, "step": 7203 }, { "epoch": 0.9264006172574907, "grad_norm": 0.1787109375, "learning_rate": 5.581840187135563e-05, "loss": 0.0434, "step": 7204 }, { "epoch": 0.9265292125680483, "grad_norm": 0.19921875, "learning_rate": 5.5808363632684014e-05, "loss": 0.0531, "step": 7205 }, { "epoch": 0.9266578078786061, "grad_norm": 0.1962890625, "learning_rate": 5.5798325156689466e-05, "loss": 0.0488, "step": 7206 }, { "epoch": 0.9267864031891637, "grad_norm": 0.1669921875, "learning_rate": 5.5788286443782114e-05, "loss": 0.0415, "step": 7207 }, { "epoch": 0.9269149984997214, "grad_norm": 0.1611328125, "learning_rate": 5.577824749437216e-05, "loss": 0.0377, "step": 7208 }, { "epoch": 0.927043593810279, "grad_norm": 0.173828125, "learning_rate": 5.576820830886975e-05, "loss": 0.0336, "step": 7209 }, { "epoch": 0.9271721891208368, "grad_norm": 0.158203125, "learning_rate": 5.575816888768509e-05, "loss": 0.0337, "step": 7210 }, { "epoch": 0.9273007844313944, "grad_norm": 0.205078125, "learning_rate": 5.574812923122841e-05, "loss": 0.0553, "step": 7211 }, { "epoch": 0.927429379741952, "grad_norm": 0.1640625, "learning_rate": 5.573808933990986e-05, "loss": 0.0373, "step": 7212 }, { "epoch": 0.9275579750525097, "grad_norm": 0.185546875, "learning_rate": 5.572804921413971e-05, "loss": 0.0488, "step": 7213 }, { "epoch": 0.9276865703630675, "grad_norm": 0.150390625, "learning_rate": 5.571800885432815e-05, "loss": 0.0315, "step": 7214 }, { "epoch": 0.9278151656736251, "grad_norm": 0.1728515625, "learning_rate": 5.570796826088546e-05, "loss": 0.0371, "step": 7215 }, { "epoch": 0.9279437609841827, "grad_norm": 0.1611328125, "learning_rate": 5.569792743422183e-05, "loss": 0.032, "step": 7216 }, { "epoch": 0.9280723562947405, "grad_norm": 0.18359375, "learning_rate": 5.5687886374747557e-05, "loss": 0.0416, "step": 7217 }, { "epoch": 0.9282009516052981, "grad_norm": 0.158203125, "learning_rate": 5.567784508287292e-05, "loss": 0.0359, "step": 7218 }, { "epoch": 0.9283295469158558, "grad_norm": 0.177734375, "learning_rate": 5.5667803559008146e-05, "loss": 0.0395, "step": 7219 }, { "epoch": 0.9284581422264134, "grad_norm": 0.166015625, "learning_rate": 5.5657761803563556e-05, "loss": 0.0359, "step": 7220 }, { "epoch": 0.9285867375369712, "grad_norm": 0.1806640625, "learning_rate": 5.564771981694943e-05, "loss": 0.0472, "step": 7221 }, { "epoch": 0.9287153328475288, "grad_norm": 0.1572265625, "learning_rate": 5.563767759957607e-05, "loss": 0.0382, "step": 7222 }, { "epoch": 0.9288439281580865, "grad_norm": 0.1865234375, "learning_rate": 5.5627635151853795e-05, "loss": 0.0388, "step": 7223 }, { "epoch": 0.9289725234686442, "grad_norm": 0.1943359375, "learning_rate": 5.5617592474192936e-05, "loss": 0.0455, "step": 7224 }, { "epoch": 0.9291011187792019, "grad_norm": 0.169921875, "learning_rate": 5.5607549567003814e-05, "loss": 0.0396, "step": 7225 }, { "epoch": 0.9292297140897595, "grad_norm": 0.169921875, "learning_rate": 5.5597506430696766e-05, "loss": 0.0372, "step": 7226 }, { "epoch": 0.9293583094003172, "grad_norm": 0.1943359375, "learning_rate": 5.558746306568217e-05, "loss": 0.0499, "step": 7227 }, { "epoch": 0.9294869047108749, "grad_norm": 0.1787109375, "learning_rate": 5.557741947237034e-05, "loss": 0.0474, "step": 7228 }, { "epoch": 0.9296155000214326, "grad_norm": 0.19921875, "learning_rate": 5.5567375651171694e-05, "loss": 0.0487, "step": 7229 }, { "epoch": 0.9297440953319902, "grad_norm": 0.1669921875, "learning_rate": 5.5557331602496584e-05, "loss": 0.0468, "step": 7230 }, { "epoch": 0.9298726906425479, "grad_norm": 0.1650390625, "learning_rate": 5.5547287326755394e-05, "loss": 0.0342, "step": 7231 }, { "epoch": 0.9300012859531056, "grad_norm": 0.18359375, "learning_rate": 5.5537242824358535e-05, "loss": 0.0345, "step": 7232 }, { "epoch": 0.9301298812636632, "grad_norm": 0.1650390625, "learning_rate": 5.5527198095716404e-05, "loss": 0.0369, "step": 7233 }, { "epoch": 0.9302584765742209, "grad_norm": 0.1953125, "learning_rate": 5.551715314123943e-05, "loss": 0.0536, "step": 7234 }, { "epoch": 0.9303870718847786, "grad_norm": 0.185546875, "learning_rate": 5.5507107961338026e-05, "loss": 0.0431, "step": 7235 }, { "epoch": 0.9305156671953363, "grad_norm": 0.1708984375, "learning_rate": 5.549706255642262e-05, "loss": 0.0421, "step": 7236 }, { "epoch": 0.9306442625058939, "grad_norm": 0.16015625, "learning_rate": 5.5487016926903665e-05, "loss": 0.0326, "step": 7237 }, { "epoch": 0.9307728578164516, "grad_norm": 0.1728515625, "learning_rate": 5.547697107319162e-05, "loss": 0.0434, "step": 7238 }, { "epoch": 0.9309014531270093, "grad_norm": 0.1630859375, "learning_rate": 5.546692499569695e-05, "loss": 0.037, "step": 7239 }, { "epoch": 0.931030048437567, "grad_norm": 0.185546875, "learning_rate": 5.5456878694830106e-05, "loss": 0.0437, "step": 7240 }, { "epoch": 0.9311586437481246, "grad_norm": 0.1650390625, "learning_rate": 5.5446832171001586e-05, "loss": 0.0359, "step": 7241 }, { "epoch": 0.9312872390586824, "grad_norm": 0.171875, "learning_rate": 5.5436785424621875e-05, "loss": 0.035, "step": 7242 }, { "epoch": 0.93141583436924, "grad_norm": 0.173828125, "learning_rate": 5.542673845610146e-05, "loss": 0.0361, "step": 7243 }, { "epoch": 0.9315444296797977, "grad_norm": 0.1650390625, "learning_rate": 5.541669126585085e-05, "loss": 0.0352, "step": 7244 }, { "epoch": 0.9316730249903553, "grad_norm": 0.177734375, "learning_rate": 5.5406643854280585e-05, "loss": 0.0395, "step": 7245 }, { "epoch": 0.9318016203009131, "grad_norm": 0.1943359375, "learning_rate": 5.539659622180117e-05, "loss": 0.0484, "step": 7246 }, { "epoch": 0.9319302156114707, "grad_norm": 0.158203125, "learning_rate": 5.538654836882312e-05, "loss": 0.036, "step": 7247 }, { "epoch": 0.9320588109220284, "grad_norm": 0.1640625, "learning_rate": 5.537650029575702e-05, "loss": 0.0375, "step": 7248 }, { "epoch": 0.932187406232586, "grad_norm": 0.1748046875, "learning_rate": 5.5366452003013406e-05, "loss": 0.0419, "step": 7249 }, { "epoch": 0.9323160015431438, "grad_norm": 0.1865234375, "learning_rate": 5.5356403491002827e-05, "loss": 0.0392, "step": 7250 }, { "epoch": 0.9324445968537014, "grad_norm": 0.185546875, "learning_rate": 5.534635476013589e-05, "loss": 0.0427, "step": 7251 }, { "epoch": 0.932573192164259, "grad_norm": 0.1640625, "learning_rate": 5.5336305810823117e-05, "loss": 0.0401, "step": 7252 }, { "epoch": 0.9327017874748168, "grad_norm": 0.185546875, "learning_rate": 5.532625664347515e-05, "loss": 0.0454, "step": 7253 }, { "epoch": 0.9328303827853744, "grad_norm": 0.169921875, "learning_rate": 5.531620725850256e-05, "loss": 0.0349, "step": 7254 }, { "epoch": 0.9329589780959321, "grad_norm": 0.1953125, "learning_rate": 5.530615765631595e-05, "loss": 0.0504, "step": 7255 }, { "epoch": 0.9330875734064897, "grad_norm": 0.1533203125, "learning_rate": 5.529610783732595e-05, "loss": 0.0327, "step": 7256 }, { "epoch": 0.9332161687170475, "grad_norm": 0.1806640625, "learning_rate": 5.528605780194317e-05, "loss": 0.0421, "step": 7257 }, { "epoch": 0.9333447640276051, "grad_norm": 0.1845703125, "learning_rate": 5.5276007550578256e-05, "loss": 0.0425, "step": 7258 }, { "epoch": 0.9334733593381628, "grad_norm": 0.173828125, "learning_rate": 5.5265957083641825e-05, "loss": 0.0412, "step": 7259 }, { "epoch": 0.9336019546487204, "grad_norm": 0.1669921875, "learning_rate": 5.525590640154456e-05, "loss": 0.0406, "step": 7260 }, { "epoch": 0.9337305499592782, "grad_norm": 0.1748046875, "learning_rate": 5.5245855504697097e-05, "loss": 0.039, "step": 7261 }, { "epoch": 0.9338591452698358, "grad_norm": 0.16796875, "learning_rate": 5.5235804393510116e-05, "loss": 0.0384, "step": 7262 }, { "epoch": 0.9339877405803935, "grad_norm": 0.19140625, "learning_rate": 5.522575306839429e-05, "loss": 0.0464, "step": 7263 }, { "epoch": 0.9341163358909512, "grad_norm": 0.171875, "learning_rate": 5.52157015297603e-05, "loss": 0.0399, "step": 7264 }, { "epoch": 0.9342449312015089, "grad_norm": 0.1572265625, "learning_rate": 5.520564977801885e-05, "loss": 0.0309, "step": 7265 }, { "epoch": 0.9343735265120665, "grad_norm": 0.1591796875, "learning_rate": 5.5195597813580644e-05, "loss": 0.0349, "step": 7266 }, { "epoch": 0.9345021218226242, "grad_norm": 0.1845703125, "learning_rate": 5.518554563685637e-05, "loss": 0.0433, "step": 7267 }, { "epoch": 0.9346307171331819, "grad_norm": 0.1640625, "learning_rate": 5.517549324825677e-05, "loss": 0.0396, "step": 7268 }, { "epoch": 0.9347593124437396, "grad_norm": 0.18359375, "learning_rate": 5.516544064819257e-05, "loss": 0.0437, "step": 7269 }, { "epoch": 0.9348879077542972, "grad_norm": 0.1611328125, "learning_rate": 5.5155387837074504e-05, "loss": 0.0402, "step": 7270 }, { "epoch": 0.935016503064855, "grad_norm": 0.177734375, "learning_rate": 5.514533481531331e-05, "loss": 0.0445, "step": 7271 }, { "epoch": 0.9351450983754126, "grad_norm": 0.1611328125, "learning_rate": 5.513528158331975e-05, "loss": 0.0365, "step": 7272 }, { "epoch": 0.9352736936859702, "grad_norm": 0.1787109375, "learning_rate": 5.51252281415046e-05, "loss": 0.0453, "step": 7273 }, { "epoch": 0.9354022889965279, "grad_norm": 0.1474609375, "learning_rate": 5.511517449027861e-05, "loss": 0.0375, "step": 7274 }, { "epoch": 0.9355308843070856, "grad_norm": 0.1865234375, "learning_rate": 5.5105120630052586e-05, "loss": 0.0493, "step": 7275 }, { "epoch": 0.9356594796176433, "grad_norm": 0.15234375, "learning_rate": 5.5095066561237296e-05, "loss": 0.0362, "step": 7276 }, { "epoch": 0.9357880749282009, "grad_norm": 0.154296875, "learning_rate": 5.508501228424354e-05, "loss": 0.0335, "step": 7277 }, { "epoch": 0.9359166702387586, "grad_norm": 0.185546875, "learning_rate": 5.507495779948214e-05, "loss": 0.0463, "step": 7278 }, { "epoch": 0.9360452655493163, "grad_norm": 0.1669921875, "learning_rate": 5.5064903107363876e-05, "loss": 0.0324, "step": 7279 }, { "epoch": 0.936173860859874, "grad_norm": 0.1884765625, "learning_rate": 5.5054848208299616e-05, "loss": 0.0458, "step": 7280 }, { "epoch": 0.9363024561704316, "grad_norm": 0.2119140625, "learning_rate": 5.5044793102700156e-05, "loss": 0.0579, "step": 7281 }, { "epoch": 0.9364310514809894, "grad_norm": 0.16796875, "learning_rate": 5.5034737790976345e-05, "loss": 0.0419, "step": 7282 }, { "epoch": 0.936559646791547, "grad_norm": 0.169921875, "learning_rate": 5.5024682273539044e-05, "loss": 0.0418, "step": 7283 }, { "epoch": 0.9366882421021047, "grad_norm": 0.1591796875, "learning_rate": 5.5014626550799096e-05, "loss": 0.0396, "step": 7284 }, { "epoch": 0.9368168374126623, "grad_norm": 0.2001953125, "learning_rate": 5.500457062316736e-05, "loss": 0.0389, "step": 7285 }, { "epoch": 0.9369454327232201, "grad_norm": 0.1787109375, "learning_rate": 5.499451449105473e-05, "loss": 0.0464, "step": 7286 }, { "epoch": 0.9370740280337777, "grad_norm": 0.166015625, "learning_rate": 5.4984458154872074e-05, "loss": 0.0423, "step": 7287 }, { "epoch": 0.9372026233443354, "grad_norm": 0.166015625, "learning_rate": 5.4974401615030294e-05, "loss": 0.0394, "step": 7288 }, { "epoch": 0.9373312186548931, "grad_norm": 0.1865234375, "learning_rate": 5.496434487194029e-05, "loss": 0.0501, "step": 7289 }, { "epoch": 0.9374598139654508, "grad_norm": 0.33203125, "learning_rate": 5.4954287926012936e-05, "loss": 0.0422, "step": 7290 }, { "epoch": 0.9375884092760084, "grad_norm": 0.169921875, "learning_rate": 5.4944230777659164e-05, "loss": 0.0382, "step": 7291 }, { "epoch": 0.937717004586566, "grad_norm": 0.16015625, "learning_rate": 5.4934173427289924e-05, "loss": 0.0347, "step": 7292 }, { "epoch": 0.9378455998971238, "grad_norm": 0.169921875, "learning_rate": 5.4924115875316115e-05, "loss": 0.0352, "step": 7293 }, { "epoch": 0.9379741952076814, "grad_norm": 0.1650390625, "learning_rate": 5.4914058122148705e-05, "loss": 0.0353, "step": 7294 }, { "epoch": 0.9381027905182391, "grad_norm": 0.17578125, "learning_rate": 5.4904000168198614e-05, "loss": 0.0374, "step": 7295 }, { "epoch": 0.9382313858287967, "grad_norm": 0.158203125, "learning_rate": 5.4893942013876807e-05, "loss": 0.0349, "step": 7296 }, { "epoch": 0.9383599811393545, "grad_norm": 0.1806640625, "learning_rate": 5.488388365959425e-05, "loss": 0.049, "step": 7297 }, { "epoch": 0.9384885764499121, "grad_norm": 0.1923828125, "learning_rate": 5.4873825105761914e-05, "loss": 0.0485, "step": 7298 }, { "epoch": 0.9386171717604698, "grad_norm": 0.1845703125, "learning_rate": 5.48637663527908e-05, "loss": 0.0428, "step": 7299 }, { "epoch": 0.9387457670710275, "grad_norm": 0.1826171875, "learning_rate": 5.485370740109186e-05, "loss": 0.044, "step": 7300 }, { "epoch": 0.9388743623815852, "grad_norm": 0.1640625, "learning_rate": 5.484364825107614e-05, "loss": 0.0358, "step": 7301 }, { "epoch": 0.9390029576921428, "grad_norm": 0.173828125, "learning_rate": 5.4833588903154596e-05, "loss": 0.0432, "step": 7302 }, { "epoch": 0.9391315530027005, "grad_norm": 0.16796875, "learning_rate": 5.4823529357738254e-05, "loss": 0.037, "step": 7303 }, { "epoch": 0.9392601483132582, "grad_norm": 0.181640625, "learning_rate": 5.4813469615238154e-05, "loss": 0.046, "step": 7304 }, { "epoch": 0.9393887436238159, "grad_norm": 0.1552734375, "learning_rate": 5.48034096760653e-05, "loss": 0.035, "step": 7305 }, { "epoch": 0.9395173389343735, "grad_norm": 0.1845703125, "learning_rate": 5.4793349540630756e-05, "loss": 0.0438, "step": 7306 }, { "epoch": 0.9396459342449311, "grad_norm": 0.1689453125, "learning_rate": 5.4783289209345546e-05, "loss": 0.034, "step": 7307 }, { "epoch": 0.9397745295554889, "grad_norm": 0.1572265625, "learning_rate": 5.4773228682620745e-05, "loss": 0.0337, "step": 7308 }, { "epoch": 0.9399031248660465, "grad_norm": 0.171875, "learning_rate": 5.4763167960867386e-05, "loss": 0.0381, "step": 7309 }, { "epoch": 0.9400317201766042, "grad_norm": 0.14453125, "learning_rate": 5.4753107044496553e-05, "loss": 0.0303, "step": 7310 }, { "epoch": 0.940160315487162, "grad_norm": 0.1748046875, "learning_rate": 5.4743045933919334e-05, "loss": 0.0421, "step": 7311 }, { "epoch": 0.9402889107977196, "grad_norm": 0.16796875, "learning_rate": 5.4732984629546805e-05, "loss": 0.0413, "step": 7312 }, { "epoch": 0.9404175061082772, "grad_norm": 0.16796875, "learning_rate": 5.472292313179006e-05, "loss": 0.0366, "step": 7313 }, { "epoch": 0.9405461014188349, "grad_norm": 0.17578125, "learning_rate": 5.471286144106018e-05, "loss": 0.0397, "step": 7314 }, { "epoch": 0.9406746967293926, "grad_norm": 0.1611328125, "learning_rate": 5.47027995577683e-05, "loss": 0.034, "step": 7315 }, { "epoch": 0.9408032920399503, "grad_norm": 0.171875, "learning_rate": 5.469273748232554e-05, "loss": 0.0345, "step": 7316 }, { "epoch": 0.9409318873505079, "grad_norm": 0.2119140625, "learning_rate": 5.4682675215142996e-05, "loss": 0.0525, "step": 7317 }, { "epoch": 0.9410604826610657, "grad_norm": 0.185546875, "learning_rate": 5.467261275663183e-05, "loss": 0.041, "step": 7318 }, { "epoch": 0.9411890779716233, "grad_norm": 0.1708984375, "learning_rate": 5.466255010720317e-05, "loss": 0.0317, "step": 7319 }, { "epoch": 0.941317673282181, "grad_norm": 0.1943359375, "learning_rate": 5.465248726726816e-05, "loss": 0.0512, "step": 7320 }, { "epoch": 0.9414462685927386, "grad_norm": 0.201171875, "learning_rate": 5.464242423723795e-05, "loss": 0.037, "step": 7321 }, { "epoch": 0.9415748639032964, "grad_norm": 0.185546875, "learning_rate": 5.463236101752374e-05, "loss": 0.0582, "step": 7322 }, { "epoch": 0.941703459213854, "grad_norm": 0.15625, "learning_rate": 5.462229760853666e-05, "loss": 0.031, "step": 7323 }, { "epoch": 0.9418320545244117, "grad_norm": 0.17578125, "learning_rate": 5.461223401068789e-05, "loss": 0.0365, "step": 7324 }, { "epoch": 0.9419606498349693, "grad_norm": 0.19140625, "learning_rate": 5.460217022438866e-05, "loss": 0.0458, "step": 7325 }, { "epoch": 0.942089245145527, "grad_norm": 0.150390625, "learning_rate": 5.4592106250050115e-05, "loss": 0.0298, "step": 7326 }, { "epoch": 0.9422178404560847, "grad_norm": 0.1767578125, "learning_rate": 5.458204208808347e-05, "loss": 0.041, "step": 7327 }, { "epoch": 0.9423464357666423, "grad_norm": 0.1875, "learning_rate": 5.457197773889997e-05, "loss": 0.0417, "step": 7328 }, { "epoch": 0.9424750310772001, "grad_norm": 0.1767578125, "learning_rate": 5.456191320291079e-05, "loss": 0.0362, "step": 7329 }, { "epoch": 0.9426036263877577, "grad_norm": 0.16796875, "learning_rate": 5.455184848052717e-05, "loss": 0.0366, "step": 7330 }, { "epoch": 0.9427322216983154, "grad_norm": 0.158203125, "learning_rate": 5.454178357216034e-05, "loss": 0.0325, "step": 7331 }, { "epoch": 0.942860817008873, "grad_norm": 0.181640625, "learning_rate": 5.453171847822155e-05, "loss": 0.041, "step": 7332 }, { "epoch": 0.9429894123194308, "grad_norm": 0.1650390625, "learning_rate": 5.452165319912204e-05, "loss": 0.0398, "step": 7333 }, { "epoch": 0.9431180076299884, "grad_norm": 0.13671875, "learning_rate": 5.4511587735273075e-05, "loss": 0.0301, "step": 7334 }, { "epoch": 0.9432466029405461, "grad_norm": 0.1953125, "learning_rate": 5.450152208708589e-05, "loss": 0.0466, "step": 7335 }, { "epoch": 0.9433751982511038, "grad_norm": 0.15625, "learning_rate": 5.449145625497178e-05, "loss": 0.0297, "step": 7336 }, { "epoch": 0.9435037935616615, "grad_norm": 0.2001953125, "learning_rate": 5.4481390239342043e-05, "loss": 0.0496, "step": 7337 }, { "epoch": 0.9436323888722191, "grad_norm": 0.166015625, "learning_rate": 5.447132404060792e-05, "loss": 0.0372, "step": 7338 }, { "epoch": 0.9437609841827768, "grad_norm": 0.17578125, "learning_rate": 5.446125765918074e-05, "loss": 0.0373, "step": 7339 }, { "epoch": 0.9438895794933345, "grad_norm": 0.1806640625, "learning_rate": 5.445119109547177e-05, "loss": 0.0446, "step": 7340 }, { "epoch": 0.9440181748038922, "grad_norm": 0.16796875, "learning_rate": 5.444112434989234e-05, "loss": 0.0345, "step": 7341 }, { "epoch": 0.9441467701144498, "grad_norm": 0.2412109375, "learning_rate": 5.443105742285377e-05, "loss": 0.0421, "step": 7342 }, { "epoch": 0.9442753654250075, "grad_norm": 0.1806640625, "learning_rate": 5.442099031476736e-05, "loss": 0.047, "step": 7343 }, { "epoch": 0.9444039607355652, "grad_norm": 0.169921875, "learning_rate": 5.441092302604447e-05, "loss": 0.0385, "step": 7344 }, { "epoch": 0.9445325560461229, "grad_norm": 0.1650390625, "learning_rate": 5.440085555709642e-05, "loss": 0.0341, "step": 7345 }, { "epoch": 0.9446611513566805, "grad_norm": 0.1591796875, "learning_rate": 5.439078790833455e-05, "loss": 0.0333, "step": 7346 }, { "epoch": 0.9447897466672383, "grad_norm": 0.1767578125, "learning_rate": 5.438072008017022e-05, "loss": 0.0461, "step": 7347 }, { "epoch": 0.9449183419777959, "grad_norm": 0.171875, "learning_rate": 5.43706520730148e-05, "loss": 0.0363, "step": 7348 }, { "epoch": 0.9450469372883535, "grad_norm": 0.1728515625, "learning_rate": 5.4360583887279646e-05, "loss": 0.0404, "step": 7349 }, { "epoch": 0.9451755325989112, "grad_norm": 0.16015625, "learning_rate": 5.4350515523376124e-05, "loss": 0.0332, "step": 7350 }, { "epoch": 0.9453041279094689, "grad_norm": 0.1708984375, "learning_rate": 5.434044698171563e-05, "loss": 0.0399, "step": 7351 }, { "epoch": 0.9454327232200266, "grad_norm": 0.1904296875, "learning_rate": 5.433037826270955e-05, "loss": 0.0524, "step": 7352 }, { "epoch": 0.9455613185305842, "grad_norm": 0.15625, "learning_rate": 5.4320309366769264e-05, "loss": 0.038, "step": 7353 }, { "epoch": 0.9456899138411419, "grad_norm": 0.1953125, "learning_rate": 5.43102402943062e-05, "loss": 0.0481, "step": 7354 }, { "epoch": 0.9458185091516996, "grad_norm": 0.1640625, "learning_rate": 5.430017104573175e-05, "loss": 0.036, "step": 7355 }, { "epoch": 0.9459471044622573, "grad_norm": 0.1796875, "learning_rate": 5.4290101621457346e-05, "loss": 0.0416, "step": 7356 }, { "epoch": 0.9460756997728149, "grad_norm": 0.150390625, "learning_rate": 5.42800320218944e-05, "loss": 0.0332, "step": 7357 }, { "epoch": 0.9462042950833727, "grad_norm": 0.1748046875, "learning_rate": 5.4269962247454357e-05, "loss": 0.046, "step": 7358 }, { "epoch": 0.9463328903939303, "grad_norm": 0.1845703125, "learning_rate": 5.4259892298548645e-05, "loss": 0.0425, "step": 7359 }, { "epoch": 0.946461485704488, "grad_norm": 0.189453125, "learning_rate": 5.4249822175588714e-05, "loss": 0.0487, "step": 7360 }, { "epoch": 0.9465900810150456, "grad_norm": 0.1611328125, "learning_rate": 5.423975187898602e-05, "loss": 0.0334, "step": 7361 }, { "epoch": 0.9467186763256034, "grad_norm": 0.1572265625, "learning_rate": 5.422968140915202e-05, "loss": 0.0387, "step": 7362 }, { "epoch": 0.946847271636161, "grad_norm": 0.173828125, "learning_rate": 5.4219610766498185e-05, "loss": 0.044, "step": 7363 }, { "epoch": 0.9469758669467186, "grad_norm": 0.1923828125, "learning_rate": 5.420953995143598e-05, "loss": 0.0516, "step": 7364 }, { "epoch": 0.9471044622572764, "grad_norm": 0.1708984375, "learning_rate": 5.41994689643769e-05, "loss": 0.0347, "step": 7365 }, { "epoch": 0.947233057567834, "grad_norm": 0.1552734375, "learning_rate": 5.418939780573243e-05, "loss": 0.0382, "step": 7366 }, { "epoch": 0.9473616528783917, "grad_norm": 0.21484375, "learning_rate": 5.417932647591406e-05, "loss": 0.041, "step": 7367 }, { "epoch": 0.9474902481889493, "grad_norm": 0.1630859375, "learning_rate": 5.4169254975333296e-05, "loss": 0.0367, "step": 7368 }, { "epoch": 0.9476188434995071, "grad_norm": 0.2060546875, "learning_rate": 5.4159183304401654e-05, "loss": 0.0405, "step": 7369 }, { "epoch": 0.9477474388100647, "grad_norm": 0.1787109375, "learning_rate": 5.4149111463530645e-05, "loss": 0.0408, "step": 7370 }, { "epoch": 0.9478760341206224, "grad_norm": 0.1806640625, "learning_rate": 5.413903945313178e-05, "loss": 0.0391, "step": 7371 }, { "epoch": 0.94800462943118, "grad_norm": 0.150390625, "learning_rate": 5.4128967273616625e-05, "loss": 0.034, "step": 7372 }, { "epoch": 0.9481332247417378, "grad_norm": 0.1708984375, "learning_rate": 5.411889492539669e-05, "loss": 0.0397, "step": 7373 }, { "epoch": 0.9482618200522954, "grad_norm": 0.1572265625, "learning_rate": 5.410882240888351e-05, "loss": 0.0318, "step": 7374 }, { "epoch": 0.9483904153628531, "grad_norm": 0.1884765625, "learning_rate": 5.4098749724488664e-05, "loss": 0.0518, "step": 7375 }, { "epoch": 0.9485190106734108, "grad_norm": 0.1796875, "learning_rate": 5.408867687262368e-05, "loss": 0.0438, "step": 7376 }, { "epoch": 0.9486476059839685, "grad_norm": 0.1806640625, "learning_rate": 5.407860385370016e-05, "loss": 0.0409, "step": 7377 }, { "epoch": 0.9487762012945261, "grad_norm": 0.19921875, "learning_rate": 5.4068530668129636e-05, "loss": 0.0467, "step": 7378 }, { "epoch": 0.9489047966050838, "grad_norm": 0.17578125, "learning_rate": 5.405845731632371e-05, "loss": 0.0422, "step": 7379 }, { "epoch": 0.9490333919156415, "grad_norm": 0.18359375, "learning_rate": 5.404838379869397e-05, "loss": 0.0476, "step": 7380 }, { "epoch": 0.9491619872261992, "grad_norm": 0.1484375, "learning_rate": 5.403831011565199e-05, "loss": 0.0325, "step": 7381 }, { "epoch": 0.9492905825367568, "grad_norm": 0.19140625, "learning_rate": 5.40282362676094e-05, "loss": 0.0389, "step": 7382 }, { "epoch": 0.9494191778473146, "grad_norm": 0.1552734375, "learning_rate": 5.401816225497777e-05, "loss": 0.0346, "step": 7383 }, { "epoch": 0.9495477731578722, "grad_norm": 0.1708984375, "learning_rate": 5.4008088078168736e-05, "loss": 0.0445, "step": 7384 }, { "epoch": 0.9496763684684298, "grad_norm": 0.1650390625, "learning_rate": 5.399801373759392e-05, "loss": 0.0382, "step": 7385 }, { "epoch": 0.9498049637789875, "grad_norm": 0.173828125, "learning_rate": 5.398793923366492e-05, "loss": 0.0367, "step": 7386 }, { "epoch": 0.9499335590895452, "grad_norm": 0.16015625, "learning_rate": 5.397786456679339e-05, "loss": 0.0345, "step": 7387 }, { "epoch": 0.9500621544001029, "grad_norm": 0.205078125, "learning_rate": 5.396778973739096e-05, "loss": 0.0556, "step": 7388 }, { "epoch": 0.9501907497106605, "grad_norm": 0.1572265625, "learning_rate": 5.3957714745869294e-05, "loss": 0.0377, "step": 7389 }, { "epoch": 0.9503193450212182, "grad_norm": 0.1650390625, "learning_rate": 5.394763959264002e-05, "loss": 0.0438, "step": 7390 }, { "epoch": 0.9504479403317759, "grad_norm": 0.150390625, "learning_rate": 5.39375642781148e-05, "loss": 0.0369, "step": 7391 }, { "epoch": 0.9505765356423336, "grad_norm": 0.1474609375, "learning_rate": 5.392748880270533e-05, "loss": 0.0326, "step": 7392 }, { "epoch": 0.9507051309528912, "grad_norm": 0.1650390625, "learning_rate": 5.3917413166823236e-05, "loss": 0.0409, "step": 7393 }, { "epoch": 0.950833726263449, "grad_norm": 0.181640625, "learning_rate": 5.390733737088024e-05, "loss": 0.0464, "step": 7394 }, { "epoch": 0.9509623215740066, "grad_norm": 0.1953125, "learning_rate": 5.389726141528799e-05, "loss": 0.0474, "step": 7395 }, { "epoch": 0.9510909168845643, "grad_norm": 0.14453125, "learning_rate": 5.388718530045822e-05, "loss": 0.0279, "step": 7396 }, { "epoch": 0.9512195121951219, "grad_norm": 0.1826171875, "learning_rate": 5.38771090268026e-05, "loss": 0.0496, "step": 7397 }, { "epoch": 0.9513481075056797, "grad_norm": 0.16796875, "learning_rate": 5.386703259473281e-05, "loss": 0.0454, "step": 7398 }, { "epoch": 0.9514767028162373, "grad_norm": 0.1640625, "learning_rate": 5.385695600466062e-05, "loss": 0.0378, "step": 7399 }, { "epoch": 0.951605298126795, "grad_norm": 0.1630859375, "learning_rate": 5.3846879256997694e-05, "loss": 0.0351, "step": 7400 }, { "epoch": 0.9517338934373526, "grad_norm": 0.181640625, "learning_rate": 5.383680235215579e-05, "loss": 0.046, "step": 7401 }, { "epoch": 0.9518624887479104, "grad_norm": 0.181640625, "learning_rate": 5.382672529054663e-05, "loss": 0.0341, "step": 7402 }, { "epoch": 0.951991084058468, "grad_norm": 0.158203125, "learning_rate": 5.3816648072581935e-05, "loss": 0.0365, "step": 7403 }, { "epoch": 0.9521196793690256, "grad_norm": 0.1728515625, "learning_rate": 5.3806570698673485e-05, "loss": 0.0403, "step": 7404 }, { "epoch": 0.9522482746795834, "grad_norm": 0.1728515625, "learning_rate": 5.379649316923299e-05, "loss": 0.0437, "step": 7405 }, { "epoch": 0.952376869990141, "grad_norm": 0.169921875, "learning_rate": 5.3786415484672224e-05, "loss": 0.0342, "step": 7406 }, { "epoch": 0.9525054653006987, "grad_norm": 0.181640625, "learning_rate": 5.377633764540295e-05, "loss": 0.0419, "step": 7407 }, { "epoch": 0.9526340606112563, "grad_norm": 0.171875, "learning_rate": 5.376625965183695e-05, "loss": 0.0325, "step": 7408 }, { "epoch": 0.9527626559218141, "grad_norm": 0.1748046875, "learning_rate": 5.375618150438597e-05, "loss": 0.0466, "step": 7409 }, { "epoch": 0.9528912512323717, "grad_norm": 0.19140625, "learning_rate": 5.3746103203461805e-05, "loss": 0.0493, "step": 7410 }, { "epoch": 0.9530198465429294, "grad_norm": 0.193359375, "learning_rate": 5.3736024749476254e-05, "loss": 0.0389, "step": 7411 }, { "epoch": 0.9531484418534871, "grad_norm": 0.1748046875, "learning_rate": 5.372594614284109e-05, "loss": 0.0405, "step": 7412 }, { "epoch": 0.9532770371640448, "grad_norm": 0.16796875, "learning_rate": 5.3715867383968134e-05, "loss": 0.036, "step": 7413 }, { "epoch": 0.9534056324746024, "grad_norm": 0.169921875, "learning_rate": 5.370578847326916e-05, "loss": 0.0417, "step": 7414 }, { "epoch": 0.9535342277851601, "grad_norm": 0.17578125, "learning_rate": 5.369570941115601e-05, "loss": 0.0366, "step": 7415 }, { "epoch": 0.9536628230957178, "grad_norm": 0.1748046875, "learning_rate": 5.3685630198040506e-05, "loss": 0.0402, "step": 7416 }, { "epoch": 0.9537914184062755, "grad_norm": 0.1640625, "learning_rate": 5.367555083433444e-05, "loss": 0.0386, "step": 7417 }, { "epoch": 0.9539200137168331, "grad_norm": 0.1689453125, "learning_rate": 5.366547132044969e-05, "loss": 0.0383, "step": 7418 }, { "epoch": 0.9540486090273907, "grad_norm": 0.169921875, "learning_rate": 5.365539165679805e-05, "loss": 0.0374, "step": 7419 }, { "epoch": 0.9541772043379485, "grad_norm": 0.1845703125, "learning_rate": 5.364531184379139e-05, "loss": 0.0328, "step": 7420 }, { "epoch": 0.9543057996485061, "grad_norm": 0.162109375, "learning_rate": 5.363523188184155e-05, "loss": 0.0377, "step": 7421 }, { "epoch": 0.9544343949590638, "grad_norm": 0.171875, "learning_rate": 5.362515177136038e-05, "loss": 0.0425, "step": 7422 }, { "epoch": 0.9545629902696215, "grad_norm": 0.1728515625, "learning_rate": 5.3615071512759765e-05, "loss": 0.0381, "step": 7423 }, { "epoch": 0.9546915855801792, "grad_norm": 0.162109375, "learning_rate": 5.360499110645153e-05, "loss": 0.034, "step": 7424 }, { "epoch": 0.9548201808907368, "grad_norm": 0.1650390625, "learning_rate": 5.359491055284759e-05, "loss": 0.042, "step": 7425 }, { "epoch": 0.9549487762012945, "grad_norm": 0.1728515625, "learning_rate": 5.35848298523598e-05, "loss": 0.0424, "step": 7426 }, { "epoch": 0.9550773715118522, "grad_norm": 0.18359375, "learning_rate": 5.357474900540006e-05, "loss": 0.0447, "step": 7427 }, { "epoch": 0.9552059668224099, "grad_norm": 0.197265625, "learning_rate": 5.356466801238024e-05, "loss": 0.0355, "step": 7428 }, { "epoch": 0.9553345621329675, "grad_norm": 0.181640625, "learning_rate": 5.355458687371226e-05, "loss": 0.0388, "step": 7429 }, { "epoch": 0.9554631574435253, "grad_norm": 0.1826171875, "learning_rate": 5.3544505589808024e-05, "loss": 0.0452, "step": 7430 }, { "epoch": 0.9555917527540829, "grad_norm": 0.181640625, "learning_rate": 5.353442416107942e-05, "loss": 0.0422, "step": 7431 }, { "epoch": 0.9557203480646406, "grad_norm": 0.1533203125, "learning_rate": 5.3524342587938417e-05, "loss": 0.0343, "step": 7432 }, { "epoch": 0.9558489433751982, "grad_norm": 0.1875, "learning_rate": 5.351426087079685e-05, "loss": 0.0434, "step": 7433 }, { "epoch": 0.955977538685756, "grad_norm": 0.1708984375, "learning_rate": 5.35041790100667e-05, "loss": 0.0342, "step": 7434 }, { "epoch": 0.9561061339963136, "grad_norm": 0.1728515625, "learning_rate": 5.3494097006159913e-05, "loss": 0.0377, "step": 7435 }, { "epoch": 0.9562347293068713, "grad_norm": 0.1455078125, "learning_rate": 5.3484014859488394e-05, "loss": 0.0287, "step": 7436 }, { "epoch": 0.9563633246174289, "grad_norm": 0.177734375, "learning_rate": 5.347393257046411e-05, "loss": 0.0394, "step": 7437 }, { "epoch": 0.9564919199279867, "grad_norm": 0.189453125, "learning_rate": 5.346385013949899e-05, "loss": 0.0479, "step": 7438 }, { "epoch": 0.9566205152385443, "grad_norm": 0.1904296875, "learning_rate": 5.3453767567005016e-05, "loss": 0.0447, "step": 7439 }, { "epoch": 0.9567491105491019, "grad_norm": 0.1767578125, "learning_rate": 5.3443684853394125e-05, "loss": 0.044, "step": 7440 }, { "epoch": 0.9568777058596597, "grad_norm": 0.169921875, "learning_rate": 5.34336019990783e-05, "loss": 0.039, "step": 7441 }, { "epoch": 0.9570063011702173, "grad_norm": 0.16015625, "learning_rate": 5.342351900446952e-05, "loss": 0.0385, "step": 7442 }, { "epoch": 0.957134896480775, "grad_norm": 0.1669921875, "learning_rate": 5.341343586997977e-05, "loss": 0.0364, "step": 7443 }, { "epoch": 0.9572634917913326, "grad_norm": 0.1630859375, "learning_rate": 5.340335259602101e-05, "loss": 0.0369, "step": 7444 }, { "epoch": 0.9573920871018904, "grad_norm": 0.1474609375, "learning_rate": 5.339326918300523e-05, "loss": 0.0335, "step": 7445 }, { "epoch": 0.957520682412448, "grad_norm": 0.1640625, "learning_rate": 5.338318563134444e-05, "loss": 0.0388, "step": 7446 }, { "epoch": 0.9576492777230057, "grad_norm": 0.166015625, "learning_rate": 5.337310194145064e-05, "loss": 0.039, "step": 7447 }, { "epoch": 0.9577778730335633, "grad_norm": 0.1923828125, "learning_rate": 5.3363018113735843e-05, "loss": 0.0472, "step": 7448 }, { "epoch": 0.9579064683441211, "grad_norm": 0.171875, "learning_rate": 5.335293414861207e-05, "loss": 0.0369, "step": 7449 }, { "epoch": 0.9580350636546787, "grad_norm": 0.1982421875, "learning_rate": 5.334285004649131e-05, "loss": 0.0425, "step": 7450 }, { "epoch": 0.9581636589652364, "grad_norm": 0.18359375, "learning_rate": 5.333276580778561e-05, "loss": 0.0462, "step": 7451 }, { "epoch": 0.9582922542757941, "grad_norm": 0.1650390625, "learning_rate": 5.3322681432907e-05, "loss": 0.0333, "step": 7452 }, { "epoch": 0.9584208495863518, "grad_norm": 0.1796875, "learning_rate": 5.33125969222675e-05, "loss": 0.0394, "step": 7453 }, { "epoch": 0.9585494448969094, "grad_norm": 0.177734375, "learning_rate": 5.330251227627918e-05, "loss": 0.0336, "step": 7454 }, { "epoch": 0.958678040207467, "grad_norm": 0.1845703125, "learning_rate": 5.329242749535407e-05, "loss": 0.0444, "step": 7455 }, { "epoch": 0.9588066355180248, "grad_norm": 0.171875, "learning_rate": 5.328234257990422e-05, "loss": 0.0412, "step": 7456 }, { "epoch": 0.9589352308285825, "grad_norm": 0.1640625, "learning_rate": 5.3272257530341684e-05, "loss": 0.034, "step": 7457 }, { "epoch": 0.9590638261391401, "grad_norm": 0.1708984375, "learning_rate": 5.326217234707852e-05, "loss": 0.0411, "step": 7458 }, { "epoch": 0.9591924214496979, "grad_norm": 0.162109375, "learning_rate": 5.325208703052683e-05, "loss": 0.0384, "step": 7459 }, { "epoch": 0.9593210167602555, "grad_norm": 0.1796875, "learning_rate": 5.324200158109866e-05, "loss": 0.0386, "step": 7460 }, { "epoch": 0.9594496120708131, "grad_norm": 0.18359375, "learning_rate": 5.3231915999206084e-05, "loss": 0.045, "step": 7461 }, { "epoch": 0.9595782073813708, "grad_norm": 0.1796875, "learning_rate": 5.322183028526121e-05, "loss": 0.0481, "step": 7462 }, { "epoch": 0.9597068026919285, "grad_norm": 0.1796875, "learning_rate": 5.321174443967611e-05, "loss": 0.0476, "step": 7463 }, { "epoch": 0.9598353980024862, "grad_norm": 0.1669921875, "learning_rate": 5.320165846286289e-05, "loss": 0.0369, "step": 7464 }, { "epoch": 0.9599639933130438, "grad_norm": 0.19140625, "learning_rate": 5.319157235523366e-05, "loss": 0.0501, "step": 7465 }, { "epoch": 0.9600925886236015, "grad_norm": 0.1630859375, "learning_rate": 5.318148611720051e-05, "loss": 0.0381, "step": 7466 }, { "epoch": 0.9602211839341592, "grad_norm": 0.1650390625, "learning_rate": 5.317139974917556e-05, "loss": 0.0346, "step": 7467 }, { "epoch": 0.9603497792447169, "grad_norm": 0.154296875, "learning_rate": 5.3161313251570924e-05, "loss": 0.0359, "step": 7468 }, { "epoch": 0.9604783745552745, "grad_norm": 0.1552734375, "learning_rate": 5.3151226624798713e-05, "loss": 0.0348, "step": 7469 }, { "epoch": 0.9606069698658323, "grad_norm": 0.1611328125, "learning_rate": 5.3141139869271064e-05, "loss": 0.0415, "step": 7470 }, { "epoch": 0.9607355651763899, "grad_norm": 0.1572265625, "learning_rate": 5.313105298540012e-05, "loss": 0.0338, "step": 7471 }, { "epoch": 0.9608641604869476, "grad_norm": 0.17578125, "learning_rate": 5.312096597359801e-05, "loss": 0.0388, "step": 7472 }, { "epoch": 0.9609927557975052, "grad_norm": 0.14453125, "learning_rate": 5.311087883427689e-05, "loss": 0.0283, "step": 7473 }, { "epoch": 0.961121351108063, "grad_norm": 0.1962890625, "learning_rate": 5.310079156784888e-05, "loss": 0.0378, "step": 7474 }, { "epoch": 0.9612499464186206, "grad_norm": 0.1708984375, "learning_rate": 5.309070417472617e-05, "loss": 0.0365, "step": 7475 }, { "epoch": 0.9613785417291782, "grad_norm": 0.1962890625, "learning_rate": 5.308061665532089e-05, "loss": 0.0446, "step": 7476 }, { "epoch": 0.961507137039736, "grad_norm": 0.1875, "learning_rate": 5.307052901004522e-05, "loss": 0.043, "step": 7477 }, { "epoch": 0.9616357323502936, "grad_norm": 0.18359375, "learning_rate": 5.306044123931132e-05, "loss": 0.0468, "step": 7478 }, { "epoch": 0.9617643276608513, "grad_norm": 0.197265625, "learning_rate": 5.305035334353138e-05, "loss": 0.046, "step": 7479 }, { "epoch": 0.9618929229714089, "grad_norm": 0.166015625, "learning_rate": 5.304026532311757e-05, "loss": 0.0352, "step": 7480 }, { "epoch": 0.9620215182819667, "grad_norm": 0.1904296875, "learning_rate": 5.303017717848206e-05, "loss": 0.051, "step": 7481 }, { "epoch": 0.9621501135925243, "grad_norm": 0.1552734375, "learning_rate": 5.302008891003707e-05, "loss": 0.0351, "step": 7482 }, { "epoch": 0.962278708903082, "grad_norm": 0.146484375, "learning_rate": 5.3010000518194756e-05, "loss": 0.0308, "step": 7483 }, { "epoch": 0.9624073042136396, "grad_norm": 0.1728515625, "learning_rate": 5.299991200336735e-05, "loss": 0.0405, "step": 7484 }, { "epoch": 0.9625358995241974, "grad_norm": 0.1826171875, "learning_rate": 5.298982336596705e-05, "loss": 0.0397, "step": 7485 }, { "epoch": 0.962664494834755, "grad_norm": 0.203125, "learning_rate": 5.297973460640606e-05, "loss": 0.0432, "step": 7486 }, { "epoch": 0.9627930901453127, "grad_norm": 0.162109375, "learning_rate": 5.29696457250966e-05, "loss": 0.0375, "step": 7487 }, { "epoch": 0.9629216854558704, "grad_norm": 0.17578125, "learning_rate": 5.295955672245089e-05, "loss": 0.0427, "step": 7488 }, { "epoch": 0.9630502807664281, "grad_norm": 0.1787109375, "learning_rate": 5.2949467598881155e-05, "loss": 0.045, "step": 7489 }, { "epoch": 0.9631788760769857, "grad_norm": 0.19140625, "learning_rate": 5.293937835479961e-05, "loss": 0.0487, "step": 7490 }, { "epoch": 0.9633074713875434, "grad_norm": 0.1552734375, "learning_rate": 5.292928899061852e-05, "loss": 0.0344, "step": 7491 }, { "epoch": 0.9634360666981011, "grad_norm": 0.1708984375, "learning_rate": 5.2919199506750095e-05, "loss": 0.0373, "step": 7492 }, { "epoch": 0.9635646620086588, "grad_norm": 0.185546875, "learning_rate": 5.290910990360659e-05, "loss": 0.041, "step": 7493 }, { "epoch": 0.9636932573192164, "grad_norm": 0.2470703125, "learning_rate": 5.289902018160027e-05, "loss": 0.0409, "step": 7494 }, { "epoch": 0.9638218526297742, "grad_norm": 0.1767578125, "learning_rate": 5.288893034114335e-05, "loss": 0.043, "step": 7495 }, { "epoch": 0.9639504479403318, "grad_norm": 0.1474609375, "learning_rate": 5.287884038264813e-05, "loss": 0.03, "step": 7496 }, { "epoch": 0.9640790432508894, "grad_norm": 0.1884765625, "learning_rate": 5.286875030652686e-05, "loss": 0.0438, "step": 7497 }, { "epoch": 0.9642076385614471, "grad_norm": 0.171875, "learning_rate": 5.285866011319179e-05, "loss": 0.04, "step": 7498 }, { "epoch": 0.9643362338720048, "grad_norm": 0.1494140625, "learning_rate": 5.2848569803055237e-05, "loss": 0.0309, "step": 7499 }, { "epoch": 0.9644648291825625, "grad_norm": 0.1494140625, "learning_rate": 5.283847937652942e-05, "loss": 0.0362, "step": 7500 }, { "epoch": 0.9644648291825625, "eval_loss": 0.038862455636262894, "eval_runtime": 1042.0076, "eval_samples_per_second": 94.266, "eval_steps_per_second": 1.178, "step": 7500 }, { "epoch": 0.9645934244931201, "grad_norm": 0.1650390625, "learning_rate": 5.2828388834026686e-05, "loss": 0.0357, "step": 7501 }, { "epoch": 0.9647220198036778, "grad_norm": 0.1689453125, "learning_rate": 5.281829817595928e-05, "loss": 0.0412, "step": 7502 }, { "epoch": 0.9648506151142355, "grad_norm": 0.189453125, "learning_rate": 5.280820740273951e-05, "loss": 0.0372, "step": 7503 }, { "epoch": 0.9649792104247932, "grad_norm": 0.158203125, "learning_rate": 5.279811651477966e-05, "loss": 0.0342, "step": 7504 }, { "epoch": 0.9651078057353508, "grad_norm": 0.173828125, "learning_rate": 5.2788025512492045e-05, "loss": 0.0421, "step": 7505 }, { "epoch": 0.9652364010459086, "grad_norm": 0.1552734375, "learning_rate": 5.277793439628897e-05, "loss": 0.0332, "step": 7506 }, { "epoch": 0.9653649963564662, "grad_norm": 0.1748046875, "learning_rate": 5.276784316658273e-05, "loss": 0.0312, "step": 7507 }, { "epoch": 0.9654935916670239, "grad_norm": 0.181640625, "learning_rate": 5.2757751823785665e-05, "loss": 0.0424, "step": 7508 }, { "epoch": 0.9656221869775815, "grad_norm": 0.16796875, "learning_rate": 5.274766036831008e-05, "loss": 0.0407, "step": 7509 }, { "epoch": 0.9657507822881393, "grad_norm": 0.16015625, "learning_rate": 5.2737568800568305e-05, "loss": 0.0301, "step": 7510 }, { "epoch": 0.9658793775986969, "grad_norm": 0.1630859375, "learning_rate": 5.272747712097268e-05, "loss": 0.035, "step": 7511 }, { "epoch": 0.9660079729092546, "grad_norm": 0.19140625, "learning_rate": 5.271738532993552e-05, "loss": 0.0501, "step": 7512 }, { "epoch": 0.9661365682198122, "grad_norm": 0.1689453125, "learning_rate": 5.2707293427869176e-05, "loss": 0.0348, "step": 7513 }, { "epoch": 0.96626516353037, "grad_norm": 0.15625, "learning_rate": 5.269720141518598e-05, "loss": 0.0355, "step": 7514 }, { "epoch": 0.9663937588409276, "grad_norm": 0.162109375, "learning_rate": 5.268710929229831e-05, "loss": 0.0322, "step": 7515 }, { "epoch": 0.9665223541514852, "grad_norm": 0.138671875, "learning_rate": 5.2677017059618486e-05, "loss": 0.0262, "step": 7516 }, { "epoch": 0.966650949462043, "grad_norm": 0.1552734375, "learning_rate": 5.266692471755886e-05, "loss": 0.0311, "step": 7517 }, { "epoch": 0.9667795447726006, "grad_norm": 0.185546875, "learning_rate": 5.265683226653182e-05, "loss": 0.0376, "step": 7518 }, { "epoch": 0.9669081400831583, "grad_norm": 0.1865234375, "learning_rate": 5.264673970694971e-05, "loss": 0.037, "step": 7519 }, { "epoch": 0.9670367353937159, "grad_norm": 0.162109375, "learning_rate": 5.263664703922493e-05, "loss": 0.0344, "step": 7520 }, { "epoch": 0.9671653307042737, "grad_norm": 0.18359375, "learning_rate": 5.262655426376981e-05, "loss": 0.0462, "step": 7521 }, { "epoch": 0.9672939260148313, "grad_norm": 0.1513671875, "learning_rate": 5.261646138099676e-05, "loss": 0.0314, "step": 7522 }, { "epoch": 0.967422521325389, "grad_norm": 0.1748046875, "learning_rate": 5.2606368391318174e-05, "loss": 0.0374, "step": 7523 }, { "epoch": 0.9675511166359467, "grad_norm": 0.1689453125, "learning_rate": 5.2596275295146416e-05, "loss": 0.0362, "step": 7524 }, { "epoch": 0.9676797119465044, "grad_norm": 0.185546875, "learning_rate": 5.2586182092893885e-05, "loss": 0.0459, "step": 7525 }, { "epoch": 0.967808307257062, "grad_norm": 0.1787109375, "learning_rate": 5.257608878497298e-05, "loss": 0.0391, "step": 7526 }, { "epoch": 0.9679369025676197, "grad_norm": 0.18359375, "learning_rate": 5.2565995371796084e-05, "loss": 0.0403, "step": 7527 }, { "epoch": 0.9680654978781774, "grad_norm": 0.1689453125, "learning_rate": 5.255590185377563e-05, "loss": 0.0398, "step": 7528 }, { "epoch": 0.9681940931887351, "grad_norm": 0.2119140625, "learning_rate": 5.254580823132401e-05, "loss": 0.0489, "step": 7529 }, { "epoch": 0.9683226884992927, "grad_norm": 0.1669921875, "learning_rate": 5.253571450485365e-05, "loss": 0.0366, "step": 7530 }, { "epoch": 0.9684512838098503, "grad_norm": 0.17578125, "learning_rate": 5.252562067477695e-05, "loss": 0.0373, "step": 7531 }, { "epoch": 0.9685798791204081, "grad_norm": 0.16796875, "learning_rate": 5.251552674150635e-05, "loss": 0.039, "step": 7532 }, { "epoch": 0.9687084744309657, "grad_norm": 0.181640625, "learning_rate": 5.250543270545426e-05, "loss": 0.0412, "step": 7533 }, { "epoch": 0.9688370697415234, "grad_norm": 0.1962890625, "learning_rate": 5.249533856703312e-05, "loss": 0.0468, "step": 7534 }, { "epoch": 0.9689656650520811, "grad_norm": 0.20703125, "learning_rate": 5.248524432665537e-05, "loss": 0.0564, "step": 7535 }, { "epoch": 0.9690942603626388, "grad_norm": 0.1943359375, "learning_rate": 5.247514998473344e-05, "loss": 0.0423, "step": 7536 }, { "epoch": 0.9692228556731964, "grad_norm": 0.1767578125, "learning_rate": 5.2465055541679776e-05, "loss": 0.0384, "step": 7537 }, { "epoch": 0.9693514509837541, "grad_norm": 0.1787109375, "learning_rate": 5.245496099790683e-05, "loss": 0.0405, "step": 7538 }, { "epoch": 0.9694800462943118, "grad_norm": 0.162109375, "learning_rate": 5.244486635382704e-05, "loss": 0.0372, "step": 7539 }, { "epoch": 0.9696086416048695, "grad_norm": 0.2109375, "learning_rate": 5.243477160985288e-05, "loss": 0.0603, "step": 7540 }, { "epoch": 0.9697372369154271, "grad_norm": 0.181640625, "learning_rate": 5.242467676639679e-05, "loss": 0.0455, "step": 7541 }, { "epoch": 0.9698658322259849, "grad_norm": 0.1748046875, "learning_rate": 5.2414581823871244e-05, "loss": 0.0395, "step": 7542 }, { "epoch": 0.9699944275365425, "grad_norm": 0.19921875, "learning_rate": 5.24044867826887e-05, "loss": 0.0529, "step": 7543 }, { "epoch": 0.9701230228471002, "grad_norm": 0.1689453125, "learning_rate": 5.239439164326165e-05, "loss": 0.0408, "step": 7544 }, { "epoch": 0.9702516181576578, "grad_norm": 0.2041015625, "learning_rate": 5.2384296406002544e-05, "loss": 0.0461, "step": 7545 }, { "epoch": 0.9703802134682156, "grad_norm": 0.16796875, "learning_rate": 5.237420107132388e-05, "loss": 0.0404, "step": 7546 }, { "epoch": 0.9705088087787732, "grad_norm": 0.173828125, "learning_rate": 5.236410563963814e-05, "loss": 0.0383, "step": 7547 }, { "epoch": 0.9706374040893309, "grad_norm": 0.1474609375, "learning_rate": 5.235401011135781e-05, "loss": 0.0326, "step": 7548 }, { "epoch": 0.9707659993998885, "grad_norm": 0.158203125, "learning_rate": 5.234391448689537e-05, "loss": 0.0375, "step": 7549 }, { "epoch": 0.9708945947104463, "grad_norm": 0.189453125, "learning_rate": 5.2333818766663344e-05, "loss": 0.0361, "step": 7550 }, { "epoch": 0.9710231900210039, "grad_norm": 0.16015625, "learning_rate": 5.2323722951074196e-05, "loss": 0.0363, "step": 7551 }, { "epoch": 0.9711517853315615, "grad_norm": 0.15625, "learning_rate": 5.231362704054045e-05, "loss": 0.03, "step": 7552 }, { "epoch": 0.9712803806421193, "grad_norm": 0.19921875, "learning_rate": 5.23035310354746e-05, "loss": 0.0493, "step": 7553 }, { "epoch": 0.971408975952677, "grad_norm": 0.1708984375, "learning_rate": 5.2293434936289176e-05, "loss": 0.0439, "step": 7554 }, { "epoch": 0.9715375712632346, "grad_norm": 0.1767578125, "learning_rate": 5.228333874339667e-05, "loss": 0.0468, "step": 7555 }, { "epoch": 0.9716661665737922, "grad_norm": 0.1728515625, "learning_rate": 5.227324245720962e-05, "loss": 0.0406, "step": 7556 }, { "epoch": 0.97179476188435, "grad_norm": 0.1728515625, "learning_rate": 5.2263146078140535e-05, "loss": 0.0379, "step": 7557 }, { "epoch": 0.9719233571949076, "grad_norm": 0.173828125, "learning_rate": 5.2253049606601944e-05, "loss": 0.0387, "step": 7558 }, { "epoch": 0.9720519525054653, "grad_norm": 0.1591796875, "learning_rate": 5.22429530430064e-05, "loss": 0.0393, "step": 7559 }, { "epoch": 0.9721805478160229, "grad_norm": 0.1611328125, "learning_rate": 5.2232856387766403e-05, "loss": 0.0335, "step": 7560 }, { "epoch": 0.9723091431265807, "grad_norm": 0.1533203125, "learning_rate": 5.2222759641294504e-05, "loss": 0.0413, "step": 7561 }, { "epoch": 0.9724377384371383, "grad_norm": 0.1552734375, "learning_rate": 5.2212662804003265e-05, "loss": 0.0387, "step": 7562 }, { "epoch": 0.972566333747696, "grad_norm": 0.1875, "learning_rate": 5.2202565876305184e-05, "loss": 0.0361, "step": 7563 }, { "epoch": 0.9726949290582537, "grad_norm": 0.1640625, "learning_rate": 5.2192468858612855e-05, "loss": 0.0345, "step": 7564 }, { "epoch": 0.9728235243688114, "grad_norm": 0.1669921875, "learning_rate": 5.21823717513388e-05, "loss": 0.0385, "step": 7565 }, { "epoch": 0.972952119679369, "grad_norm": 0.1591796875, "learning_rate": 5.21722745548956e-05, "loss": 0.0371, "step": 7566 }, { "epoch": 0.9730807149899267, "grad_norm": 0.171875, "learning_rate": 5.2162177269695775e-05, "loss": 0.0401, "step": 7567 }, { "epoch": 0.9732093103004844, "grad_norm": 0.1806640625, "learning_rate": 5.215207989615195e-05, "loss": 0.0372, "step": 7568 }, { "epoch": 0.973337905611042, "grad_norm": 0.15625, "learning_rate": 5.214198243467663e-05, "loss": 0.0335, "step": 7569 }, { "epoch": 0.9734665009215997, "grad_norm": 0.1630859375, "learning_rate": 5.213188488568243e-05, "loss": 0.0403, "step": 7570 }, { "epoch": 0.9735950962321575, "grad_norm": 0.1533203125, "learning_rate": 5.212178724958189e-05, "loss": 0.0369, "step": 7571 }, { "epoch": 0.9737236915427151, "grad_norm": 0.177734375, "learning_rate": 5.211168952678761e-05, "loss": 0.0468, "step": 7572 }, { "epoch": 0.9738522868532727, "grad_norm": 0.1826171875, "learning_rate": 5.210159171771217e-05, "loss": 0.0425, "step": 7573 }, { "epoch": 0.9739808821638304, "grad_norm": 0.1904296875, "learning_rate": 5.2091493822768145e-05, "loss": 0.0447, "step": 7574 }, { "epoch": 0.9741094774743881, "grad_norm": 0.1669921875, "learning_rate": 5.208139584236814e-05, "loss": 0.0354, "step": 7575 }, { "epoch": 0.9742380727849458, "grad_norm": 0.181640625, "learning_rate": 5.2071297776924724e-05, "loss": 0.0439, "step": 7576 }, { "epoch": 0.9743666680955034, "grad_norm": 0.1796875, "learning_rate": 5.2061199626850486e-05, "loss": 0.0422, "step": 7577 }, { "epoch": 0.9744952634060611, "grad_norm": 0.185546875, "learning_rate": 5.2051101392558066e-05, "loss": 0.0394, "step": 7578 }, { "epoch": 0.9746238587166188, "grad_norm": 0.1650390625, "learning_rate": 5.2041003074460024e-05, "loss": 0.0374, "step": 7579 }, { "epoch": 0.9747524540271765, "grad_norm": 0.1640625, "learning_rate": 5.2030904672969006e-05, "loss": 0.0414, "step": 7580 }, { "epoch": 0.9748810493377341, "grad_norm": 0.154296875, "learning_rate": 5.202080618849757e-05, "loss": 0.0301, "step": 7581 }, { "epoch": 0.9750096446482919, "grad_norm": 0.169921875, "learning_rate": 5.2010707621458376e-05, "loss": 0.0371, "step": 7582 }, { "epoch": 0.9751382399588495, "grad_norm": 0.1669921875, "learning_rate": 5.200060897226401e-05, "loss": 0.042, "step": 7583 }, { "epoch": 0.9752668352694072, "grad_norm": 0.2099609375, "learning_rate": 5.19905102413271e-05, "loss": 0.0466, "step": 7584 }, { "epoch": 0.9753954305799648, "grad_norm": 0.1875, "learning_rate": 5.198041142906027e-05, "loss": 0.0455, "step": 7585 }, { "epoch": 0.9755240258905226, "grad_norm": 0.177734375, "learning_rate": 5.197031253587615e-05, "loss": 0.0463, "step": 7586 }, { "epoch": 0.9756526212010802, "grad_norm": 0.15625, "learning_rate": 5.196021356218737e-05, "loss": 0.0325, "step": 7587 }, { "epoch": 0.9757812165116378, "grad_norm": 0.1796875, "learning_rate": 5.195011450840654e-05, "loss": 0.0487, "step": 7588 }, { "epoch": 0.9759098118221956, "grad_norm": 0.193359375, "learning_rate": 5.194001537494632e-05, "loss": 0.032, "step": 7589 }, { "epoch": 0.9760384071327532, "grad_norm": 0.169921875, "learning_rate": 5.192991616221934e-05, "loss": 0.0337, "step": 7590 }, { "epoch": 0.9761670024433109, "grad_norm": 0.166015625, "learning_rate": 5.1919816870638225e-05, "loss": 0.039, "step": 7591 }, { "epoch": 0.9762955977538685, "grad_norm": 0.158203125, "learning_rate": 5.1909717500615664e-05, "loss": 0.0355, "step": 7592 }, { "epoch": 0.9764241930644263, "grad_norm": 0.181640625, "learning_rate": 5.189961805256426e-05, "loss": 0.0406, "step": 7593 }, { "epoch": 0.9765527883749839, "grad_norm": 0.1787109375, "learning_rate": 5.18895185268967e-05, "loss": 0.0388, "step": 7594 }, { "epoch": 0.9766813836855416, "grad_norm": 0.1611328125, "learning_rate": 5.1879418924025605e-05, "loss": 0.0386, "step": 7595 }, { "epoch": 0.9768099789960992, "grad_norm": 0.13671875, "learning_rate": 5.1869319244363645e-05, "loss": 0.0262, "step": 7596 }, { "epoch": 0.976938574306657, "grad_norm": 0.18359375, "learning_rate": 5.1859219488323505e-05, "loss": 0.0419, "step": 7597 }, { "epoch": 0.9770671696172146, "grad_norm": 0.1767578125, "learning_rate": 5.184911965631784e-05, "loss": 0.0404, "step": 7598 }, { "epoch": 0.9771957649277723, "grad_norm": 0.18359375, "learning_rate": 5.1839019748759285e-05, "loss": 0.0468, "step": 7599 }, { "epoch": 0.97732436023833, "grad_norm": 0.1865234375, "learning_rate": 5.1828919766060534e-05, "loss": 0.0435, "step": 7600 }, { "epoch": 0.9774529555488877, "grad_norm": 0.16796875, "learning_rate": 5.181881970863427e-05, "loss": 0.0328, "step": 7601 }, { "epoch": 0.9775815508594453, "grad_norm": 0.1865234375, "learning_rate": 5.180871957689315e-05, "loss": 0.0457, "step": 7602 }, { "epoch": 0.977710146170003, "grad_norm": 0.1669921875, "learning_rate": 5.1798619371249866e-05, "loss": 0.0385, "step": 7603 }, { "epoch": 0.9778387414805607, "grad_norm": 0.1728515625, "learning_rate": 5.17885190921171e-05, "loss": 0.0373, "step": 7604 }, { "epoch": 0.9779673367911184, "grad_norm": 0.1669921875, "learning_rate": 5.177841873990754e-05, "loss": 0.034, "step": 7605 }, { "epoch": 0.978095932101676, "grad_norm": 0.1689453125, "learning_rate": 5.176831831503387e-05, "loss": 0.0337, "step": 7606 }, { "epoch": 0.9782245274122336, "grad_norm": 0.1611328125, "learning_rate": 5.175821781790877e-05, "loss": 0.036, "step": 7607 }, { "epoch": 0.9783531227227914, "grad_norm": 0.16796875, "learning_rate": 5.174811724894495e-05, "loss": 0.0331, "step": 7608 }, { "epoch": 0.978481718033349, "grad_norm": 0.1669921875, "learning_rate": 5.173801660855512e-05, "loss": 0.0396, "step": 7609 }, { "epoch": 0.9786103133439067, "grad_norm": 0.1796875, "learning_rate": 5.172791589715194e-05, "loss": 0.0481, "step": 7610 }, { "epoch": 0.9787389086544644, "grad_norm": 0.169921875, "learning_rate": 5.171781511514815e-05, "loss": 0.0374, "step": 7611 }, { "epoch": 0.9788675039650221, "grad_norm": 0.15625, "learning_rate": 5.170771426295643e-05, "loss": 0.0362, "step": 7612 }, { "epoch": 0.9789960992755797, "grad_norm": 0.1767578125, "learning_rate": 5.1697613340989505e-05, "loss": 0.0371, "step": 7613 }, { "epoch": 0.9791246945861374, "grad_norm": 0.14453125, "learning_rate": 5.1687512349660095e-05, "loss": 0.0302, "step": 7614 }, { "epoch": 0.9792532898966951, "grad_norm": 0.1591796875, "learning_rate": 5.167741128938088e-05, "loss": 0.0357, "step": 7615 }, { "epoch": 0.9793818852072528, "grad_norm": 0.171875, "learning_rate": 5.166731016056463e-05, "loss": 0.0476, "step": 7616 }, { "epoch": 0.9795104805178104, "grad_norm": 0.1494140625, "learning_rate": 5.1657208963624016e-05, "loss": 0.0328, "step": 7617 }, { "epoch": 0.9796390758283682, "grad_norm": 0.1845703125, "learning_rate": 5.16471076989718e-05, "loss": 0.0463, "step": 7618 }, { "epoch": 0.9797676711389258, "grad_norm": 0.158203125, "learning_rate": 5.163700636702067e-05, "loss": 0.0365, "step": 7619 }, { "epoch": 0.9798962664494835, "grad_norm": 0.2060546875, "learning_rate": 5.162690496818339e-05, "loss": 0.0507, "step": 7620 }, { "epoch": 0.9800248617600411, "grad_norm": 0.1884765625, "learning_rate": 5.161680350287268e-05, "loss": 0.0435, "step": 7621 }, { "epoch": 0.9801534570705989, "grad_norm": 0.1728515625, "learning_rate": 5.1606701971501245e-05, "loss": 0.0405, "step": 7622 }, { "epoch": 0.9802820523811565, "grad_norm": 0.1796875, "learning_rate": 5.1596600374481865e-05, "loss": 0.0414, "step": 7623 }, { "epoch": 0.9804106476917142, "grad_norm": 0.1708984375, "learning_rate": 5.1586498712227246e-05, "loss": 0.0361, "step": 7624 }, { "epoch": 0.9805392430022718, "grad_norm": 0.1767578125, "learning_rate": 5.157639698515015e-05, "loss": 0.0394, "step": 7625 }, { "epoch": 0.9806678383128296, "grad_norm": 0.1767578125, "learning_rate": 5.156629519366333e-05, "loss": 0.0395, "step": 7626 }, { "epoch": 0.9807964336233872, "grad_norm": 0.173828125, "learning_rate": 5.155619333817949e-05, "loss": 0.0323, "step": 7627 }, { "epoch": 0.9809250289339448, "grad_norm": 0.1669921875, "learning_rate": 5.154609141911144e-05, "loss": 0.0379, "step": 7628 }, { "epoch": 0.9810536242445026, "grad_norm": 0.18359375, "learning_rate": 5.153598943687188e-05, "loss": 0.0412, "step": 7629 }, { "epoch": 0.9811822195550602, "grad_norm": 0.1611328125, "learning_rate": 5.1525887391873605e-05, "loss": 0.0323, "step": 7630 }, { "epoch": 0.9813108148656179, "grad_norm": 0.1640625, "learning_rate": 5.151578528452935e-05, "loss": 0.0402, "step": 7631 }, { "epoch": 0.9814394101761755, "grad_norm": 0.173828125, "learning_rate": 5.150568311525188e-05, "loss": 0.0378, "step": 7632 }, { "epoch": 0.9815680054867333, "grad_norm": 0.169921875, "learning_rate": 5.149558088445396e-05, "loss": 0.0413, "step": 7633 }, { "epoch": 0.9816966007972909, "grad_norm": 0.18359375, "learning_rate": 5.148547859254834e-05, "loss": 0.0406, "step": 7634 }, { "epoch": 0.9818251961078486, "grad_norm": 0.1796875, "learning_rate": 5.1475376239947825e-05, "loss": 0.0422, "step": 7635 }, { "epoch": 0.9819537914184063, "grad_norm": 0.1611328125, "learning_rate": 5.1465273827065145e-05, "loss": 0.0333, "step": 7636 }, { "epoch": 0.982082386728964, "grad_norm": 0.1640625, "learning_rate": 5.1455171354313105e-05, "loss": 0.0364, "step": 7637 }, { "epoch": 0.9822109820395216, "grad_norm": 0.1875, "learning_rate": 5.1445068822104434e-05, "loss": 0.0435, "step": 7638 }, { "epoch": 0.9823395773500793, "grad_norm": 0.1728515625, "learning_rate": 5.1434966230851964e-05, "loss": 0.0357, "step": 7639 }, { "epoch": 0.982468172660637, "grad_norm": 0.171875, "learning_rate": 5.1424863580968455e-05, "loss": 0.0359, "step": 7640 }, { "epoch": 0.9825967679711947, "grad_norm": 0.1533203125, "learning_rate": 5.141476087286667e-05, "loss": 0.0331, "step": 7641 }, { "epoch": 0.9827253632817523, "grad_norm": 0.1767578125, "learning_rate": 5.140465810695942e-05, "loss": 0.0469, "step": 7642 }, { "epoch": 0.98285395859231, "grad_norm": 0.1806640625, "learning_rate": 5.1394555283659483e-05, "loss": 0.0378, "step": 7643 }, { "epoch": 0.9829825539028677, "grad_norm": 0.18359375, "learning_rate": 5.1384452403379645e-05, "loss": 0.0381, "step": 7644 }, { "epoch": 0.9831111492134254, "grad_norm": 0.16796875, "learning_rate": 5.137434946653271e-05, "loss": 0.0338, "step": 7645 }, { "epoch": 0.983239744523983, "grad_norm": 0.177734375, "learning_rate": 5.136424647353145e-05, "loss": 0.0454, "step": 7646 }, { "epoch": 0.9833683398345408, "grad_norm": 0.1611328125, "learning_rate": 5.1354143424788684e-05, "loss": 0.0373, "step": 7647 }, { "epoch": 0.9834969351450984, "grad_norm": 0.1796875, "learning_rate": 5.134404032071719e-05, "loss": 0.0407, "step": 7648 }, { "epoch": 0.983625530455656, "grad_norm": 0.173828125, "learning_rate": 5.133393716172978e-05, "loss": 0.0359, "step": 7649 }, { "epoch": 0.9837541257662137, "grad_norm": 0.1640625, "learning_rate": 5.1323833948239264e-05, "loss": 0.0335, "step": 7650 }, { "epoch": 0.9838827210767714, "grad_norm": 0.1748046875, "learning_rate": 5.1313730680658436e-05, "loss": 0.0399, "step": 7651 }, { "epoch": 0.9840113163873291, "grad_norm": 0.177734375, "learning_rate": 5.130362735940012e-05, "loss": 0.0407, "step": 7652 }, { "epoch": 0.9841399116978867, "grad_norm": 0.193359375, "learning_rate": 5.12935239848771e-05, "loss": 0.0407, "step": 7653 }, { "epoch": 0.9842685070084444, "grad_norm": 0.1767578125, "learning_rate": 5.1283420557502204e-05, "loss": 0.0387, "step": 7654 }, { "epoch": 0.9843971023190021, "grad_norm": 0.1767578125, "learning_rate": 5.1273317077688245e-05, "loss": 0.04, "step": 7655 }, { "epoch": 0.9845256976295598, "grad_norm": 0.142578125, "learning_rate": 5.126321354584805e-05, "loss": 0.0318, "step": 7656 }, { "epoch": 0.9846542929401174, "grad_norm": 0.1630859375, "learning_rate": 5.125310996239443e-05, "loss": 0.0385, "step": 7657 }, { "epoch": 0.9847828882506752, "grad_norm": 0.158203125, "learning_rate": 5.124300632774018e-05, "loss": 0.0342, "step": 7658 }, { "epoch": 0.9849114835612328, "grad_norm": 0.1865234375, "learning_rate": 5.1232902642298166e-05, "loss": 0.0413, "step": 7659 }, { "epoch": 0.9850400788717905, "grad_norm": 0.15625, "learning_rate": 5.122279890648117e-05, "loss": 0.0291, "step": 7660 }, { "epoch": 0.9851686741823481, "grad_norm": 0.16796875, "learning_rate": 5.121269512070206e-05, "loss": 0.0359, "step": 7661 }, { "epoch": 0.9852972694929059, "grad_norm": 0.1630859375, "learning_rate": 5.120259128537364e-05, "loss": 0.0335, "step": 7662 }, { "epoch": 0.9854258648034635, "grad_norm": 0.1923828125, "learning_rate": 5.119248740090874e-05, "loss": 0.0488, "step": 7663 }, { "epoch": 0.9855544601140211, "grad_norm": 0.1748046875, "learning_rate": 5.1182383467720196e-05, "loss": 0.0381, "step": 7664 }, { "epoch": 0.9856830554245789, "grad_norm": 0.169921875, "learning_rate": 5.117227948622085e-05, "loss": 0.0384, "step": 7665 }, { "epoch": 0.9858116507351365, "grad_norm": 0.1904296875, "learning_rate": 5.1162175456823545e-05, "loss": 0.0433, "step": 7666 }, { "epoch": 0.9859402460456942, "grad_norm": 0.166015625, "learning_rate": 5.115207137994109e-05, "loss": 0.0339, "step": 7667 }, { "epoch": 0.9860688413562518, "grad_norm": 0.2001953125, "learning_rate": 5.114196725598637e-05, "loss": 0.0346, "step": 7668 }, { "epoch": 0.9861974366668096, "grad_norm": 0.181640625, "learning_rate": 5.113186308537219e-05, "loss": 0.0418, "step": 7669 }, { "epoch": 0.9863260319773672, "grad_norm": 0.177734375, "learning_rate": 5.1121758868511406e-05, "loss": 0.0401, "step": 7670 }, { "epoch": 0.9864546272879249, "grad_norm": 0.1748046875, "learning_rate": 5.111165460581687e-05, "loss": 0.0346, "step": 7671 }, { "epoch": 0.9865832225984825, "grad_norm": 0.1796875, "learning_rate": 5.110155029770143e-05, "loss": 0.0403, "step": 7672 }, { "epoch": 0.9867118179090403, "grad_norm": 0.166015625, "learning_rate": 5.109144594457793e-05, "loss": 0.0393, "step": 7673 }, { "epoch": 0.9868404132195979, "grad_norm": 0.1865234375, "learning_rate": 5.1081341546859216e-05, "loss": 0.0443, "step": 7674 }, { "epoch": 0.9869690085301556, "grad_norm": 0.1748046875, "learning_rate": 5.107123710495816e-05, "loss": 0.0424, "step": 7675 }, { "epoch": 0.9870976038407133, "grad_norm": 0.2041015625, "learning_rate": 5.10611326192876e-05, "loss": 0.0347, "step": 7676 }, { "epoch": 0.987226199151271, "grad_norm": 0.16015625, "learning_rate": 5.105102809026041e-05, "loss": 0.0383, "step": 7677 }, { "epoch": 0.9873547944618286, "grad_norm": 0.1865234375, "learning_rate": 5.104092351828944e-05, "loss": 0.0408, "step": 7678 }, { "epoch": 0.9874833897723863, "grad_norm": 0.189453125, "learning_rate": 5.1030818903787545e-05, "loss": 0.0493, "step": 7679 }, { "epoch": 0.987611985082944, "grad_norm": 0.162109375, "learning_rate": 5.102071424716761e-05, "loss": 0.0337, "step": 7680 }, { "epoch": 0.9877405803935017, "grad_norm": 0.158203125, "learning_rate": 5.101060954884248e-05, "loss": 0.0345, "step": 7681 }, { "epoch": 0.9878691757040593, "grad_norm": 0.1533203125, "learning_rate": 5.100050480922501e-05, "loss": 0.0311, "step": 7682 }, { "epoch": 0.987997771014617, "grad_norm": 0.17578125, "learning_rate": 5.0990400028728104e-05, "loss": 0.0393, "step": 7683 }, { "epoch": 0.9881263663251747, "grad_norm": 0.1787109375, "learning_rate": 5.0980295207764595e-05, "loss": 0.0458, "step": 7684 }, { "epoch": 0.9882549616357323, "grad_norm": 0.169921875, "learning_rate": 5.097019034674738e-05, "loss": 0.0343, "step": 7685 }, { "epoch": 0.98838355694629, "grad_norm": 0.1650390625, "learning_rate": 5.096008544608931e-05, "loss": 0.0368, "step": 7686 }, { "epoch": 0.9885121522568477, "grad_norm": 0.1826171875, "learning_rate": 5.0949980506203285e-05, "loss": 0.0356, "step": 7687 }, { "epoch": 0.9886407475674054, "grad_norm": 0.1748046875, "learning_rate": 5.0939875527502157e-05, "loss": 0.0401, "step": 7688 }, { "epoch": 0.988769342877963, "grad_norm": 0.1845703125, "learning_rate": 5.092977051039881e-05, "loss": 0.0461, "step": 7689 }, { "epoch": 0.9888979381885207, "grad_norm": 0.185546875, "learning_rate": 5.091966545530614e-05, "loss": 0.0435, "step": 7690 }, { "epoch": 0.9890265334990784, "grad_norm": 0.1728515625, "learning_rate": 5.090956036263701e-05, "loss": 0.0457, "step": 7691 }, { "epoch": 0.9891551288096361, "grad_norm": 0.150390625, "learning_rate": 5.089945523280431e-05, "loss": 0.0357, "step": 7692 }, { "epoch": 0.9892837241201937, "grad_norm": 0.171875, "learning_rate": 5.088935006622092e-05, "loss": 0.0454, "step": 7693 }, { "epoch": 0.9894123194307515, "grad_norm": 0.171875, "learning_rate": 5.0879244863299726e-05, "loss": 0.0374, "step": 7694 }, { "epoch": 0.9895409147413091, "grad_norm": 0.1904296875, "learning_rate": 5.086913962445361e-05, "loss": 0.0404, "step": 7695 }, { "epoch": 0.9896695100518668, "grad_norm": 0.1845703125, "learning_rate": 5.085903435009547e-05, "loss": 0.0499, "step": 7696 }, { "epoch": 0.9897981053624244, "grad_norm": 0.158203125, "learning_rate": 5.0848929040638194e-05, "loss": 0.0338, "step": 7697 }, { "epoch": 0.9899267006729822, "grad_norm": 0.1845703125, "learning_rate": 5.0838823696494655e-05, "loss": 0.0443, "step": 7698 }, { "epoch": 0.9900552959835398, "grad_norm": 0.16796875, "learning_rate": 5.082871831807777e-05, "loss": 0.0452, "step": 7699 }, { "epoch": 0.9901838912940975, "grad_norm": 0.181640625, "learning_rate": 5.081861290580042e-05, "loss": 0.0396, "step": 7700 }, { "epoch": 0.9903124866046551, "grad_norm": 0.162109375, "learning_rate": 5.0808507460075506e-05, "loss": 0.0339, "step": 7701 }, { "epoch": 0.9904410819152129, "grad_norm": 0.1806640625, "learning_rate": 5.0798401981315934e-05, "loss": 0.0408, "step": 7702 }, { "epoch": 0.9905696772257705, "grad_norm": 0.1796875, "learning_rate": 5.078829646993457e-05, "loss": 0.0439, "step": 7703 }, { "epoch": 0.9906982725363281, "grad_norm": 0.18359375, "learning_rate": 5.077819092634436e-05, "loss": 0.0428, "step": 7704 }, { "epoch": 0.9908268678468859, "grad_norm": 0.1572265625, "learning_rate": 5.076808535095816e-05, "loss": 0.0411, "step": 7705 }, { "epoch": 0.9909554631574435, "grad_norm": 0.1767578125, "learning_rate": 5.075797974418889e-05, "loss": 0.0402, "step": 7706 }, { "epoch": 0.9910840584680012, "grad_norm": 0.16796875, "learning_rate": 5.074787410644947e-05, "loss": 0.0376, "step": 7707 }, { "epoch": 0.9912126537785588, "grad_norm": 0.1630859375, "learning_rate": 5.073776843815277e-05, "loss": 0.0394, "step": 7708 }, { "epoch": 0.9913412490891166, "grad_norm": 0.15234375, "learning_rate": 5.0727662739711726e-05, "loss": 0.0325, "step": 7709 }, { "epoch": 0.9914698443996742, "grad_norm": 0.169921875, "learning_rate": 5.071755701153922e-05, "loss": 0.0366, "step": 7710 }, { "epoch": 0.9915984397102319, "grad_norm": 0.173828125, "learning_rate": 5.0707451254048174e-05, "loss": 0.0427, "step": 7711 }, { "epoch": 0.9917270350207896, "grad_norm": 0.1826171875, "learning_rate": 5.06973454676515e-05, "loss": 0.0479, "step": 7712 }, { "epoch": 0.9918556303313473, "grad_norm": 0.2080078125, "learning_rate": 5.068723965276212e-05, "loss": 0.0428, "step": 7713 }, { "epoch": 0.9919842256419049, "grad_norm": 0.1630859375, "learning_rate": 5.067713380979292e-05, "loss": 0.0385, "step": 7714 }, { "epoch": 0.9921128209524626, "grad_norm": 0.154296875, "learning_rate": 5.066702793915682e-05, "loss": 0.0371, "step": 7715 }, { "epoch": 0.9922414162630203, "grad_norm": 0.1640625, "learning_rate": 5.0656922041266754e-05, "loss": 0.0377, "step": 7716 }, { "epoch": 0.992370011573578, "grad_norm": 0.1767578125, "learning_rate": 5.0646816116535614e-05, "loss": 0.0382, "step": 7717 }, { "epoch": 0.9924986068841356, "grad_norm": 0.17578125, "learning_rate": 5.063671016537632e-05, "loss": 0.0426, "step": 7718 }, { "epoch": 0.9926272021946932, "grad_norm": 0.1748046875, "learning_rate": 5.062660418820179e-05, "loss": 0.0351, "step": 7719 }, { "epoch": 0.992755797505251, "grad_norm": 0.1640625, "learning_rate": 5.0616498185424945e-05, "loss": 0.035, "step": 7720 }, { "epoch": 0.9928843928158086, "grad_norm": 0.1669921875, "learning_rate": 5.060639215745872e-05, "loss": 0.0336, "step": 7721 }, { "epoch": 0.9930129881263663, "grad_norm": 0.1904296875, "learning_rate": 5.0596286104716e-05, "loss": 0.0423, "step": 7722 }, { "epoch": 0.993141583436924, "grad_norm": 0.1923828125, "learning_rate": 5.058618002760974e-05, "loss": 0.0375, "step": 7723 }, { "epoch": 0.9932701787474817, "grad_norm": 0.1884765625, "learning_rate": 5.057607392655284e-05, "loss": 0.0488, "step": 7724 }, { "epoch": 0.9933987740580393, "grad_norm": 0.166015625, "learning_rate": 5.0565967801958245e-05, "loss": 0.0376, "step": 7725 }, { "epoch": 0.993527369368597, "grad_norm": 0.1845703125, "learning_rate": 5.055586165423886e-05, "loss": 0.0447, "step": 7726 }, { "epoch": 0.9936559646791547, "grad_norm": 0.171875, "learning_rate": 5.054575548380761e-05, "loss": 0.0389, "step": 7727 }, { "epoch": 0.9937845599897124, "grad_norm": 0.181640625, "learning_rate": 5.053564929107745e-05, "loss": 0.0366, "step": 7728 }, { "epoch": 0.99391315530027, "grad_norm": 0.16015625, "learning_rate": 5.052554307646127e-05, "loss": 0.0303, "step": 7729 }, { "epoch": 0.9940417506108278, "grad_norm": 0.1865234375, "learning_rate": 5.051543684037202e-05, "loss": 0.0333, "step": 7730 }, { "epoch": 0.9941703459213854, "grad_norm": 0.1943359375, "learning_rate": 5.050533058322262e-05, "loss": 0.0476, "step": 7731 }, { "epoch": 0.9942989412319431, "grad_norm": 0.1591796875, "learning_rate": 5.0495224305426e-05, "loss": 0.0356, "step": 7732 }, { "epoch": 0.9944275365425007, "grad_norm": 0.154296875, "learning_rate": 5.0485118007395104e-05, "loss": 0.0359, "step": 7733 }, { "epoch": 0.9945561318530585, "grad_norm": 0.328125, "learning_rate": 5.047501168954285e-05, "loss": 0.0467, "step": 7734 }, { "epoch": 0.9946847271636161, "grad_norm": 0.1669921875, "learning_rate": 5.0464905352282174e-05, "loss": 0.0407, "step": 7735 }, { "epoch": 0.9948133224741738, "grad_norm": 0.1884765625, "learning_rate": 5.0454798996026e-05, "loss": 0.043, "step": 7736 }, { "epoch": 0.9949419177847314, "grad_norm": 0.1826171875, "learning_rate": 5.0444692621187285e-05, "loss": 0.0435, "step": 7737 }, { "epoch": 0.9950705130952892, "grad_norm": 0.1865234375, "learning_rate": 5.0434586228178934e-05, "loss": 0.0426, "step": 7738 }, { "epoch": 0.9951991084058468, "grad_norm": 0.166015625, "learning_rate": 5.0424479817413907e-05, "loss": 0.0337, "step": 7739 }, { "epoch": 0.9953277037164044, "grad_norm": 0.1748046875, "learning_rate": 5.041437338930515e-05, "loss": 0.0363, "step": 7740 }, { "epoch": 0.9954562990269622, "grad_norm": 0.16015625, "learning_rate": 5.0404266944265555e-05, "loss": 0.0363, "step": 7741 }, { "epoch": 0.9955848943375198, "grad_norm": 0.1708984375, "learning_rate": 5.0394160482708095e-05, "loss": 0.0356, "step": 7742 }, { "epoch": 0.9957134896480775, "grad_norm": 0.154296875, "learning_rate": 5.0384054005045686e-05, "loss": 0.0336, "step": 7743 }, { "epoch": 0.9958420849586351, "grad_norm": 0.166015625, "learning_rate": 5.0373947511691286e-05, "loss": 0.037, "step": 7744 }, { "epoch": 0.9959706802691929, "grad_norm": 0.19140625, "learning_rate": 5.036384100305783e-05, "loss": 0.0391, "step": 7745 }, { "epoch": 0.9960992755797505, "grad_norm": 0.1728515625, "learning_rate": 5.035373447955826e-05, "loss": 0.0419, "step": 7746 }, { "epoch": 0.9962278708903082, "grad_norm": 0.189453125, "learning_rate": 5.034362794160551e-05, "loss": 0.0411, "step": 7747 }, { "epoch": 0.9963564662008658, "grad_norm": 0.1630859375, "learning_rate": 5.0333521389612515e-05, "loss": 0.038, "step": 7748 }, { "epoch": 0.9964850615114236, "grad_norm": 0.19140625, "learning_rate": 5.032341482399224e-05, "loss": 0.0442, "step": 7749 }, { "epoch": 0.9966136568219812, "grad_norm": 0.1767578125, "learning_rate": 5.031330824515761e-05, "loss": 0.0378, "step": 7750 }, { "epoch": 0.9967422521325389, "grad_norm": 0.1806640625, "learning_rate": 5.030320165352157e-05, "loss": 0.0507, "step": 7751 }, { "epoch": 0.9968708474430966, "grad_norm": 0.18359375, "learning_rate": 5.029309504949706e-05, "loss": 0.0456, "step": 7752 }, { "epoch": 0.9969994427536543, "grad_norm": 0.173828125, "learning_rate": 5.0282988433497036e-05, "loss": 0.0409, "step": 7753 }, { "epoch": 0.9971280380642119, "grad_norm": 0.21875, "learning_rate": 5.0272881805934433e-05, "loss": 0.0364, "step": 7754 }, { "epoch": 0.9972566333747696, "grad_norm": 0.1513671875, "learning_rate": 5.02627751672222e-05, "loss": 0.0338, "step": 7755 }, { "epoch": 0.9973852286853273, "grad_norm": 0.2451171875, "learning_rate": 5.025266851777327e-05, "loss": 0.0436, "step": 7756 }, { "epoch": 0.997513823995885, "grad_norm": 0.169921875, "learning_rate": 5.0242561858000613e-05, "loss": 0.0388, "step": 7757 }, { "epoch": 0.9976424193064426, "grad_norm": 0.1748046875, "learning_rate": 5.023245518831715e-05, "loss": 0.0365, "step": 7758 }, { "epoch": 0.9977710146170004, "grad_norm": 0.181640625, "learning_rate": 5.022234850913585e-05, "loss": 0.0478, "step": 7759 }, { "epoch": 0.997899609927558, "grad_norm": 0.166015625, "learning_rate": 5.0212241820869643e-05, "loss": 0.0369, "step": 7760 }, { "epoch": 0.9980282052381156, "grad_norm": 0.1640625, "learning_rate": 5.020213512393149e-05, "loss": 0.0385, "step": 7761 }, { "epoch": 0.9981568005486733, "grad_norm": 0.19921875, "learning_rate": 5.0192028418734335e-05, "loss": 0.0511, "step": 7762 }, { "epoch": 0.998285395859231, "grad_norm": 0.16015625, "learning_rate": 5.0181921705691126e-05, "loss": 0.0368, "step": 7763 }, { "epoch": 0.9984139911697887, "grad_norm": 0.1552734375, "learning_rate": 5.0171814985214803e-05, "loss": 0.033, "step": 7764 }, { "epoch": 0.9985425864803463, "grad_norm": 0.158203125, "learning_rate": 5.016170825771831e-05, "loss": 0.0367, "step": 7765 }, { "epoch": 0.998671181790904, "grad_norm": 0.1767578125, "learning_rate": 5.0151601523614625e-05, "loss": 0.0403, "step": 7766 }, { "epoch": 0.9987997771014617, "grad_norm": 0.1669921875, "learning_rate": 5.014149478331667e-05, "loss": 0.0439, "step": 7767 }, { "epoch": 0.9989283724120194, "grad_norm": 0.166015625, "learning_rate": 5.0131388037237404e-05, "loss": 0.0322, "step": 7768 }, { "epoch": 0.999056967722577, "grad_norm": 0.16796875, "learning_rate": 5.012128128578977e-05, "loss": 0.036, "step": 7769 }, { "epoch": 0.9991855630331348, "grad_norm": 0.1845703125, "learning_rate": 5.011117452938672e-05, "loss": 0.0454, "step": 7770 }, { "epoch": 0.9993141583436924, "grad_norm": 0.16015625, "learning_rate": 5.010106776844122e-05, "loss": 0.0332, "step": 7771 }, { "epoch": 0.9994427536542501, "grad_norm": 0.181640625, "learning_rate": 5.009096100336621e-05, "loss": 0.0409, "step": 7772 }, { "epoch": 0.9995713489648077, "grad_norm": 0.1630859375, "learning_rate": 5.008085423457464e-05, "loss": 0.0344, "step": 7773 }, { "epoch": 0.9996999442753655, "grad_norm": 0.1533203125, "learning_rate": 5.007074746247945e-05, "loss": 0.0359, "step": 7774 }, { "epoch": 0.9998285395859231, "grad_norm": 0.16796875, "learning_rate": 5.006064068749362e-05, "loss": 0.0341, "step": 7775 }, { "epoch": 0.9999571348964807, "grad_norm": 0.1708984375, "learning_rate": 5.0050533910030076e-05, "loss": 0.0417, "step": 7776 }, { "epoch": 1.0000857302070385, "grad_norm": 0.1630859375, "learning_rate": 5.0040427130501767e-05, "loss": 0.0265, "step": 7777 }, { "epoch": 1.0002143255175961, "grad_norm": 0.1650390625, "learning_rate": 5.0030320349321655e-05, "loss": 0.0376, "step": 7778 }, { "epoch": 1.0003429208281538, "grad_norm": 0.1728515625, "learning_rate": 5.002021356690269e-05, "loss": 0.0347, "step": 7779 }, { "epoch": 1.0004715161387114, "grad_norm": 0.1640625, "learning_rate": 5.001010678365783e-05, "loss": 0.0369, "step": 7780 }, { "epoch": 1.000600111449269, "grad_norm": 0.16796875, "learning_rate": 5e-05, "loss": 0.0413, "step": 7781 }, { "epoch": 1.0007287067598267, "grad_norm": 0.166015625, "learning_rate": 4.998989321634219e-05, "loss": 0.0351, "step": 7782 }, { "epoch": 1.0008573020703846, "grad_norm": 0.1533203125, "learning_rate": 4.997978643309732e-05, "loss": 0.035, "step": 7783 }, { "epoch": 1.0009858973809422, "grad_norm": 0.169921875, "learning_rate": 4.996967965067836e-05, "loss": 0.0405, "step": 7784 }, { "epoch": 1.0011144926914999, "grad_norm": 0.16015625, "learning_rate": 4.9959572869498245e-05, "loss": 0.0318, "step": 7785 }, { "epoch": 1.0012430880020575, "grad_norm": 0.2060546875, "learning_rate": 4.994946608996995e-05, "loss": 0.0361, "step": 7786 }, { "epoch": 1.0013716833126152, "grad_norm": 0.150390625, "learning_rate": 4.9939359312506395e-05, "loss": 0.0297, "step": 7787 }, { "epoch": 1.0015002786231728, "grad_norm": 0.15234375, "learning_rate": 4.992925253752056e-05, "loss": 0.0333, "step": 7788 }, { "epoch": 1.0016288739337305, "grad_norm": 0.158203125, "learning_rate": 4.991914576542537e-05, "loss": 0.0362, "step": 7789 }, { "epoch": 1.0017574692442883, "grad_norm": 0.1552734375, "learning_rate": 4.99090389966338e-05, "loss": 0.0333, "step": 7790 }, { "epoch": 1.001886064554846, "grad_norm": 0.171875, "learning_rate": 4.9898932231558776e-05, "loss": 0.036, "step": 7791 }, { "epoch": 1.0020146598654036, "grad_norm": 0.1611328125, "learning_rate": 4.988882547061329e-05, "loss": 0.0353, "step": 7792 }, { "epoch": 1.0021432551759613, "grad_norm": 0.1748046875, "learning_rate": 4.9878718714210256e-05, "loss": 0.0425, "step": 7793 }, { "epoch": 1.002271850486519, "grad_norm": 0.1787109375, "learning_rate": 4.9868611962762614e-05, "loss": 0.0351, "step": 7794 }, { "epoch": 1.0024004457970765, "grad_norm": 0.181640625, "learning_rate": 4.985850521668336e-05, "loss": 0.0391, "step": 7795 }, { "epoch": 1.0025290411076342, "grad_norm": 0.1611328125, "learning_rate": 4.984839847638539e-05, "loss": 0.0343, "step": 7796 }, { "epoch": 1.002657636418192, "grad_norm": 0.1591796875, "learning_rate": 4.983829174228171e-05, "loss": 0.0299, "step": 7797 }, { "epoch": 1.0027862317287497, "grad_norm": 0.1435546875, "learning_rate": 4.982818501478521e-05, "loss": 0.0321, "step": 7798 }, { "epoch": 1.0029148270393073, "grad_norm": 0.150390625, "learning_rate": 4.981807829430889e-05, "loss": 0.0312, "step": 7799 }, { "epoch": 1.003043422349865, "grad_norm": 0.1533203125, "learning_rate": 4.980797158126567e-05, "loss": 0.0335, "step": 7800 }, { "epoch": 1.0031720176604226, "grad_norm": 0.15625, "learning_rate": 4.979786487606851e-05, "loss": 0.0329, "step": 7801 }, { "epoch": 1.0033006129709803, "grad_norm": 0.1591796875, "learning_rate": 4.978775817913035e-05, "loss": 0.0357, "step": 7802 }, { "epoch": 1.003429208281538, "grad_norm": 0.1865234375, "learning_rate": 4.9777651490864153e-05, "loss": 0.0459, "step": 7803 }, { "epoch": 1.0035578035920956, "grad_norm": 0.15625, "learning_rate": 4.976754481168285e-05, "loss": 0.0341, "step": 7804 }, { "epoch": 1.0036863989026534, "grad_norm": 0.1708984375, "learning_rate": 4.97574381419994e-05, "loss": 0.0387, "step": 7805 }, { "epoch": 1.003814994213211, "grad_norm": 0.1572265625, "learning_rate": 4.974733148222674e-05, "loss": 0.0369, "step": 7806 }, { "epoch": 1.0039435895237687, "grad_norm": 0.158203125, "learning_rate": 4.973722483277781e-05, "loss": 0.0336, "step": 7807 }, { "epoch": 1.0040721848343264, "grad_norm": 0.193359375, "learning_rate": 4.9727118194065585e-05, "loss": 0.0477, "step": 7808 }, { "epoch": 1.004200780144884, "grad_norm": 0.1484375, "learning_rate": 4.9717011566502976e-05, "loss": 0.0341, "step": 7809 }, { "epoch": 1.0043293754554417, "grad_norm": 0.1689453125, "learning_rate": 4.9706904950502956e-05, "loss": 0.0375, "step": 7810 }, { "epoch": 1.0044579707659993, "grad_norm": 0.169921875, "learning_rate": 4.9696798346478444e-05, "loss": 0.0383, "step": 7811 }, { "epoch": 1.0045865660765572, "grad_norm": 0.15234375, "learning_rate": 4.9686691754842405e-05, "loss": 0.03, "step": 7812 }, { "epoch": 1.0047151613871148, "grad_norm": 0.15234375, "learning_rate": 4.9676585176007765e-05, "loss": 0.0318, "step": 7813 }, { "epoch": 1.0048437566976725, "grad_norm": 0.154296875, "learning_rate": 4.9666478610387496e-05, "loss": 0.031, "step": 7814 }, { "epoch": 1.00497235200823, "grad_norm": 0.1513671875, "learning_rate": 4.965637205839449e-05, "loss": 0.0299, "step": 7815 }, { "epoch": 1.0051009473187877, "grad_norm": 0.1669921875, "learning_rate": 4.964626552044176e-05, "loss": 0.035, "step": 7816 }, { "epoch": 1.0052295426293454, "grad_norm": 0.1533203125, "learning_rate": 4.963615899694219e-05, "loss": 0.0355, "step": 7817 }, { "epoch": 1.005358137939903, "grad_norm": 0.169921875, "learning_rate": 4.9626052488308725e-05, "loss": 0.0425, "step": 7818 }, { "epoch": 1.005486733250461, "grad_norm": 0.15234375, "learning_rate": 4.961594599495434e-05, "loss": 0.0293, "step": 7819 }, { "epoch": 1.0056153285610185, "grad_norm": 0.1669921875, "learning_rate": 4.960583951729192e-05, "loss": 0.0359, "step": 7820 }, { "epoch": 1.0057439238715762, "grad_norm": 0.1611328125, "learning_rate": 4.959573305573447e-05, "loss": 0.0356, "step": 7821 }, { "epoch": 1.0058725191821338, "grad_norm": 0.1728515625, "learning_rate": 4.958562661069487e-05, "loss": 0.038, "step": 7822 }, { "epoch": 1.0060011144926915, "grad_norm": 0.142578125, "learning_rate": 4.9575520182586105e-05, "loss": 0.0325, "step": 7823 }, { "epoch": 1.0061297098032491, "grad_norm": 0.1630859375, "learning_rate": 4.9565413771821064e-05, "loss": 0.0388, "step": 7824 }, { "epoch": 1.0062583051138068, "grad_norm": 0.1669921875, "learning_rate": 4.9555307378812726e-05, "loss": 0.0365, "step": 7825 }, { "epoch": 1.0063869004243646, "grad_norm": 0.158203125, "learning_rate": 4.9545201003974e-05, "loss": 0.0322, "step": 7826 }, { "epoch": 1.0065154957349223, "grad_norm": 0.14453125, "learning_rate": 4.953509464771784e-05, "loss": 0.0277, "step": 7827 }, { "epoch": 1.00664409104548, "grad_norm": 0.158203125, "learning_rate": 4.952498831045716e-05, "loss": 0.0352, "step": 7828 }, { "epoch": 1.0067726863560376, "grad_norm": 0.1572265625, "learning_rate": 4.951488199260491e-05, "loss": 0.033, "step": 7829 }, { "epoch": 1.0069012816665952, "grad_norm": 0.1640625, "learning_rate": 4.950477569457401e-05, "loss": 0.0377, "step": 7830 }, { "epoch": 1.0070298769771528, "grad_norm": 0.171875, "learning_rate": 4.949466941677739e-05, "loss": 0.038, "step": 7831 }, { "epoch": 1.0071584722877105, "grad_norm": 0.1533203125, "learning_rate": 4.9484563159627997e-05, "loss": 0.0312, "step": 7832 }, { "epoch": 1.0072870675982684, "grad_norm": 0.1484375, "learning_rate": 4.947445692353874e-05, "loss": 0.035, "step": 7833 }, { "epoch": 1.007415662908826, "grad_norm": 0.162109375, "learning_rate": 4.946435070892257e-05, "loss": 0.0329, "step": 7834 }, { "epoch": 1.0075442582193836, "grad_norm": 0.16796875, "learning_rate": 4.9454244516192393e-05, "loss": 0.0393, "step": 7835 }, { "epoch": 1.0076728535299413, "grad_norm": 0.1650390625, "learning_rate": 4.944413834576116e-05, "loss": 0.0363, "step": 7836 }, { "epoch": 1.007801448840499, "grad_norm": 0.1650390625, "learning_rate": 4.943403219804177e-05, "loss": 0.0343, "step": 7837 }, { "epoch": 1.0079300441510566, "grad_norm": 0.15234375, "learning_rate": 4.942392607344717e-05, "loss": 0.0271, "step": 7838 }, { "epoch": 1.0080586394616142, "grad_norm": 0.162109375, "learning_rate": 4.9413819972390276e-05, "loss": 0.033, "step": 7839 }, { "epoch": 1.0081872347721719, "grad_norm": 0.1630859375, "learning_rate": 4.9403713895284015e-05, "loss": 0.0308, "step": 7840 }, { "epoch": 1.0083158300827297, "grad_norm": 0.1513671875, "learning_rate": 4.9393607842541315e-05, "loss": 0.0276, "step": 7841 }, { "epoch": 1.0084444253932874, "grad_norm": 0.1533203125, "learning_rate": 4.938350181457507e-05, "loss": 0.0353, "step": 7842 }, { "epoch": 1.008573020703845, "grad_norm": 0.15625, "learning_rate": 4.937339581179824e-05, "loss": 0.0312, "step": 7843 }, { "epoch": 1.0087016160144027, "grad_norm": 0.162109375, "learning_rate": 4.9363289834623696e-05, "loss": 0.0334, "step": 7844 }, { "epoch": 1.0088302113249603, "grad_norm": 0.173828125, "learning_rate": 4.935318388346442e-05, "loss": 0.0372, "step": 7845 }, { "epoch": 1.008958806635518, "grad_norm": 0.1533203125, "learning_rate": 4.934307795873325e-05, "loss": 0.0297, "step": 7846 }, { "epoch": 1.0090874019460756, "grad_norm": 0.15234375, "learning_rate": 4.93329720608432e-05, "loss": 0.0292, "step": 7847 }, { "epoch": 1.0092159972566335, "grad_norm": 0.146484375, "learning_rate": 4.9322866190207085e-05, "loss": 0.0294, "step": 7848 }, { "epoch": 1.0093445925671911, "grad_norm": 0.146484375, "learning_rate": 4.931276034723789e-05, "loss": 0.0328, "step": 7849 }, { "epoch": 1.0094731878777488, "grad_norm": 0.1767578125, "learning_rate": 4.930265453234849e-05, "loss": 0.0342, "step": 7850 }, { "epoch": 1.0096017831883064, "grad_norm": 0.1650390625, "learning_rate": 4.929254874595183e-05, "loss": 0.0339, "step": 7851 }, { "epoch": 1.009730378498864, "grad_norm": 0.158203125, "learning_rate": 4.928244298846079e-05, "loss": 0.0352, "step": 7852 }, { "epoch": 1.0098589738094217, "grad_norm": 0.1611328125, "learning_rate": 4.927233726028829e-05, "loss": 0.0315, "step": 7853 }, { "epoch": 1.0099875691199793, "grad_norm": 0.1669921875, "learning_rate": 4.926223156184725e-05, "loss": 0.0331, "step": 7854 }, { "epoch": 1.0101161644305372, "grad_norm": 0.166015625, "learning_rate": 4.925212589355055e-05, "loss": 0.0338, "step": 7855 }, { "epoch": 1.0102447597410948, "grad_norm": 0.177734375, "learning_rate": 4.924202025581112e-05, "loss": 0.0367, "step": 7856 }, { "epoch": 1.0103733550516525, "grad_norm": 0.2109375, "learning_rate": 4.923191464904185e-05, "loss": 0.0317, "step": 7857 }, { "epoch": 1.0105019503622101, "grad_norm": 0.173828125, "learning_rate": 4.922180907365566e-05, "loss": 0.04, "step": 7858 }, { "epoch": 1.0106305456727678, "grad_norm": 0.1708984375, "learning_rate": 4.921170353006543e-05, "loss": 0.0404, "step": 7859 }, { "epoch": 1.0107591409833254, "grad_norm": 0.1533203125, "learning_rate": 4.920159801868408e-05, "loss": 0.0302, "step": 7860 }, { "epoch": 1.010887736293883, "grad_norm": 0.1669921875, "learning_rate": 4.919149253992449e-05, "loss": 0.0369, "step": 7861 }, { "epoch": 1.011016331604441, "grad_norm": 0.1455078125, "learning_rate": 4.9181387094199585e-05, "loss": 0.0282, "step": 7862 }, { "epoch": 1.0111449269149986, "grad_norm": 0.1640625, "learning_rate": 4.9171281681922234e-05, "loss": 0.0343, "step": 7863 }, { "epoch": 1.0112735222255562, "grad_norm": 0.1630859375, "learning_rate": 4.9161176303505356e-05, "loss": 0.0372, "step": 7864 }, { "epoch": 1.0114021175361139, "grad_norm": 0.1767578125, "learning_rate": 4.915107095936183e-05, "loss": 0.0373, "step": 7865 }, { "epoch": 1.0115307128466715, "grad_norm": 0.1640625, "learning_rate": 4.914096564990455e-05, "loss": 0.0309, "step": 7866 }, { "epoch": 1.0116593081572292, "grad_norm": 0.1708984375, "learning_rate": 4.913086037554641e-05, "loss": 0.0396, "step": 7867 }, { "epoch": 1.0117879034677868, "grad_norm": 0.166015625, "learning_rate": 4.912075513670029e-05, "loss": 0.0369, "step": 7868 }, { "epoch": 1.0119164987783444, "grad_norm": 0.173828125, "learning_rate": 4.9110649933779093e-05, "loss": 0.0398, "step": 7869 }, { "epoch": 1.0120450940889023, "grad_norm": 0.158203125, "learning_rate": 4.910054476719569e-05, "loss": 0.037, "step": 7870 }, { "epoch": 1.01217368939946, "grad_norm": 0.1875, "learning_rate": 4.9090439637363e-05, "loss": 0.032, "step": 7871 }, { "epoch": 1.0123022847100176, "grad_norm": 0.1669921875, "learning_rate": 4.908033454469386e-05, "loss": 0.0387, "step": 7872 }, { "epoch": 1.0124308800205752, "grad_norm": 0.166015625, "learning_rate": 4.90702294896012e-05, "loss": 0.0317, "step": 7873 }, { "epoch": 1.0125594753311329, "grad_norm": 0.138671875, "learning_rate": 4.906012447249784e-05, "loss": 0.0257, "step": 7874 }, { "epoch": 1.0126880706416905, "grad_norm": 0.1513671875, "learning_rate": 4.905001949379672e-05, "loss": 0.0283, "step": 7875 }, { "epoch": 1.0128166659522482, "grad_norm": 0.169921875, "learning_rate": 4.903991455391071e-05, "loss": 0.0342, "step": 7876 }, { "epoch": 1.012945261262806, "grad_norm": 0.181640625, "learning_rate": 4.9029809653252626e-05, "loss": 0.038, "step": 7877 }, { "epoch": 1.0130738565733637, "grad_norm": 0.1650390625, "learning_rate": 4.901970479223543e-05, "loss": 0.0356, "step": 7878 }, { "epoch": 1.0132024518839213, "grad_norm": 0.1484375, "learning_rate": 4.900959997127191e-05, "loss": 0.0326, "step": 7879 }, { "epoch": 1.013331047194479, "grad_norm": 0.1494140625, "learning_rate": 4.8999495190775e-05, "loss": 0.031, "step": 7880 }, { "epoch": 1.0134596425050366, "grad_norm": 0.1474609375, "learning_rate": 4.898939045115753e-05, "loss": 0.0294, "step": 7881 }, { "epoch": 1.0135882378155943, "grad_norm": 0.1796875, "learning_rate": 4.897928575283241e-05, "loss": 0.0426, "step": 7882 }, { "epoch": 1.013716833126152, "grad_norm": 0.1767578125, "learning_rate": 4.896918109621245e-05, "loss": 0.0381, "step": 7883 }, { "epoch": 1.0138454284367098, "grad_norm": 0.17578125, "learning_rate": 4.895907648171057e-05, "loss": 0.0369, "step": 7884 }, { "epoch": 1.0139740237472674, "grad_norm": 0.1494140625, "learning_rate": 4.894897190973959e-05, "loss": 0.0331, "step": 7885 }, { "epoch": 1.014102619057825, "grad_norm": 0.142578125, "learning_rate": 4.8938867380712405e-05, "loss": 0.0275, "step": 7886 }, { "epoch": 1.0142312143683827, "grad_norm": 0.1416015625, "learning_rate": 4.892876289504185e-05, "loss": 0.0236, "step": 7887 }, { "epoch": 1.0143598096789403, "grad_norm": 0.1748046875, "learning_rate": 4.8918658453140795e-05, "loss": 0.0398, "step": 7888 }, { "epoch": 1.014488404989498, "grad_norm": 0.130859375, "learning_rate": 4.890855405542209e-05, "loss": 0.0282, "step": 7889 }, { "epoch": 1.0146170003000556, "grad_norm": 0.150390625, "learning_rate": 4.8898449702298584e-05, "loss": 0.0326, "step": 7890 }, { "epoch": 1.0147455956106135, "grad_norm": 0.1455078125, "learning_rate": 4.888834539418314e-05, "loss": 0.0315, "step": 7891 }, { "epoch": 1.0148741909211711, "grad_norm": 0.166015625, "learning_rate": 4.8878241131488605e-05, "loss": 0.0341, "step": 7892 }, { "epoch": 1.0150027862317288, "grad_norm": 0.15625, "learning_rate": 4.886813691462783e-05, "loss": 0.0314, "step": 7893 }, { "epoch": 1.0151313815422864, "grad_norm": 0.1845703125, "learning_rate": 4.885803274401364e-05, "loss": 0.0473, "step": 7894 }, { "epoch": 1.015259976852844, "grad_norm": 0.154296875, "learning_rate": 4.8847928620058916e-05, "loss": 0.0304, "step": 7895 }, { "epoch": 1.0153885721634017, "grad_norm": 0.177734375, "learning_rate": 4.883782454317646e-05, "loss": 0.0407, "step": 7896 }, { "epoch": 1.0155171674739594, "grad_norm": 0.14453125, "learning_rate": 4.8827720513779166e-05, "loss": 0.0254, "step": 7897 }, { "epoch": 1.015645762784517, "grad_norm": 0.1552734375, "learning_rate": 4.8817616532279796e-05, "loss": 0.0305, "step": 7898 }, { "epoch": 1.0157743580950749, "grad_norm": 0.1630859375, "learning_rate": 4.880751259909127e-05, "loss": 0.0334, "step": 7899 }, { "epoch": 1.0159029534056325, "grad_norm": 0.1591796875, "learning_rate": 4.8797408714626387e-05, "loss": 0.035, "step": 7900 }, { "epoch": 1.0160315487161902, "grad_norm": 0.1630859375, "learning_rate": 4.8787304879297954e-05, "loss": 0.0362, "step": 7901 }, { "epoch": 1.0161601440267478, "grad_norm": 0.1669921875, "learning_rate": 4.877720109351885e-05, "loss": 0.0357, "step": 7902 }, { "epoch": 1.0162887393373055, "grad_norm": 0.1513671875, "learning_rate": 4.8767097357701845e-05, "loss": 0.0295, "step": 7903 }, { "epoch": 1.016417334647863, "grad_norm": 0.169921875, "learning_rate": 4.875699367225982e-05, "loss": 0.0375, "step": 7904 }, { "epoch": 1.0165459299584207, "grad_norm": 0.1826171875, "learning_rate": 4.8746890037605585e-05, "loss": 0.043, "step": 7905 }, { "epoch": 1.0166745252689786, "grad_norm": 0.1484375, "learning_rate": 4.873678645415196e-05, "loss": 0.0312, "step": 7906 }, { "epoch": 1.0168031205795363, "grad_norm": 0.1669921875, "learning_rate": 4.872668292231175e-05, "loss": 0.0352, "step": 7907 }, { "epoch": 1.016931715890094, "grad_norm": 0.150390625, "learning_rate": 4.87165794424978e-05, "loss": 0.0267, "step": 7908 }, { "epoch": 1.0170603112006515, "grad_norm": 0.1484375, "learning_rate": 4.8706476015122906e-05, "loss": 0.0277, "step": 7909 }, { "epoch": 1.0171889065112092, "grad_norm": 0.1650390625, "learning_rate": 4.86963726405999e-05, "loss": 0.0413, "step": 7910 }, { "epoch": 1.0173175018217668, "grad_norm": 0.1748046875, "learning_rate": 4.868626931934156e-05, "loss": 0.0409, "step": 7911 }, { "epoch": 1.0174460971323245, "grad_norm": 0.15625, "learning_rate": 4.867616605176075e-05, "loss": 0.0297, "step": 7912 }, { "epoch": 1.0175746924428823, "grad_norm": 0.1669921875, "learning_rate": 4.866606283827023e-05, "loss": 0.0321, "step": 7913 }, { "epoch": 1.01770328775344, "grad_norm": 0.154296875, "learning_rate": 4.8655959679282826e-05, "loss": 0.0317, "step": 7914 }, { "epoch": 1.0178318830639976, "grad_norm": 0.1455078125, "learning_rate": 4.864585657521134e-05, "loss": 0.0259, "step": 7915 }, { "epoch": 1.0179604783745553, "grad_norm": 0.162109375, "learning_rate": 4.863575352646856e-05, "loss": 0.0324, "step": 7916 }, { "epoch": 1.018089073685113, "grad_norm": 0.177734375, "learning_rate": 4.862565053346731e-05, "loss": 0.0381, "step": 7917 }, { "epoch": 1.0182176689956706, "grad_norm": 0.177734375, "learning_rate": 4.8615547596620366e-05, "loss": 0.0379, "step": 7918 }, { "epoch": 1.0183462643062282, "grad_norm": 0.1787109375, "learning_rate": 4.860544471634053e-05, "loss": 0.0389, "step": 7919 }, { "epoch": 1.018474859616786, "grad_norm": 0.173828125, "learning_rate": 4.8595341893040576e-05, "loss": 0.0385, "step": 7920 }, { "epoch": 1.0186034549273437, "grad_norm": 0.16796875, "learning_rate": 4.858523912713334e-05, "loss": 0.0407, "step": 7921 }, { "epoch": 1.0187320502379014, "grad_norm": 0.1494140625, "learning_rate": 4.857513641903155e-05, "loss": 0.0289, "step": 7922 }, { "epoch": 1.018860645548459, "grad_norm": 0.1630859375, "learning_rate": 4.856503376914805e-05, "loss": 0.0351, "step": 7923 }, { "epoch": 1.0189892408590167, "grad_norm": 0.1708984375, "learning_rate": 4.855493117789558e-05, "loss": 0.0364, "step": 7924 }, { "epoch": 1.0191178361695743, "grad_norm": 0.15625, "learning_rate": 4.8544828645686913e-05, "loss": 0.0299, "step": 7925 }, { "epoch": 1.019246431480132, "grad_norm": 0.1494140625, "learning_rate": 4.853472617293488e-05, "loss": 0.0325, "step": 7926 }, { "epoch": 1.0193750267906898, "grad_norm": 0.162109375, "learning_rate": 4.852462376005219e-05, "loss": 0.0331, "step": 7927 }, { "epoch": 1.0195036221012475, "grad_norm": 0.17578125, "learning_rate": 4.8514521407451676e-05, "loss": 0.0409, "step": 7928 }, { "epoch": 1.019632217411805, "grad_norm": 0.166015625, "learning_rate": 4.850441911554605e-05, "loss": 0.0381, "step": 7929 }, { "epoch": 1.0197608127223627, "grad_norm": 0.1552734375, "learning_rate": 4.849431688474813e-05, "loss": 0.0319, "step": 7930 }, { "epoch": 1.0198894080329204, "grad_norm": 0.1552734375, "learning_rate": 4.8484214715470656e-05, "loss": 0.0335, "step": 7931 }, { "epoch": 1.020018003343478, "grad_norm": 0.1435546875, "learning_rate": 4.847411260812641e-05, "loss": 0.029, "step": 7932 }, { "epoch": 1.0201465986540357, "grad_norm": 0.1337890625, "learning_rate": 4.846401056312811e-05, "loss": 0.0278, "step": 7933 }, { "epoch": 1.0202751939645933, "grad_norm": 0.158203125, "learning_rate": 4.845390858088858e-05, "loss": 0.029, "step": 7934 }, { "epoch": 1.0204037892751512, "grad_norm": 0.1640625, "learning_rate": 4.844380666182051e-05, "loss": 0.0372, "step": 7935 }, { "epoch": 1.0205323845857088, "grad_norm": 0.15625, "learning_rate": 4.843370480633669e-05, "loss": 0.031, "step": 7936 }, { "epoch": 1.0206609798962665, "grad_norm": 0.158203125, "learning_rate": 4.842360301484986e-05, "loss": 0.0303, "step": 7937 }, { "epoch": 1.0207895752068241, "grad_norm": 0.1513671875, "learning_rate": 4.841350128777276e-05, "loss": 0.0347, "step": 7938 }, { "epoch": 1.0209181705173818, "grad_norm": 0.1650390625, "learning_rate": 4.840339962551816e-05, "loss": 0.0367, "step": 7939 }, { "epoch": 1.0210467658279394, "grad_norm": 0.1630859375, "learning_rate": 4.839329802849876e-05, "loss": 0.0307, "step": 7940 }, { "epoch": 1.021175361138497, "grad_norm": 0.173828125, "learning_rate": 4.838319649712735e-05, "loss": 0.0375, "step": 7941 }, { "epoch": 1.021303956449055, "grad_norm": 0.1611328125, "learning_rate": 4.8373095031816617e-05, "loss": 0.0326, "step": 7942 }, { "epoch": 1.0214325517596126, "grad_norm": 0.166015625, "learning_rate": 4.836299363297934e-05, "loss": 0.0382, "step": 7943 }, { "epoch": 1.0215611470701702, "grad_norm": 0.17578125, "learning_rate": 4.835289230102821e-05, "loss": 0.0385, "step": 7944 }, { "epoch": 1.0216897423807278, "grad_norm": 0.1494140625, "learning_rate": 4.834279103637599e-05, "loss": 0.0309, "step": 7945 }, { "epoch": 1.0218183376912855, "grad_norm": 0.1572265625, "learning_rate": 4.833268983943537e-05, "loss": 0.0332, "step": 7946 }, { "epoch": 1.0219469330018431, "grad_norm": 0.1611328125, "learning_rate": 4.832258871061912e-05, "loss": 0.0316, "step": 7947 }, { "epoch": 1.0220755283124008, "grad_norm": 0.16015625, "learning_rate": 4.831248765033993e-05, "loss": 0.0332, "step": 7948 }, { "epoch": 1.0222041236229586, "grad_norm": 0.1455078125, "learning_rate": 4.830238665901051e-05, "loss": 0.0302, "step": 7949 }, { "epoch": 1.0223327189335163, "grad_norm": 0.1640625, "learning_rate": 4.829228573704359e-05, "loss": 0.035, "step": 7950 }, { "epoch": 1.022461314244074, "grad_norm": 0.158203125, "learning_rate": 4.828218488485186e-05, "loss": 0.0295, "step": 7951 }, { "epoch": 1.0225899095546316, "grad_norm": 0.1572265625, "learning_rate": 4.827208410284809e-05, "loss": 0.0331, "step": 7952 }, { "epoch": 1.0227185048651892, "grad_norm": 0.1376953125, "learning_rate": 4.82619833914449e-05, "loss": 0.0242, "step": 7953 }, { "epoch": 1.0228471001757469, "grad_norm": 0.1533203125, "learning_rate": 4.8251882751055057e-05, "loss": 0.0346, "step": 7954 }, { "epoch": 1.0229756954863045, "grad_norm": 0.1669921875, "learning_rate": 4.824178218209123e-05, "loss": 0.0307, "step": 7955 }, { "epoch": 1.0231042907968624, "grad_norm": 0.1630859375, "learning_rate": 4.8231681684966145e-05, "loss": 0.0336, "step": 7956 }, { "epoch": 1.02323288610742, "grad_norm": 0.17578125, "learning_rate": 4.8221581260092465e-05, "loss": 0.0389, "step": 7957 }, { "epoch": 1.0233614814179777, "grad_norm": 0.158203125, "learning_rate": 4.82114809078829e-05, "loss": 0.0364, "step": 7958 }, { "epoch": 1.0234900767285353, "grad_norm": 0.1513671875, "learning_rate": 4.8201380628750145e-05, "loss": 0.0298, "step": 7959 }, { "epoch": 1.023618672039093, "grad_norm": 0.14453125, "learning_rate": 4.8191280423106854e-05, "loss": 0.0243, "step": 7960 }, { "epoch": 1.0237472673496506, "grad_norm": 0.16015625, "learning_rate": 4.8181180291365745e-05, "loss": 0.0339, "step": 7961 }, { "epoch": 1.0238758626602082, "grad_norm": 0.158203125, "learning_rate": 4.817108023393948e-05, "loss": 0.0332, "step": 7962 }, { "epoch": 1.024004457970766, "grad_norm": 0.171875, "learning_rate": 4.816098025124073e-05, "loss": 0.0345, "step": 7963 }, { "epoch": 1.0241330532813238, "grad_norm": 0.1337890625, "learning_rate": 4.815088034368218e-05, "loss": 0.0249, "step": 7964 }, { "epoch": 1.0242616485918814, "grad_norm": 0.1669921875, "learning_rate": 4.81407805116765e-05, "loss": 0.0311, "step": 7965 }, { "epoch": 1.024390243902439, "grad_norm": 0.162109375, "learning_rate": 4.813068075563635e-05, "loss": 0.0358, "step": 7966 }, { "epoch": 1.0245188392129967, "grad_norm": 0.1708984375, "learning_rate": 4.812058107597441e-05, "loss": 0.0354, "step": 7967 }, { "epoch": 1.0246474345235543, "grad_norm": 0.2109375, "learning_rate": 4.811048147310332e-05, "loss": 0.0407, "step": 7968 }, { "epoch": 1.024776029834112, "grad_norm": 0.1650390625, "learning_rate": 4.810038194743575e-05, "loss": 0.034, "step": 7969 }, { "epoch": 1.0249046251446696, "grad_norm": 0.16796875, "learning_rate": 4.8090282499384334e-05, "loss": 0.032, "step": 7970 }, { "epoch": 1.0250332204552275, "grad_norm": 0.1611328125, "learning_rate": 4.8080183129361787e-05, "loss": 0.0343, "step": 7971 }, { "epoch": 1.0251618157657851, "grad_norm": 0.158203125, "learning_rate": 4.807008383778069e-05, "loss": 0.0362, "step": 7972 }, { "epoch": 1.0252904110763428, "grad_norm": 0.142578125, "learning_rate": 4.80599846250537e-05, "loss": 0.0226, "step": 7973 }, { "epoch": 1.0254190063869004, "grad_norm": 0.1591796875, "learning_rate": 4.804988549159348e-05, "loss": 0.0294, "step": 7974 }, { "epoch": 1.025547601697458, "grad_norm": 0.1494140625, "learning_rate": 4.803978643781264e-05, "loss": 0.029, "step": 7975 }, { "epoch": 1.0256761970080157, "grad_norm": 0.1376953125, "learning_rate": 4.8029687464123866e-05, "loss": 0.0303, "step": 7976 }, { "epoch": 1.0258047923185734, "grad_norm": 0.1669921875, "learning_rate": 4.801958857093973e-05, "loss": 0.0349, "step": 7977 }, { "epoch": 1.0259333876291312, "grad_norm": 0.1494140625, "learning_rate": 4.8009489758672915e-05, "loss": 0.0308, "step": 7978 }, { "epoch": 1.0260619829396889, "grad_norm": 0.1455078125, "learning_rate": 4.799939102773599e-05, "loss": 0.0278, "step": 7979 }, { "epoch": 1.0261905782502465, "grad_norm": 0.181640625, "learning_rate": 4.7989292378541636e-05, "loss": 0.0377, "step": 7980 }, { "epoch": 1.0263191735608042, "grad_norm": 0.1845703125, "learning_rate": 4.7979193811502426e-05, "loss": 0.041, "step": 7981 }, { "epoch": 1.0264477688713618, "grad_norm": 0.16796875, "learning_rate": 4.796909532703101e-05, "loss": 0.0329, "step": 7982 }, { "epoch": 1.0265763641819194, "grad_norm": 0.1435546875, "learning_rate": 4.7958996925539995e-05, "loss": 0.0259, "step": 7983 }, { "epoch": 1.026704959492477, "grad_norm": 0.1552734375, "learning_rate": 4.7948898607441945e-05, "loss": 0.0342, "step": 7984 }, { "epoch": 1.026833554803035, "grad_norm": 0.166015625, "learning_rate": 4.7938800373149525e-05, "loss": 0.0332, "step": 7985 }, { "epoch": 1.0269621501135926, "grad_norm": 0.1796875, "learning_rate": 4.7928702223075294e-05, "loss": 0.0431, "step": 7986 }, { "epoch": 1.0270907454241502, "grad_norm": 0.138671875, "learning_rate": 4.7918604157631886e-05, "loss": 0.0224, "step": 7987 }, { "epoch": 1.0272193407347079, "grad_norm": 0.1640625, "learning_rate": 4.790850617723186e-05, "loss": 0.0381, "step": 7988 }, { "epoch": 1.0273479360452655, "grad_norm": 0.169921875, "learning_rate": 4.789840828228784e-05, "loss": 0.0354, "step": 7989 }, { "epoch": 1.0274765313558232, "grad_norm": 0.13671875, "learning_rate": 4.788831047321239e-05, "loss": 0.0302, "step": 7990 }, { "epoch": 1.0276051266663808, "grad_norm": 0.1708984375, "learning_rate": 4.7878212750418124e-05, "loss": 0.0383, "step": 7991 }, { "epoch": 1.0277337219769387, "grad_norm": 0.1689453125, "learning_rate": 4.7868115114317584e-05, "loss": 0.0376, "step": 7992 }, { "epoch": 1.0278623172874963, "grad_norm": 0.1591796875, "learning_rate": 4.7858017565323384e-05, "loss": 0.0309, "step": 7993 }, { "epoch": 1.027990912598054, "grad_norm": 0.1552734375, "learning_rate": 4.784792010384806e-05, "loss": 0.0315, "step": 7994 }, { "epoch": 1.0281195079086116, "grad_norm": 0.1318359375, "learning_rate": 4.783782273030423e-05, "loss": 0.0243, "step": 7995 }, { "epoch": 1.0282481032191693, "grad_norm": 0.1484375, "learning_rate": 4.782772544510443e-05, "loss": 0.0271, "step": 7996 }, { "epoch": 1.028376698529727, "grad_norm": 0.185546875, "learning_rate": 4.781762824866121e-05, "loss": 0.0364, "step": 7997 }, { "epoch": 1.0285052938402846, "grad_norm": 0.1630859375, "learning_rate": 4.780753114138717e-05, "loss": 0.0372, "step": 7998 }, { "epoch": 1.0286338891508422, "grad_norm": 0.1708984375, "learning_rate": 4.779743412369482e-05, "loss": 0.035, "step": 7999 }, { "epoch": 1.0287624844614, "grad_norm": 0.154296875, "learning_rate": 4.778733719599676e-05, "loss": 0.0295, "step": 8000 }, { "epoch": 1.0287624844614, "eval_loss": 0.038489215075969696, "eval_runtime": 1043.1503, "eval_samples_per_second": 94.163, "eval_steps_per_second": 1.177, "step": 8000 } ], "logging_steps": 1, "max_steps": 15552, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.4767318680456397e+18, "train_batch_size": 80, "trial_name": null, "trial_params": null }